In [None]:
import ast
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import umap
from sklearn.manifold import TSNE
from scipy.spatial import distance
import warnings
import re
import random
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and numerical warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
import importlib
import embedding_plotting
import trace_utils
# Reload both helper modules so that edits made to their source are picked up
# without restarting the kernel.
importlib.reload(embedding_plotting)
importlib.reload(trace_utils)


# Bind the helper names after the reload so they come from the reloaded modules.
from embedding_plotting import DimensionalViz
from embedding_plotting import cot_step_print
from trace_utils import split_cot
from trace_utils import compute_metrics

In [None]:
# Load the chain-of-thought traces and convert every 'CoT Sentences' entry with
# split_cot (imported from trace_utils; presumably it returns the list of
# steps of a trace -- its implementation is not visible in this notebook).
gsm_df = pd.read_csv('data/cot_data_50_start_sentences_autosplit.csv')
gsm_df['CoT Sentences'] = gsm_df['CoT Sentences'].apply(split_cot)

# Alternative input kept for reference: a second CSV whose 'CoT Sentences'
# column would be parsed with ast.literal_eval instead of split_cot.
# gsm_df = pd.read_csv('data/cot_data_50_start_sentences_sentsplit.csv')
# gsm_df['CoT Sentences'] = gsm_df['CoT Sentences'].apply(ast.literal_eval)

# Sentence-embedding model; later cells call model.encode on trace steps.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Preview the first row to check the loaded columns.
gsm_df.head(1)

In [None]:
# Store, per row, the number of entries in that row's 'CoT Sentences' value.
gsm_df['CoT Length'] = gsm_df['CoT Sentences'].map(len)

In [None]:
import matplotlib.pyplot as plt
import random

def plot_cot_step_metrics(cot_collection):
    """
    Plot, for every trace, the Euclidean distance between consecutive step
    embeddings next to the distance to a randomly chosen other step.

    Parameters
    ----------
    cot_collection : iterable of sequences of str
        Each element is one reasoning trace given as its ordered steps. Every
        step is embedded with the notebook-level ``model.encode``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.

    Notes
    -----
    * Step ``i`` is compared with step ``i + 1`` (solid line) and with a step
      drawn by ``random.choice`` from all steps except ``i`` and ``i + 1``
      (dashed line). The dashed curve therefore changes between runs unless
      the ``random`` module is seeded.
    * A trace with exactly two steps has no other step to draw from. The
      random distance is recorded as NaN in that case (shown as a gap in the
      plot) instead of letting ``random.choice`` raise ``IndexError``.
    """
    metrics_adjacent_collection = []
    metrics_random_collection = []
    for trace in cot_collection:
        # Encode step by step into a Python list so that slicing and `+`
        # below concatenate lists of vectors.
        embeddings = [model.encode(step) for step in trace]
        metrics_adjacent = []
        metrics_random = []
        for i in range(len(embeddings) - 1):
            emb1 = embeddings[i]

            # Distance between this step and the one that follows it.
            metrics_adjacent.append(distance.euclidean(emb1, embeddings[i + 1]))

            # Baseline: distance to a random step that is neither the current
            # step nor its successor.
            other_embeddings = embeddings[:i] + embeddings[i + 2:]
            if other_embeddings:
                emb_random = random.choice(other_embeddings)
                metrics_random.append(distance.euclidean(emb1, emb_random))
            else:
                # Two-step trace: there is no candidate to compare against.
                metrics_random.append(float('nan'))

        metrics_adjacent_collection.append(metrics_adjacent)
        metrics_random_collection.append(metrics_random)

    # Plotting: one colour per trace, solid = adjacent, dashed = random.
    plt.figure(figsize=(10, 6))
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']  # Cycled when there are more traces than colours
    for i, (metrics_adjacent, metrics_random) in enumerate(zip(metrics_adjacent_collection, metrics_random_collection)):
        indices = list(range(len(metrics_adjacent)))
        color = colors[i % len(colors)]
        plt.plot(indices, metrics_adjacent, label=f'Trace {i+1} (Adjacent)', color=color)
        plt.plot(indices, metrics_random, label=f'Trace {i+1} (Random)', linestyle='dashed', color=color)

    plt.xlabel('Step Index')
    plt.ylabel('Euclidean Distance')
    plt.title('Euclidean Distance between Successive and Random Embeddings for Each Trace')
    plt.legend()
    plt.show()

In [None]:
# Hand-written example trace: the question followed by three reasoning steps.
betty_trace = [
    'Betty sold Apples to 48 of her friends in April, and then she sold half as many Apples in May. How many Apples did Betty sell altogether in April and May?',
    "First, let's calculate how many Apples Betty sold in May. We know that she sold half as many Apples in May as she did in April. So, we can find this by dividing the number of Apples she sold in April by 2. 48 Apples / 2 = 24 Apples Therefore, Betty sold 24 Apples in May.",
    "Now, let's calculate how many Apples she sold altogether in April and May. We can find this by adding the number of Apples she sold in April and the number of Apples she sold in May. 48 Apples + 24 Apples = 72 Apples",
    'Therefore, Betty sold a total of 72 Apples in April and May.',
]

# A collection holding that single trace.
cot_collection_1 = [betty_trace]

In [None]:
# Plot the adjacent-vs-random step distances for the hand-written example trace.
plot_cot_step_metrics(cot_collection_1)

In [None]:
split_cot(cot_collection[0])

In [None]:
type(sampled_df['CoT Sentences'].tolist())

In [None]:
sampled_df['CoT Sentences'].iloc[0]

In [None]:
sampled_df = gsm_df.sample(3)
cot_collection = []
for i, trace in enumerate(sampled_df['CoT Sentences'].tolist()):
    first_three_words = " ".join(sampled_df['Prompt'].iloc[i].split()[:3])
    print("Prompts:")
    print(f"Trace {i + 1}: {first_three_words}...")
    # cot_collection.append(split_cot(trace))
    cot_collection.append(trace)
plot_cot_step_metrics(cot_collection)

In [None]:
# Index label of the first row whose prompt starts with 'A deep-sea monster'
# (raises IndexError when no prompt matches).
gsm_df[gsm_df['Prompt'].str.startswith('A deep-sea monster')].index[0]

In [None]:
# 'Answer' value of the row at position 10.
gsm_df.iloc[10]['Answer']

In [None]:
# Print every entry of the 'CoT Sentences' value at position 10 with its index.
for i, step in enumerate(gsm_df.iloc[10]['CoT Sentences']):
    print(f"Step {i}: ", step)

Aggregating Metrics Stats

In [None]:
# Build one record per trace: the question followed by its CoT steps, the
# embeddings of those texts, and the metrics computed from the embeddings.
embeddings_list = []
cot_steps_list = []
original_text_list = []
metrics_list = []

for question, cot_sentences in zip(gsm_df['Prompt'], gsm_df['CoT Sentences']):
    # Build a new list rather than inserting into the one stored in gsm_df.
    # The in-place insert modified gsm_df['CoT Sentences'] and prepended the
    # question once more on every re-run of this cell.
    cot_reasoning = [question] + list(cot_sentences)
    cot_embeddings = model.encode(cot_reasoning)

    cot_steps_list.append(cot_reasoning)
    embeddings_list.append(cot_embeddings)
    original_text_list.append(cot_reasoning)
    # compute_metrics comes from trace_utils; later cells read its output as
    # per-metric sequences (presumably one dict of lists per trace).
    metrics_list.append(compute_metrics(cot_embeddings))

# Convert lists to DataFrame: one row per trace, one column per metric, plus
# the texts and embeddings the metrics were computed from.
metrics_df = pd.DataFrame(metrics_list)
metrics_df['Original_Text'] = original_text_list
metrics_df['QnA_Steps'] = cot_steps_list
metrics_df['Embeddings'] = embeddings_list

In [None]:
# Preview the first two rows of the per-trace metrics table.
metrics_df.head(2)

In [None]:
# Average every metric over all steps of all traces.
average_metrics = {}

columns_to_average = ['cosine_similarity', 'euclidean_distance', 'manhattan_distance', 'chebyshev_distance', 'euclidean_distance_normalized',
                      'random_cosine_similarity', 'random_euclidean_distance', 'random_manhattan_distance', 
                      'random_chebyshev_distance', 'random_euclidean_distance_normalized']

# The *_normalized columns are (re)created by a later cell of this notebook, so
# on a fresh top-to-bottom run they may not exist yet. Report and skip missing
# columns instead of failing with a KeyError.
missing_columns = [col for col in columns_to_average if col not in metrics_df.columns]
if missing_columns:
    print(f"Skipping columns not present in metrics_df: {missing_columns}")

for col in columns_to_average:
    if col in missing_columns:
        continue
    # Flatten the per-trace sequences in a single pass.
    all_values = [value for trace_values in metrics_df[col] for value in trace_values]
    # Ignore NaN entries, as the per-index aggregation cells do.
    average_metrics[col] = np.nanmean(all_values)

average_metrics

In [None]:
# How many traces there are for each 'CoT Length' value, ordered by length.
sorted_value_counts = gsm_df['CoT Length'].value_counts().sort_index()
print(sorted_value_counts)

In [None]:
# Metric to aggregate -- leave exactly one assignment active.
# metric = 'cosine_similarity'
metric = 'euclidean_distance'
# metric = 'manhattan_distance'
# metric = 'chebyshev_distance'
# metric = 'euclidean_distance_normalized'

# Number of step positions to aggregate: the largest recorded 'CoT Length'.
length = max(gsm_df['CoT Length'])

# Running total and number of contributing values for every step position.
sum_sequential = np.zeros(length)
count_sequential = np.zeros(length)
sum_random = np.zeros(length)
count_random = np.zeros(length)

# (column, totals, counts) for the sequential series and its random baseline;
# the arrays are updated in place through these references.
accumulators = (
    (metric, sum_sequential, count_sequential),
    ('random_' + metric, sum_random, count_random),
)
for _, row in metrics_df.iterrows():
    for column, totals, counts in accumulators:
        for position, value in enumerate(row[column][:length]):
            if np.isnan(value):
                continue  # NaN entries contribute to neither total nor count
            totals[position] += value
            counts[position] += 1

# Mean per position; positions that received no value are reported as 0.
avg_sequential = [total / seen if seen != 0 else 0
                  for total, seen in zip(sum_sequential, count_sequential)]
avg_random = [total / seen if seen != 0 else 0
              for total, seen in zip(sum_random, count_random)]

# Grouped bar chart: sequential and random bars side by side per position.
indices = np.arange(length)
bar_width = 0.35
half_width = bar_width / 2

fig, ax = plt.subplots()
bar1 = ax.bar(indices - half_width, avg_sequential, bar_width, label='Sequential')
bar2 = ax.bar(indices + half_width, avg_random, bar_width, label='Random')

ax.set_xlabel('Index')
ax.set_ylabel(f'Average {metric.capitalize()}')
ax.set_title(f'Average {metric.capitalize()} by Index for Sequential and Random Metrics')
ax.set_xticks(indices)
ax.set_xticklabels(list(map(str, range(length))))
ax.legend()

plt.show()

In [None]:
# Metric to plot -- leave exactly one assignment active.
# metric = 'cosine_similarity'
metric = 'euclidean_distance'
# metric = 'manhattan_distance'
# metric = 'chebyshev_distance'
# metric = 'euclidean_distance_normalized'

# Flat point clouds: relative position inside the trace vs. metric value.
sequential_normalized_indices = []
sequential_metric_values = []
random_normalized_indices = []
random_metric_values = []

# (column, x-values, y-values) for the sequential series and its random
# baseline; the lists are appended to through these references.
series_targets = (
    (metric, sequential_normalized_indices, sequential_metric_values),
    ('random_' + metric, random_normalized_indices, random_metric_values),
)
for _, row in metrics_df.iterrows():
    # Number of steps without the question stored at the front of 'QnA_Steps'.
    trace_length = len(row['QnA_Steps']) - 1

    for column, positions, values in series_targets:
        for step, value in enumerate(row[column][:trace_length]):
            if np.isnan(value):
                continue
            # Scale the step index by the trace length so traces of different
            # lengths share one x-axis.
            positions.append(step / (trace_length))
            values.append(value)

for xs, ys, color, label in (
    (sequential_normalized_indices, sequential_metric_values, 'blue', 'Sequential'),
    (random_normalized_indices, random_metric_values, 'red', 'Random'),
):
    plt.scatter(xs, ys, color=color, label=label)

plt.xlabel('Normalized Index')
plt.ylabel(f'{metric.capitalize()}')
plt.title(f'{metric.capitalize()} for Sequential and Random Metrics')
plt.legend()
plt.show()

In [None]:
# Put the point clouds collected in the scatter cell into DataFrames so they
# can be grouped.
data_sequential = pd.DataFrame({
    'normalized_index': sequential_normalized_indices,
    'metric_value': sequential_metric_values
})
data_random = pd.DataFrame({
    'normalized_index': random_normalized_indices,
    'metric_value': random_metric_values
})

# Mean metric value at every distinct normalized index.
avg_sequential = data_sequential.groupby('normalized_index').mean().reset_index()
avg_random = data_random.groupby('normalized_index').mean().reset_index()

plt.figure(figsize=(10, 6))

# (raw points, per-index mean, colour, legend label) for each series.
plot_series = (
    (data_sequential, avg_sequential, 'blue', 'Sequential Average'),
    (data_random, avg_random, 'red', 'Random Average'),
)

# Faint raw points first ...
for points, _, color, _ in plot_series:
    plt.scatter(points['normalized_index'], points['metric_value'], alpha=0.1, color=color)

# ... then the trend lines through the per-index means.
for _, trend, color, label in plot_series:
    plt.plot(trend['normalized_index'], trend['metric_value'], color=color, label=label)

plt.xlabel('Normalized Index')
plt.ylabel(f'Average {metric.capitalize()}')
plt.title(f'{metric.capitalize()} for Sequential and Random Metrics')
plt.legend()
plt.show()

In [None]:
def normalize_metrics(metrics):
    """
    Min-max scale a sequence of metric values into the range [0, 1].

    Parameters
    ----------
    metrics : iterable of float
        Metric values of one trace. NaN entries are allowed.

    Returns
    -------
    list
        One entry per input value, in the same order:

        * ``(m - min) / (max - min)`` when the non-NaN values are not all
          equal;
        * ``0`` for every non-NaN value when they are all equal;
        * NaN wherever the input was NaN.

        An empty input yields an empty list.

    Notes
    -----
    The minimum and maximum are taken over the non-NaN values only. The
    builtin ``min``/``max`` give results that depend on where a NaN sits in
    the sequence, and they raise ``ValueError`` on an empty sequence.
    """
    values = list(metrics)
    present = [m for m in values if not np.isnan(m)]
    if not present:
        # Nothing to scale: empty stays empty, all-NaN stays all-NaN.
        return values

    min_val = min(present)
    max_val = max(present)
    range_val = max_val - min_val
    if range_val > 0:
        # NaN inputs propagate through the arithmetic and stay NaN.
        return [(m - min_val) / range_val for m in values]
    # Degenerate range: every non-NaN value is identical.
    return [m if np.isnan(m) else 0 for m in values]

# Add (or overwrite) per-trace min-max normalized versions of the sequential
# and random Euclidean distance columns.
metrics_df['euclidean_distance_normalized'] = metrics_df['euclidean_distance'].apply(normalize_metrics)
metrics_df['random_euclidean_distance_normalized'] = metrics_df['random_' + 'euclidean_distance'].apply(normalize_metrics)

In [None]:
# Spread (max - min) of the sequential Euclidean distances within each trace.
euclidean_ranges = metrics_df['euclidean_distance'].map(
    lambda distances: max(distances) - min(distances)
)

# Histogram of those per-trace spreads.
plt.figure(figsize=(10, 6))
plt.hist(euclidean_ranges, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Range of Euclidean Distance')
plt.ylabel('Frequency')
plt.title('Histogram of Euclidean Distance Ranges')
plt.show()

In [None]:
# Summary statistics of the per-trace distance ranges.
euclidean_ranges.describe()

In [None]:
# Find the reasoning traces with a Euclidean distance range of zero
traces_with_zero_range = metrics_df[metrics_df['euclidean_distance'].apply(lambda x: max(x) - min(x)) == 0]

# Inspect the traces and their corresponding original text
traces_with_zero_range_info = traces_with_zero_range[['euclidean_distance', 'Original_Text', 'QnA_Steps']]

In [None]:
# Display the zero-range traces with their distances and texts.
traces_with_zero_range_info

In [None]:
# Current 'CoT Sentences' value of the row at position 28.
gsm_df.iloc[28]['CoT Sentences']

In [None]:
# Manually replace the 'CoT Sentences' value of the row labelled 28 with a
# hand-split version of the trace.
# NOTE(review): the replacement list starts with the question text itself --
# check that this matches how the other rows of 'CoT Sentences' are structured.
# NOTE(review): 'CoT Length' and metrics_df were computed in earlier cells and
# are not refreshed by this assignment.
gsm_df.at[28, 'CoT Sentences'] = ['There are 5 houses on a street, and each of the first four houses has 3 gnomes in the garden. If there are a total of 20 gnomes on the street, how many gnomes does the fifth house have?',
 "1. Let's start by figuring out how many gnomes are in the first four houses combined. Each of the first four houses has 3 gnomes, so the total number of gnomes in those houses is 3 * 4 = 12.",
 "2. We know that the total number of gnomes on the street is 20. Since we already accounted for 12 gnomes in the first four houses, that means the fifth house must have the remaining gnomes. So the fifth house must have 20 - 12 = 8 gnomes.",
 "Therefore, the fifth house has 8 gnomes in its garden."]

In [None]:
# Number of entries in the 'CoT Sentences' value at position 28.
len(gsm_df.iloc[28]['CoT Sentences'])