In [61]:
import pandas as pd

In [12]:
hard = pd.read_csv('gpt-3.5-turbo-0125/MathQA_easy.csv')

In [13]:
hard = hard.dropna(axis=1, how='all')

In [14]:
hard[hard['Final Answer_11'].isna()]['Final Answer_11']

Series([], Name: Final Answer_11, dtype: object)

In [15]:
hard['CoT_0'].iloc[0]

'\r\nStep 1: To find the number of ways the test can be completed if every question is unanswered, we need to calculate the total number of ways a student can choose an answer for each question.\r\nStep 2: Since each question has 5 answer choices, there are 5 ways to answer each question.\r\nStep 3: As there are 4 questions in total, the total number of ways to complete the test is calculated by multiplying the number of ways for each question: 5 * 5 * 5 * 5 = 625.\r\nStep 4: Therefore, the total number of ways the test can be completed is 625.'

In [17]:
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
import Levenshtein


In [18]:
methods_list = ['cosine','jaccard','euclidean','levenshtein']
def calculate_similarity(method, sentence1, sentence2):
    start_time = time.time()
    similarity = None

    # Preprocess the sentences to lowercase
    sentence1 = sentence1.lower()
    sentence2 = sentence2.lower()
    if method == 'cosine':
        vectorizer = TfidfVectorizer().fit([sentence1, sentence2])
        vector1, vector2 = vectorizer.transform([sentence1, sentence2])
        similarity = cosine_similarity(vector1, vector2)[0, 0]

    elif method == 'jaccard':
        set1 = set(ngrams(sentence1, n=2))
        set2 = set(ngrams(sentence2, n=2))
        similarity = 1 - jaccard_distance(set1, set2)

    elif method == 'euclidean':
        vectorizer = TfidfVectorizer().fit([sentence1, sentence2])
        vector1, vector2 = vectorizer.transform([sentence1, sentence2])
        similarity = 1 / (1 + euclidean_distances(vector1, vector2)[0, 0])

    elif method == 'levenshtein':
        similarity = 1 - (Levenshtein.distance(sentence1, sentence2) / max(len(sentence1), len(sentence2)))

    elapsed_time = time.time() - start_time
    print(f"Time to compute {method} similarity: {elapsed_time:.6f} seconds")
    return similarity,elapsed_time

# Example usage:
methods_time = {}
for method in methods_list:
    t = 0
    for j in range(100):
        similarity_score,ti = calculate_similarity(method, hard['CoT_0'].iloc[j], hard['CoT_1'].iloc[j])
        print(f"Sample {j} : ")
        print(f"{method}: {similarity_score}")
        t+=ti
    methods_time[method] = t


Time to compute cosine similarity: 0.010051 seconds
Sample 0 : 
cosine: 0.9328873009191496
Time to compute cosine similarity: 0.001998 seconds
Sample 1 : 
cosine: 0.9854320572710188
Time to compute cosine similarity: 0.000998 seconds
Sample 2 : 
cosine: 0.9492959447388756
Time to compute cosine similarity: 0.000998 seconds
Sample 3 : 
cosine: 0.8413211361245543
Time to compute cosine similarity: 0.000996 seconds
Sample 4 : 
cosine: 0.8870075277640148
Time to compute cosine similarity: 0.001515 seconds
Sample 5 : 
cosine: 0.9070226270376069
Time to compute cosine similarity: 0.002063 seconds
Sample 6 : 
cosine: 0.913022000898569
Time to compute cosine similarity: 0.001999 seconds
Sample 7 : 
cosine: 0.8958546117662258
Time to compute cosine similarity: 0.001003 seconds
Sample 8 : 
cosine: 0.9865116384194739
Time to compute cosine similarity: 0.000567 seconds
Sample 9 : 
cosine: 0.7240104026698195
Time to compute cosine similarity: 0.002013 seconds
Sample 10 : 
cosine: 0.8030109749051572

In [25]:
import pandas as pd
from functools import reduce

def aggregate_sentences(sentences):
    return ' '.join(sentences)

def calculate_similarity_with_aggregation(df, method='levenshtein'):
    similarities = []
    aggregated_sentence = ''

    for i in range(39):
        current_column = f'CoT_{i}'
        next_column = f'CoT_{i+1}'

        current_sentence = df[current_column].iloc[0]
        next_sentence = df[next_column].iloc[0]

        aggregated_sentence += current_sentence + ' '
        similarity = calculate_similarity(method, aggregated_sentence, next_sentence)
        similarities.append(similarity[0])

    return similarities

# Example usage:
# Assuming your DataFrame is named 'df' and has columns CoT_0 to CoT_39
similarities = calculate_similarity_with_aggregation(hard)
print("Similarities:", similarities)

Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000970 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000000 seconds
Time to compute levenshtein similarity: 0.000998 seconds
Time to compute levenshtein sim

In [26]:
similarities = calculate_similarity_with_aggregation(hard, method='cosine')
print("Similarities:", similarities)

Time to compute cosine similarity: 0.002806 seconds
Time to compute cosine similarity: 0.001994 seconds
Time to compute cosine similarity: 0.001028 seconds
Time to compute cosine similarity: 0.001991 seconds
Time to compute cosine similarity: 0.001002 seconds
Time to compute cosine similarity: 0.001557 seconds
Time to compute cosine similarity: 0.002064 seconds
Time to compute cosine similarity: 0.001989 seconds
Time to compute cosine similarity: 0.002011 seconds
Time to compute cosine similarity: 0.000994 seconds
Time to compute cosine similarity: 0.001998 seconds
Time to compute cosine similarity: 0.001019 seconds
Time to compute cosine similarity: 0.001988 seconds
Time to compute cosine similarity: 0.002003 seconds
Time to compute cosine similarity: 0.002620 seconds
Time to compute cosine similarity: 0.001998 seconds
Time to compute cosine similarity: 0.002000 seconds
Time to compute cosine similarity: 0.002007 seconds
Time to compute cosine similarity: 0.002963 seconds
Time to comp

In [27]:
similarities = calculate_similarity_with_aggregation(hard, method='euclidean')
print("Similarities:", similarities)

Time to compute euclidean similarity: 0.002000 seconds
Time to compute euclidean similarity: 0.001131 seconds
Time to compute euclidean similarity: 0.000871 seconds
Time to compute euclidean similarity: 0.001999 seconds
Time to compute euclidean similarity: 0.002001 seconds
Time to compute euclidean similarity: 0.002000 seconds
Time to compute euclidean similarity: 0.001999 seconds
Time to compute euclidean similarity: 0.001508 seconds
Time to compute euclidean similarity: 0.002011 seconds
Time to compute euclidean similarity: 0.002000 seconds
Time to compute euclidean similarity: 0.001000 seconds
Time to compute euclidean similarity: 0.001999 seconds
Time to compute euclidean similarity: 0.002000 seconds
Time to compute euclidean similarity: 0.002000 seconds
Time to compute euclidean similarity: 0.001999 seconds
Time to compute euclidean similarity: 0.002000 seconds
Time to compute euclidean similarity: 0.002000 seconds
Time to compute euclidean similarity: 0.002028 seconds
Time to co

In [28]:
similarities = calculate_similarity_with_aggregation(hard, method='jaccard')
print("Similarities:", similarities)

Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000998 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.001004 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000997 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.001002 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000998 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.001103 s

In [29]:
import pandas as pd
from functools import reduce

def aggregate_sentences(sentences):
    return ' '.join(sentences)

def calculate_similarity_with_aggregation(df, method='jaccard'):
    similarities_df = pd.DataFrame(columns=[f'CoT_{i+1}_Similarity' for i in range(39)])

    for index, row in df.iterrows():
        similarities = []
        aggregated_sentence = ''

        for i in range(39):
            current_column = f'CoT_{i}'
            next_column = f'CoT_{i+1}'

            current_sentence = row[current_column]
            next_sentence = row[next_column]

            aggregated_sentence += current_sentence + ' '
            similarity = calculate_similarity(method, aggregated_sentence, next_sentence)
            similarities.append(similarity[0])

        similarities_df.loc[index] = similarities

    return similarities_df

# Example usage:
# Assuming your DataFrame is named 'df' and has columns CoT_0 to CoT_39
similarities_df = calculate_similarity_with_aggregation(hard)
print("Similarity DataFrame:")
print(similarities_df)

Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.001000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000997 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.001060 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000989 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.000000 seconds
Time to compute jaccard similarity: 0.001027 seconds
Time to compute jaccard similarity: 0.000976 seconds
Time to compute jaccard similarity: 0.000000 s

In [30]:
similarities_df

Unnamed: 0,CoT_1_Similarity,CoT_2_Similarity,CoT_3_Similarity,CoT_4_Similarity,CoT_5_Similarity,CoT_6_Similarity,CoT_7_Similarity,CoT_8_Similarity,CoT_9_Similarity,CoT_10_Similarity,...,CoT_30_Similarity,CoT_31_Similarity,CoT_32_Similarity,CoT_33_Similarity,CoT_34_Similarity,CoT_35_Similarity,CoT_36_Similarity,CoT_37_Similarity,CoT_38_Similarity,CoT_39_Similarity
0,0.952703,0.934211,0.953947,0.954545,0.941558,0.923567,0.923567,0.923567,0.923567,0.923567,...,0.847953,0.836257,0.830409,0.847953,0.847953,0.847953,0.847953,0.847953,0.847953,0.836257
1,0.891667,0.845528,0.518182,0.506787,0.780591,0.447257,0.447257,0.447257,0.768340,0.411538,...,0.393502,0.646497,0.633846,0.326154,0.593846,0.326154,0.593846,0.604294,0.334356,0.334356
2,0.857143,0.706161,0.674208,0.714932,0.675676,0.650655,0.676856,0.694323,0.528571,0.567857,...,0.441989,0.417582,0.425824,0.423077,0.412088,0.425824,0.423077,0.454054,0.429730,0.416216
3,0.663755,0.657143,0.671937,0.611511,0.565657,0.635762,0.487261,0.525478,0.636943,0.643312,...,0.471429,0.521368,0.561453,0.528610,0.550409,0.425134,0.571809,0.545213,0.453581,0.485411
4,0.517928,0.691120,0.568027,0.498328,0.545752,0.514563,0.477341,0.539157,0.480480,0.537356,...,0.350000,0.397826,0.409483,0.248394,0.385928,0.381356,0.357143,0.386555,0.342437,0.415638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.656051,0.717647,0.748634,0.624490,0.530612,0.538776,0.522449,0.740458,0.463878,0.782772,...,0.524272,0.469841,0.473016,0.440252,0.669753,0.685185,0.422840,0.401235,0.432099,0.651235
496,0.678363,0.745856,0.701031,0.702564,0.764103,0.651282,0.753846,0.671795,0.680203,0.568528,...,0.665158,0.656109,0.674208,0.684685,0.608108,0.671171,0.504505,0.675676,0.567568,0.772321
497,0.677852,0.394904,0.527363,0.624434,0.523013,0.640927,0.526923,0.679715,0.423488,0.563380,...,0.462908,0.391691,0.486804,0.376812,0.428986,0.344928,0.343931,0.447977,0.492795,0.540000
498,0.828571,0.767606,0.006897,0.870748,0.027027,0.791411,0.871951,0.024390,0.806061,0.872727,...,0.670391,0.720670,0.720670,0.765363,0.720670,0.804469,0.720670,0.720670,0.715084,0.720670
