<a href="https://colab.research.google.com/github/wyattmccurdy12/Project-Part-2/blob/main/SimilarityMetrics_and_ModelEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# !git clone https://github.com/wyattmccurdy12/Project-Part-2.git # Done

Cloning into 'Project-Part-2'...
remote: Enumerating objects: 166, done.[K
remote: Counting objects: 100% (166/166), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 166 (delta 104), reused 105 (delta 53), pack-reused 0[K
Receiving objects: 100% (166/166), 277.23 KiB | 2.35 MiB/s, done.
Resolving deltas: 100% (104/104), done.


# New Section

In [1]:
import pandas as pd
import json
import torch
import os
# import torch
# from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity
from tqdm import tqdm
import numpy as np

In [4]:
# os.chdir('drive')
# os.chdir('MyDrive')
# os.chdir('Project-Part-2')
os.listdir()

['g_rels_consenso.csv',
 'g_qrels_majority_2.csv',
 'tabulated_cleaned_emotionfiltered_trec.csv',
 '.git',
 '.gitignore',
 'README.md',
 'Text Mining and Analytics-Project-Part 2.pdf',
 '__pycache__',
 'augmented_answer_sets.txt',
 'clef_credentials.json',
 'data_processing_utils.py',
 'main.py',
 'preprocess_data.py',
 'test.ipynb',
 'augmented_answer_sets.csv',
 'ranking_augmented_data.csv']

In [79]:
# Load tabulated cleaned emotions
clean_ef_data = pd.read_csv('tabulated_cleaned_emotionfiltered_trec.csv')
clean_ef_data.head()

Unnamed: 0,docid,PRE,TEXT,POST,polarity,self_ref
0,s_1287_153_9,,I mean what the hell bro.,,neg,1
1,s_1287_187_0,,"Yeah, crazy isn't it?",,neg,1
2,s_1287_204_0,,No :( sadly it doesn't have everything,,neg,1
3,s_1287_222_4,,I'm worried.,,neg,1
4,s_1287_240_1,,Better weapons and going against a weaker team...,,neg,1


In [80]:
# prompt: remove 'polarity', 'self_ref', 'PRE', 'POST' columns

clean_ef_data = clean_ef_data.drop(['polarity', 'self_ref', 'PRE', 'POST'], axis=1)
clean_ef_data.head()

Unnamed: 0,docid,TEXT
0,s_1287_153_9,I mean what the hell bro.
1,s_1287_187_0,"Yeah, crazy isn't it?"
2,s_1287_204_0,No :( sadly it doesn't have everything
3,s_1287_222_4,I'm worried.
4,s_1287_240_1,Better weapons and going against a weaker team...


In [81]:
def generate_answers_df(in_lines_file='augmented_answer_sets.txt'):
    # if os.path.exists(out_file_path):
    #     return pd.read_csv(out_file_path)

    questions = {
        i: {j: [] for j in range(1, 5)}
        for i in range(1, 22)
    }

    with open(in_lines_file, 'r') as f:
        lines = f.readlines()

    question_number = 0
    for line in lines:
        line = line.strip()
        if len(line) < 3:
            question_number = int(line)
            severity = 1
        else:
            questions[question_number][severity].append(line)
            severity += 1

    df_list = []
    for question_number in questions:
        for severity in questions[question_number]:
            for text in questions[question_number][severity]:
                df_list.append(pd.DataFrame({'Question': [question_number], 'Severity': [severity], 'Text': [text]}))
    df = pd.concat(df_list, ignore_index=True)

    return df

def process_augmented_data(in_lines_file, exploded_df_path):
    """
    This function loads the augmented answer sets from BDI, a
    and processes them, outputing a dataframe and csv.

    Parameters:
    in_lines_file (str): The path to the input file containing the augmented data.
    exploded_df_path (str): The path to the saved exploded dataframe.
    embeddings_path (str): The path to the saved embeddings.

    Returns:
    DataFrame: A pandas DataFrame containing the processed augmented data.
    """
    if os.path.exists(exploded_df_path):
        print("Loading exploded dataframe from disk...")
        aug_answers_df = pd.read_csv(exploded_df_path)
    else:
        print("Generating exploded augmented answers dataframe...")
        # Load the augmented data
        aug_answers_df = generate_answers_df(in_lines_file)

        # Split the answers into individual sentences
        aug_answers_df['Text'] = aug_answers_df['Text'].str.split(',')
        aug_answers_df = aug_answers_df.explode('Text')

        # Save the exploded dataframe and embeddings
        aug_answers_df.to_csv(exploded_df_path, index=False)
        print(f"Exploded dataframe saved to {exploded_df_path}.")
    print("Augmented answer sets processed.\n")
    return aug_answers_df

In [82]:
augmented_answers = process_augmented_data('augmented_answer_sets.txt', 'augmented_answer_sets.csv')
augmented_answers.head()

Loading exploded dataframe from disk...
Augmented answer sets processed.



Unnamed: 0,Question,Severity,Text
0,1,1,I do not feel sad
1,1,1,I am not experiencing sadness
2,1,1,I don’t feel down
3,1,1,I’m not feeling unhappy
4,1,1,I’m not feeling melancholy


So now we have **augmented_answers** coming in
as well as **clean_ef_data** coming in.

Now let's use our embedding processor to make embeddings for both.

In [83]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity


In [12]:
class EmbeddingProcessor:
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)


    def calculate_similarity_sum(self, input_text, aug_answers_df, df_column):
        """
        Calculates the similarity sum between the input text and each answer text in the given DataFrame column.

        Args:
            input_text (str): The input text to compare against.
            aug_answers_df (pandas.DataFrame): The DataFrame containing the answer texts.
            df_column (str): The column name in the DataFrame containing the answer texts.

        Returns:
            float: The sum of cosine similarity scores between the input text and each answer text.
        """
        inputs_1 = self.tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(self.device)
        outputs_1 = self.model(**inputs_1)
        cs_sum = 0
        for answer_text in aug_answers_df[df_column]:
            inputs_2 = self.tokenizer(answer_text, return_tensors='pt', padding=True, truncation=True).to(self.device)
            outputs_2 = self.model(**inputs_2)
            cs = F.cosine_similarity(outputs_1.last_hidden_state.mean(dim=1),
                                     outputs_2.last_hidden_state.mean(dim=1))
            cs_sum += cs.item()
        return cs_sum


    def calculate_similarity_for_row(self, row, corresponding_answer):
        cs_sum = self.calculate_similarity_sum(row['TEXT'], corresponding_answer, 'Text')
        return cs_sum

    def similarity_sum_over_col(self, persons_and_emotions_df, augmented_exploded_df, question_num):
        """
        Calculates the similarity sum over a specific bdi query in the persons_and_emotions_df DataFrame.
        This function calculates the cosine similarity for each row in the dataframe for the specific question.

        Args:
            persons_and_emotions_df (DataFrame): The DataFrame containing persons and emotions data.
            augmented_exploded_df (DataFrame): The DataFrame containing augmented and exploded data.
            question_num (int): The question number for which the similarity sum is calculated.

        Returns:
            DataFrame: The updated persons_and_emotions_df DataFrame with the similarity sum column added.

        Raises:
            None
        """
        save_name = f'cosine_similarity_q{question_num}'
        if os.path.exists(save_name):
            persons_and_emotions_df = pd.read_csv(save_name)
        else:
            persons_and_emotions_df[f'SIM_{question_num}'] = ''
            corresponding_answer = augmented_exploded_df[(augmented_exploded_df['Question'] == question_num)]

            cs_sums = []
            for _, row in persons_and_emotions_df.iterrows():
                cs_sum = self.calculate_similarity_for_row(row, corresponding_answer)
                cs_sums.append(cs_sum)

            persons_and_emotions_df[f'SIM_{question_num}'] = cs_sums
            persons_and_emotions_df = persons_and_emotions_df.sort_values(by=f'SIM_{question_num}', ascending=False)
            persons_and_emotions_df.to_csv(save_name, index=False)
            print(f"Data saved to {save_name}.")
        return persons_and_emotions_df

In [13]:
ep = EmbeddingProcessor()

In [84]:
# Give scores to the input emotions dataframe
device = torch.device('cuda')
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
input_texts = list(clean_ef_data['TEXT'])

for col_idx in range(1, 22):
  col_name = f"max_cs_{col_idx}"
  clean_ef_data[col_name] = -1.0
  print("Processing column ", col_idx)


  aug_answers = augmented_answers[augmented_answers['Question'] == col_idx]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]
  right_inputs = [answer for answer in list(aug_answers['Text'])]
  right_embeddings = torch.tensor(model.encode(right_inputs)).to(device)


  for i, input_text in tqdm(enumerate(input_texts), total=len(input_texts)):

    left_embedding = torch.tensor(model.encode([input_text])).to(device)

    # Vectorized cosine similarity calculation
    similarities = cosine_similarity(left_embedding[:, None], right_embeddings)

    # Find max similarity
    max_similarity, _ = torch.max(similarities, dim=1)  # _ for unused index

    # Update clean_ef_data
    clean_ef_data.loc[i, col_name] = max_similarity.cpu().numpy()


  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  1


100%|██████████| 27981/27981 [04:08<00:00, 112.82it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  2


100%|██████████| 27981/27981 [04:07<00:00, 113.24it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  3


100%|██████████| 27981/27981 [04:07<00:00, 113.05it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  4


100%|██████████| 27981/27981 [04:10<00:00, 111.48it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  5


100%|██████████| 27981/27981 [04:07<00:00, 113.20it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  6


100%|██████████| 27981/27981 [04:11<00:00, 111.22it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  7


100%|██████████| 27981/27981 [04:08<00:00, 112.58it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  8


100%|██████████| 27981/27981 [04:29<00:00, 103.94it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  9


100%|██████████| 27981/27981 [04:10<00:00, 111.52it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  10


100%|██████████| 27981/27981 [04:13<00:00, 110.50it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  11


100%|██████████| 27981/27981 [04:14<00:00, 110.00it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  12


100%|██████████| 27981/27981 [04:16<00:00, 109.08it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  13


100%|██████████| 27981/27981 [04:15<00:00, 109.56it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  14


100%|██████████| 27981/27981 [04:32<00:00, 102.58it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  15


100%|██████████| 27981/27981 [04:14<00:00, 109.82it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  16


100%|██████████| 27981/27981 [04:14<00:00, 109.77it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  17


100%|██████████| 27981/27981 [04:13<00:00, 110.17it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  18


100%|██████████| 27981/27981 [04:19<00:00, 107.89it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  19


100%|██████████| 27981/27981 [04:12<00:00, 110.77it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  20


100%|██████████| 27981/27981 [04:11<00:00, 111.14it/s]
  aug_answers = aug_answers[augmented_answers['Severity'] > 1]


Processing column  21


100%|██████████| 27981/27981 [04:12<00:00, 110.72it/s]


In [5]:
# Save the clean ef data as ranking_data
# clean_ef_data.to_csv('ranking_augmented_data.csv')
clean_ef_data = pd.read_csv('ranking_augmented_data.csv')
clean_ef_data.head()

Unnamed: 0.1,Unnamed: 0,docid,TEXT,max_cs_1,max_cs_2,max_cs_3,max_cs_4,max_cs_5,max_cs_6,max_cs_7,...,max_cs_12,max_cs_13,max_cs_14,max_cs_15,max_cs_16,max_cs_17,max_cs_18,max_cs_19,max_cs_20,max_cs_21
0,0,s_1287_153_9,I mean what the hell bro.,0.986091,0.992475,0.991631,0.993185,0.995528,0.99444,0.994651,...,0.988832,0.995181,0.994774,0.989906,0.986639,0.995369,0.992893,0.998584,0.997561,0.995222
1,1,s_1287_187_0,"Yeah, crazy isn't it?",0.986091,0.992475,0.991631,0.993185,0.994686,0.99444,0.99465,...,0.988832,0.995181,0.994774,0.989905,0.986639,0.995369,0.992893,0.998584,0.997561,0.995222
2,2,s_1287_204_0,No :( sadly it doesn't have everything,0.986091,0.992475,0.991631,0.993185,0.995528,0.99444,0.994651,...,0.988832,0.995181,0.994774,0.989906,0.986639,0.995369,0.992893,0.998584,0.997561,0.995222
3,3,s_1287_222_4,I'm worried.,0.986091,0.992475,0.991631,0.993185,0.995528,0.99444,0.99465,...,0.988832,0.995181,0.994774,0.989905,0.986639,0.995369,0.992893,0.998584,0.997561,0.995222
4,4,s_1287_240_1,Better weapons and going against a weaker team...,0.986091,0.992475,0.991631,0.993185,0.995528,0.99444,0.994651,...,0.988832,0.995181,0.994774,0.989906,0.986639,0.995369,0.992893,0.998584,0.997561,0.995222


In [6]:
consensus_labels = pd.read_csv("g_rels_consenso.csv")
consensus_labels.head()

Unnamed: 0,query,q0,docid,rel
0,1,0,s_405_1279_15,1
1,1,0,s_2519_356_0,0
2,1,0,s_2038_51_7,1
3,1,0,s_975_61_2,0
4,1,0,s_577_923_1,1


In [7]:
def assign_correct_class(row):
    if pd.isna(row['TEXT']):
        if row['rel'] == 1:
            return 0
        else:
            return 1
    else:
        if row['rel'] == 1:
            return 1
        else:
            return 0

In [28]:
!pip install torcheval
from sklearn.metrics import average_precision_score
from torcheval.metrics.functional import retrieval_precision

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


In [32]:
def calculate_metrics(merged_df, ranking_column):
    """
    Calculates relevant metrics for the given pandas DataFrame.

    Args:
       merged_df (pandas.DataFrame): The DataFrame containing necessary columns.
       ranking_column (str, optional): The name of the column containing ranking scores.

    Returns:
       A dictionary containing calculated metric scores.
    """

    # Calculate metrics
    metrics = {}

    # Precision@10
    metrics['precision_at_10'] = merged_df['correct'].iloc[:10].sum() / 10

    # R-Precision with dropped NaN values
    # merged_df_for_r = merged_df.dropna(subset=[ranking_column])
    input = torch.tensor(list(merged_df[ranking_column]))
    target = torch.tensor(list(merged_df['rel']))
    metrics['r_precision'] = float(retrieval_precision(input, target))

    # Average Precision
    # ones_array = np.ones_like(merged_df['correct'])
    metrics['average_precision'] = average_precision_score(merged_df['pred_rel'], merged_df['rel'])

    # Uncomment for NDGC@1000:
    # from sklearn.metrics import ndcg_score
    # metrics['ndcg_1000'] = ndcg_score(ones_array, merged_df['correct'], k=1000)

    return metrics

In [33]:
metrics_dict = {}
for i in range(1, 22):
  col_of_interest = f"max_cs_{i}"

  sorted_i = clean_ef_data.sort_values(by=[col_of_interest], ascending=False)

  consensus_labels_i = consensus_labels[consensus_labels['query'] == i]

  merged_i = sorted_i.merge(consensus_labels_i, on='docid', how='right')

  merged_i = merged_i[['docid', 'TEXT', col_of_interest, 'q0', 'rel']]
  merged_i = merged_i.head(1000)

  # Set up 'predicted' column
  merged_i['pred_rel'] = np.where(merged_i[col_of_interest].isna(), 0, 1)

  merged_i['correct'] = -1

  merged_i['correct'] = merged_i.apply(assign_correct_class, axis=1)

  metrics_dict[f"metrics_q{i}"] = calculate_metrics(merged_i, col_of_interest)

Calculate average metrics

In [40]:
metrics_df = pd.DataFrame(metrics_dict).T

metrics_keys = ['precision_at_10', 'r_precision', 'average_precision']

overall_p10 = metrics_df['precision_at_10'].mean()
overall_r = metrics_df['r_precision'].mean()
overall_precision = metrics_df['average_precision'].mean()


metrics_df.loc['overall_avg'] = [overall_p10, overall_r, overall_precision]
metrics_df.to_csv('overall_metrics.csv')

In [36]:
i = 1
col_of_interest = f"max_cs_{i}"

sorted_i = clean_ef_data.sort_values(by=[col_of_interest], ascending=False)

consensus_labels_i = consensus_labels[consensus_labels['query'] == i]

merged_i = sorted_i.merge(consensus_labels_i, on='docid', how='right')

merged_i = merged_i[['docid', 'TEXT', col_of_interest, 'q0', 'rel']]

merged_i['pred_rel'] = np.where(merged_i['max_cs_1'].isna(), 0, 1)

merged_i.head(100)

Unnamed: 0,docid,TEXT,max_cs_1,q0,rel,pred_rel
0,s_405_1279_15,,,0,1,0
1,s_2519_356_0,,,0,0,0
2,s_2038_51_7,Since I feel rejected i have been feeling sad.,0.986091,0,1,1
3,s_975_61_2,,,0,0,0
4,s_577_923_1,I am sad waiting.,0.986091,0,1,1
...,...,...,...,...,...,...
95,s_1833_118_11,,,0,0,0
96,s_414_319_0,,,0,0,0
97,s_1811_1332_2,,,0,0,0
98,s_1636_312_2,I'm actually sad now.,0.986091,0,1,1


In [37]:
unfiltered_trec = pd.read_csv("../Project-Part-2-Data/tabulated_cleaned_unfiltered_trec.csv")

In [39]:
unfiltered_trec[unfiltered_trec['docid'] == 's_915_45_49']

Unnamed: 0,docid,PRE,TEXT,POST
1716623,s_915_45_49,,I would spend days unable to get out of my bed...,
