In [1]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim import models
import ast



# Load and Clean data

In [2]:
# import xlsx file
raw_df = pd.read_excel('../Data/clean_ExpertAnnotated_survey_data_AI4Journalist.xlsx') 

In [3]:
# extract the columns: group, ResponseId, Finished, 1_writing_a, 1_writing_b, 2_writing_a, 2_writing_b
# combine 1_writing_a and 2_writing_a into one column, combone 1_writing_b and 2_writing_b into one column
# rename the columns to: group, ResponseId, Finished, writing_a, writing_b

df = raw_df[['group', 'ResponseId', 'Finished', '1_writing_a', '1_writing_b', '2_writing_a', '2_writing_b']]
df['writing_a'] = df['1_writing_a'].fillna('') + df['2_writing_a'].fillna('')
df['writing_b'] = df['1_writing_b'].fillna('') + df['2_writing_b'].fillna('')
df = df[['group', 'ResponseId', 'Finished', 'writing_a', 'writing_b']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['writing_a'] = df['1_writing_a'].fillna('') + df['2_writing_a'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['writing_b'] = df['1_writing_b'].fillna('') + df['2_writing_b'].fillna('')


In [4]:
import string
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [5]:
df['writing_a_clean_lst'] = df['writing_a'].str.lower()
df['writing_a_clean_lst'] = df['writing_a_clean_lst'].apply(remove_punctuation)
df['writing_a_clean_lst'] = df['writing_a_clean_lst'].str.split()

df['writing_b_clean_lst'] = df['writing_b'].str.lower()
df['writing_b_clean_lst'] = df['writing_b_clean_lst'].apply(remove_punctuation)
df['writing_b_clean_lst'] = df['writing_b_clean_lst'].str.split()

# Within Subject

## Word2Vec embedding

In [6]:
# use the pre-trained word vectors from Google News

# load the pre-trained word vectors
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

def get_cosine_similarity(writing_a_lst: list, writing_b_lst: list):
    if not writing_a_lst or not writing_b_lst:
        return 0
    else:
        words_a = [word for word in writing_a_lst if word in word2vec_model.key_to_index]
        words_b = [word for word in writing_b_lst if word in word2vec_model.key_to_index]
        if len(words_a) == 0 or len(words_b) == 0:
            return 0
        else:
            vector_a = np.mean([word2vec_model[word] for word in words_a], axis=0)
            vector_b = np.mean([word2vec_model[word] for word in words_b], axis=0)
            norm_a = np.linalg.norm(vector_a)
            norm_b = np.linalg.norm(vector_b)
            if norm_a == 0 or norm_b == 0:
                return 0
            return np.dot(vector_a, vector_b) / (norm_a * norm_b)
        
def get_unnormalized_dot_product(writing_a_lst: list, writing_b_lst: list):
    if not writing_a_lst or not writing_b_lst:
        return 0
    else:
        words_a = [word for word in writing_a_lst if word in word2vec_model.key_to_index]
        words_b = [word for word in writing_b_lst if word in word2vec_model.key_to_index]
        if len(words_a) == 0 or len(words_b) == 0:
            return 0
        else:
            vector_a = np.mean([word2vec_model[word] for word in words_a], axis=0)
            vector_b = np.mean([word2vec_model[word] for word in words_b], axis=0)
            return np.dot(vector_a, vector_b)

In [None]:
df['cosine_similarity_w2v'] = df.apply(lambda row: get_cosine_similarity(row['writing_a_clean_lst'], row['writing_b_clean_lst']), axis=1)
df['unnormalized_dot_product_w2v'] = df.apply(lambda row: get_unnormalized_dot_product(row['writing_a_clean_lst'], row['writing_b_clean_lst']), axis=1)

## OpenAI embedding

In [None]:
from openai import OpenAI
client = OpenAI()

def get_openai_embedding(text, model="text-embedding-3-small"):
   if text is None or text == "":
        return 0
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [None]:
# get the OpenAI embeddings for writing_a and writing_b
# least pre-processing, so use raw texts
df['openai_embedding_a'] = df['writing_a'].apply(get_openai_embedding)
df['openai_embedding_b'] = df['writing_b'].apply(get_openai_embedding)

In [None]:
# def get_cosine_similarity_openai(writing_a: str, writing_b: str):
#     embedding_a = get_openai_embedding(writing_a)
#     embedding_b = get_openai_embedding(writing_b)
#     norm_a = np.linalg.norm(embedding_a)
#     norm_b = np.linalg.norm(embedding_b)
#     if norm_a == 0 or norm_b == 0:
#         return 0
#     return np.dot(embedding_a, embedding_b) / (norm_a * norm_b)

# def get_unnormalized_dot_product_openai(writing_a: str, writing_b: str):
#     embedding_a = get_openai_embedding(writing_a)
#     embedding_b = get_openai_embedding(writing_b)
#     return np.dot(embedding_a, embedding_b)

def get_cosine_similarity_openai(embedding_a, embedding_b):
    norm_a = np.linalg.norm(embedding_a)
    norm_b = np.linalg.norm(embedding_b)
    if norm_a == 0 or norm_b == 0:
        return 0
    return np.dot(embedding_a, embedding_b) / (norm_a * norm_b)

def get_unnormalized_dot_product_openai(embedding_a, embedding_b):
    return np.dot(embedding_a, embedding_b)

def get_euclidean_distance_openai(embedding_a, embedding_b):
    return np.linalg.norm(np.array(embedding_a) - np.array(embedding_b))

# openai embedding output is normalized, so cosine similarity is the dot product of the embeddings
df["cosine_similarity_openai"] = df.apply(lambda row: get_cosine_similarity_openai(row['openai_embedding_a'], row['openai_embedding_b']), axis=1)
# df["unnormalized_dot_product_openai"] = df.apply(lambda row: get_unnormalized_dot_product_openai(row['openai_embedding_a'], row['openai_embedding_b']), axis=1)
df["euclidean_distance_openai"] = df.apply(lambda row: get_euclidean_distance_openai(row['openai_embedding_a'], row['openai_embedding_b']), axis=1)

## LDA

In [None]:
from gensim import corpora, models
from gensim.matutils import cossim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Combine all words from both columns for dictionary creation
all_words = df['writing_a_clean_lst'].tolist() + df['writing_b_clean_lst'].tolist()

# Flatten the list of lists
all_words_flat = [word for sublist in all_words for word in sublist]

# Create dictionary and corpus
dictionary = corpora.Dictionary([all_words_flat])
corpus = [dictionary.doc2bow(text) for text in df['writing_a_clean_lst'].tolist() + df['writing_b_clean_lst'].tolist()]

# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=2, id2word=dictionary)

# Function to get LDA vector
def get_lda_vector(word_lst: list):
    bow = dictionary.doc2bow(word_lst)
    lda_vector = lda_model[bow]
    return lda_vector

# Similarity calculation function using LDA vectors
def get_cosine_similarity_lda(writing_a_lst, writing_b_lst):
    if not writing_a_lst or not writing_b_lst:
        return 0
    vector_a = get_lda_vector(writing_a_lst)
    vector_b = get_lda_vector(writing_b_lst)
    return cossim(vector_a, vector_b)

# def get_unnormalized_dot_product_lda(writing_a_lst, writing_b_lst):
#     if not writing_a_lst or not writing_b_lst:
#         return 0
#     vector_a = get_lda_vector(writing_a_lst)
#     vector_b = get_lda_vector(writing_b_lst)
#     return np.dot(vector_a, vector_b)

# Apply the similarity function to each row of the DataFrame
df['lda_cosine_similarity'] = df.apply(lambda row: get_cosine_similarity_lda(row['writing_a_clean_lst'], row['writing_b_clean_lst']), axis=1)
# df['lda_unnormalized_dot_product'] = df.apply(lambda row: get_unnormalized_dot_product_lda(row['writing_a_clean_lst'], row['writing_b_clean_lst']), axis=1)

In [None]:
df.to_excel('../Data/within_subject_similarity.xlsx', index=False)

# Between Subject

In [7]:
similarity_df = pd.read_excel('../Data/within_subject_similarity.xlsx')
similarity_df['writing_a_clean_lst'] = similarity_df.writing_a_clean_lst.apply(ast.literal_eval)
similarity_df['writing_b_clean_lst'] = similarity_df.writing_b_clean_lst.apply(ast.literal_eval)

In [8]:
expert_annotated_df = pd.read_csv('../Data/annotated_data_240520.csv')

In [9]:
# merge df and expert_evaluated_df
# drop rows with NA journalist value
merged_df = pd.merge(similarity_df, expert_annotated_df[['ResponseId', 'journalist']], on='ResponseId', how='inner').dropna(subset=['journalist']).reset_index(drop=True)
merged_df.journalist = merged_df.journalist.apply(lambda x: int(x))

In [10]:
merged_df

Unnamed: 0,group,ResponseId,Finished,writing_a,writing_b,writing_a_clean_lst,writing_b_clean_lst,cosine_similarity_w2v,unnormalized_dot_product_w2v,openai_embedding_a,openai_embedding_b,cosine_similarity_openai,euclidean_distance_openai,lda_cosine_similarity,journalist
0,2,R_4dnZuujLN4Mn9um,True,"""New study reveals complex impact of psychiatr...","\n""Study finds alarming link between gun viole...","[new, study, reveals, complex, impact, of, psy...","[study, finds, alarming, link, between, gun, v...",0.810496,0.713841,"[-0.025056514889001846, -0.03770783916115761, ...","[0.023708993569016457, 0.02937544323503971, 0....",0.448127,1.050593,0.524188,1
1,1,R_6G8TRK7mezCmvlE,True,🧠💡 New Insights in Managing Suicide Risk! Rece...,🚨 New study reveals a stark reality: Exposure ...,"[🧠💡, new, insights, in, managing, suicide, ris...","[🚨, new, study, reveals, a, stark, reality, ex...",0.825563,0.754296,"[-0.018861589953303337, -0.020371491089463234,...","[0.020731830969452858, 0.016379600390791893, 0...",0.549861,0.94883,0.038555,2
2,2,R_7GUlxEwsm4ANY9n,True,"""New study finds psychiatric hospitalization r...",A Study on 3015 Black adults shows GVE signifi...,"[new, study, finds, psychiatric, hospitalizati...","[a, study, on, 3015, black, adults, shows, gve...",0.740004,0.790497,"[-0.015813207253813744, -0.028458785265684128,...","[0.04760603606700897, 0.021079791709780693, 0....",0.41252,1.083956,0.043443,3
3,2,R_2fPKnwwJ3SXlNdN,True,Psychiatric hospitalization reduces suicide ri...,New study by @AMAJournal reveals alarming asso...,"[psychiatric, hospitalization, reduces, suicid...","[new, study, by, amajournal, reveals, alarming...",0.783746,0.683887,"[-0.006001750472933054, -0.028704537078738213,...","[0.025258364155888557, 0.012829821556806564, 0...",0.507656,0.992314,0.047468,4
4,1,R_2saDzTZSaCPpZNs,True,"""New study in @JAMAPsych shows psychiatric hos...","""New research suggests that reducing gun viole...","[new, study, in, jamapsych, shows, psychiatric...","[new, research, suggests, that, reducing, gun,...",0.682288,0.733727,"[-0.02892470918595791, -0.03582962229847908, 0...","[0.054963596165180206, 0.011303708888590336, 0...",0.522809,0.976924,0.750716,5
5,1,R_2g7nDxYeFPr352v,True,Unraveling the puzzle of psychiatric hospitali...,"""Research shows stark connections: Gun violenc...","[unraveling, the, puzzle, of, psychiatric, hos...","[research, shows, stark, connections, gun, vio...",0.80981,0.757519,"[-0.0008465779246762395, -0.024789869785308838...","[0.022167474031448364, 0.019238609820604324, 0...",0.51612,0.983748,0.249582,6
6,1,R_63y8AAf8TLPGRO3,True,New study reveals: Psychiatric hospitalization...,"""New study reveals a strong link between gun v...","[new, study, reveals, psychiatric, hospitaliza...","[new, study, reveals, a, strong, link, between...",0.772005,0.815978,"[-0.025526609271764755, -0.01785901188850403, ...","[0.021031426265835762, 0.018929390236735344, 0...",0.551211,0.947406,0.047959,7
7,2,R_2Pe7Wu3hNtg2ZU6,True,🔍 New study finds psychiatric hospitalization ...,🔫🧠 Study finds Black adults exposed to gun vio...,"[🔍, new, study, finds, psychiatric, hospitaliz...","[🔫🧠, study, finds, black, adults, exposed, to,...",0.731112,0.827849,"[-0.042180195450782776, -0.018665548413991928,...","[0.03014543652534485, 0.04210395738482475, 0.0...",0.481462,1.01837,0.998329,8
8,2,R_4Ld8n99yqQacpmu,True,"""💡 Think psychiatric hospitalization's the ult...","""New study finds alarming link between gun vio...","[💡, think, psychiatric, hospitalizations, the,...","[new, study, finds, alarming, link, between, g...",0.809953,0.653381,"[-0.011265125125646591, -0.04513534903526306, ...","[0.0172509104013443, 0.037992093712091446, 0.0...",0.442072,1.056341,0.041147,9
9,1,R_7EnJjQXJawFTRwm,True,New study shows psychiatric hospitalization re...,"""Study reveals Black individuals exposed to gu...","[new, study, shows, psychiatric, hospitalizati...","[study, reveals, black, individuals, exposed, ...",0.772831,0.963211,"[-0.01255673449486494, -0.03516360744833946, 0...","[0.02446923218667507, 0.013484450988471508, 0....",0.570545,0.926774,0.999906,10


In [38]:
# generate the similarity score matrix
# Create similarity matrices
num_users = len(merged_df) # 29
similarity_matrix_a = np.zeros((num_users, num_users))
similarity_matrix_b = np.zeros((num_users, num_users))

# Calculate the similarity for writing_a
for i in range(num_users):
    for j in range(num_users):
        similarity_matrix_a[i, j] = get_cosine_similarity(merged_df.loc[i, 'writing_a_clean_lst'], merged_df.loc[j, 'writing_a_clean_lst'])
        similarity_matrix_b[i, j] = get_cosine_similarity(merged_df.loc[i, 'writing_b_clean_lst'], merged_df.loc[j, 'writing_b_clean_lst'])

# Convert matrices to DataFrames for better readability
similarity_df_a = pd.DataFrame(similarity_matrix_a, columns=list(range(1,30)))
similarity_df_b = pd.DataFrame(similarity_matrix_b, columns=list(range(1,30)))

In [39]:
similarity_df_a.to_excel('../Data/between_subject_similarity_matrix_a.xlsx', index=False)
similarity_df_b.to_excel('../Data/between_subject_similarity_matrix_b.xlsx', index=False)

In [40]:
similarity_df_b

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,20,21,22,23,24,25,26,27,28,29
0,1.0,0.897777,0.865769,0.927274,0.814605,0.849445,0.897746,0.899409,0.888698,0.84319,...,0.885971,0.857606,0.893579,0.873972,0.980201,0.88358,0.829955,0.889992,0.852783,0.851562
1,0.897777,1.0,0.88639,0.882154,0.864359,0.858079,0.926472,0.893035,0.868674,0.866818,...,0.93351,0.861892,0.910783,0.848377,0.90298,0.87764,0.813705,0.894445,0.823056,0.850992
2,0.865769,0.88639,1.0,0.896055,0.839453,0.868302,0.891039,0.851946,0.888574,0.828325,...,0.862271,0.894841,0.904236,0.840616,0.87808,0.85945,0.898116,0.86605,0.841487,0.893639
3,0.927274,0.882154,0.896055,1.0,0.814755,0.824425,0.892607,0.869157,0.885526,0.836706,...,0.872986,0.856752,0.89344,0.848497,0.942968,0.874131,0.866142,0.870952,0.836712,0.883679
4,0.814605,0.864359,0.839453,0.814755,1.0,0.788036,0.867569,0.789142,0.840695,0.767226,...,0.876847,0.81436,0.839335,0.789835,0.831192,0.813671,0.846855,0.769716,0.802049,0.817092
5,0.849445,0.858079,0.868302,0.824425,0.788036,1.0,0.855494,0.845567,0.894111,0.861864,...,0.869659,0.887042,0.858907,0.880448,0.869259,0.851684,0.855571,0.879893,0.838531,0.894163
6,0.897746,0.926472,0.891039,0.892607,0.867569,0.855494,1.0,0.870008,0.913901,0.851123,...,0.9316,0.88298,0.895631,0.875633,0.916188,0.878453,0.86026,0.885978,0.849385,0.865981
7,0.899409,0.893035,0.851946,0.869157,0.789142,0.845567,0.870008,1.0,0.852274,0.880922,...,0.874532,0.896833,0.946425,0.808719,0.899147,0.838027,0.786266,0.880878,0.755726,0.832224
8,0.888698,0.868674,0.888574,0.885526,0.840695,0.894111,0.913901,0.852274,1.0,0.849303,...,0.892467,0.884985,0.873981,0.911276,0.909959,0.889372,0.905538,0.867178,0.893887,0.908533
9,0.84319,0.866818,0.828325,0.836706,0.767226,0.861864,0.851123,0.880922,0.849303,1.0,...,0.872618,0.839869,0.84962,0.791084,0.86652,0.794668,0.767258,0.86131,0.751409,0.840067
