### Create_articlesEmbeddings

In [1]:
# load packages
import pandas as pd
import spacy
import pickle
import os
import random
import numpy as np
os.getcwd()

'c:\\Users\\xenia\\OneDrive\\Desktop\\Personalisation in public media\\Assignment_2'

In [2]:
# open cleaned data
with open("C:/Users/xenia/OneDrive/Desktop/Personalisation in public media/Assignment_2/articles_cleaned.pkl", "rb") as file:
    articles = pickle.load(file)

In [3]:
# word2vec model 

import gensim
from gensim.models import Word2Vec

#vector model
def getWordVectors(tokenized_texts):
    SIZE = 100 # dimensions of the embeddings
    SG = 1 # whether to use skip-gram or CBOW (we use skip-gram)
    WINDOW = 10 # the window size
    N_WORKERS = 1 # number of workers to use
    MIN_COUNT = 1

    model = Word2Vec(vector_size=SIZE,
                    sg=SG,
                    window=WINDOW, 
                    min_count=MIN_COUNT,
                    workers=N_WORKERS)

    model.build_vocab(tokenized_texts)

    model.train(tokenized_texts,
            total_examples=model.corpus_count,
            epochs=model.epochs) 
    
    return model



In [4]:
# article vectors
articles_vectors = getWordVectors(articles["cleaned_article"])

In [5]:
# find similar words function --> finds the top 10 similar words for each trigger 
def findSimilarWords(wordEmbeddings, wordDict):
    similarWordsDict = {}
    for word in wordDict:
        # check if the word is present in the vocabulary
        if word in wordEmbeddings.wv.key_to_index:
            # add similar words for this word to the dictionary, including the word itself
            similarWordsDict[word] = [word] + [w[0] for w in wordEmbeddings.wv.most_similar(word, topn=10)]
    return similarWordsDict

# Top N similar words for the articles
# trigger words are the same used in the survey translated in Dutch
# Some of these, we ended up not using because they appeared too little in the corpus to
# Make good embeddings
trigger_words = {
    "zelfmoord": None,
    "ongelukken": None,
    "fysieke_mishandeling": None,
    "seksueel_misbruik": None,
    "zelfbeschadiging": None,
    "depressie": None,
    "racisme": None,
    "LGBTQ+-discriminatie": None,
    "eetstoornissen": None,
    "gehandicaptendiscriminatie": None,
    "dierenmishandeling": None
}
similarWords_articles = findSimilarWords(articles_vectors,trigger_words)



In [10]:
# Compute trigger concept vectors by averaging the word vectors for the trigger words and the top 10 similar words associated with each trigger concept.    

def create_trigger_vecs(similarWords_articles,articles_vectors):
    """
    Parameters:
    - similarWords_articles(dict): A dictionary where keys are trigger concepts and values are lists of words associated with each trigger created in previous code.
    - article_vectors (KeyedVectors): Pre-trained word vectors from previous code.
    """
    trigger_vecs = {}
    for trigger, words in similarWords_articles.items():
        vecs = [articles_vectors.wv.get_vector(word) for word in words if word in articles_vectors.wv.key_to_index]
        trigger_vecs[trigger] = np.mean(vecs, axis=0)

    return trigger_vecs # A dictionary where keys are trigger concepts and values are mean vectors for the associated words.
    
trigger_vecs = create_trigger_vecs(similarWords_articles,articles_vectors)

In [7]:
# optimized code using chatgp

def calculate_trigger_scores_df(articles, trigger_vecs):
     # Create a new DataFrame with the same index as the input articles DataFrame and columns for each trigger word
    trigger_scores_df = pd.DataFrame(index=articles.index, columns=trigger_vecs.keys())

    # precompute article vectors
    article_vecs = []

     #  average vector representations of each article in a list
    for article_idx in articles.index:
        article_text = ' '.join(articles.loc[article_idx, 'cleaned_article'])
        word_vecs = [articles_vectors.wv.get_vector(word) for word in article_text.split() if word in articles_vectors.wv.key_to_index]
        article_vec = np.mean(word_vecs, axis=0)
        article_vecs.append(article_vec)

    
    # average vector representation of each trigger and trigger words(top 10 most similar words to trigger)
    for trigger in trigger_vecs:
        similar_words = [word[0] for word in articles_vectors.wv.most_similar(positive=[trigger], topn=10)]
        similar_vecs = [articles_vectors.wv.get_vector(word) for word in similar_words if word in articles_vectors.wv.key_to_index]
        similar_vecs.append(trigger_vecs[trigger])
        combined_vec = np.mean(similar_vecs, axis=0)
        # the cosine similarity between the article vector and the combined vector(includes trigger word + top 10 most similar words) for the current trigger word
        for i, article_idx in enumerate(articles.index):
            article_vec = article_vecs[i]
            similarity = np.dot(article_vec, combined_vec) / (np.linalg.norm(article_vec) * np.linalg.norm(combined_vec))
            trigger_scores_df.at[article_idx, trigger] = similarity

        
    # the trigger_scores_df DataFrame as output with each trigger scores per trigger per article
    return trigger_scores_df





In [8]:
articles_embeddings = calculate_trigger_scores_df(articles,trigger_vecs)

# Rename columns
articles_embeddings = articles_embeddings.rename(columns={'zelfmoord': 'guard_suicide', 'ongelukken': 'guard_accidents','zelfbeschadiging': 'guard_selfHarm','depressie':'guard_depression',
                        'racisme':'guard_racism','eetstoornissen':'guard_eatingDisorders'})

# Save dataframe as CSV file
articles_embeddings.to_csv('articles_embeddings.csv', index=False)  

articles_embeddings  

#Limitation: If a word is not present in the dictionary (in wordEmbeddings.wv.key_to_index), 
#  the findSimilarWords() function will not add it to the similarWordsDict dictionary.
#  And that trigger word will not have any associated similar words in the output. For this reason, only 6 of the triggers are in the final dataframe.

Unnamed: 0,guard_suicide,guard_accidents,guard_selfHarm,guard_depression,guard_racism,guard_eatingDisorders
0,0.68904,0.7962,0.791906,0.673404,0.623649,0.817572
1,0.683451,0.825734,0.757744,0.603,0.538758,0.753499
2,0.619136,0.794339,0.764686,0.650075,0.465704,0.784149
3,0.654318,0.664056,0.791714,0.648363,0.640479,0.819716
4,0.618715,0.746867,0.758392,0.577344,0.443122,0.75944
...,...,...,...,...,...,...
22596,0.621547,0.670748,0.774785,0.634767,0.519427,0.774898
22597,0.659439,0.904112,0.777663,0.685988,0.485242,0.791
22598,0.743001,0.752128,0.81698,0.669132,0.658693,0.829151
22599,0.618995,0.703332,0.785767,0.622811,0.562866,0.796512
