In [None]:
# Connect with drive
# Mounts Google Drive at /content/drive; the project and dataset folders
# configured below live under that mount point. google.colab is only
# importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import libraries
import pandas as pd
from gensim import corpora, models, utils
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
import os

In [None]:
# Google Drive locations used by the rest of the notebook.
# Both folders sit under the same shared project root on the mounted drive.
_drive_project_root = "/content/drive/MyDrive/2024SUDSProject/"
project_folder = _drive_project_root + "Morality/"
dataset_folder = _drive_project_root + "processedNewsData/"

In [None]:
# Declaring labels
# Moral-foundation score columns. The position of each entry is also used as
# the seeded topic index by create_emfd_priori / create_eta, so the order matters.
scope = [
    'care.virtue', 'fairness.virtue', 'loyalty.virtue', 'authority.virtue', 'sanctity.virtue',
    'care.vice', 'fairness.vice', 'loyalty.vice', 'authority.vice', 'sanctity.vice'
]

# Outlet bias categories, in display order
label_order = [
    'left', 'left-center', 'center', 'right-center', 'right', 'extreme-right', 'conspiracy-pseudoscience','questionable-source'
]

# Years to process. The full range would be '2017'-'2022'; 2017 and 2018 are
# currently left out (the earlier full-range assignment was dead code because
# it was overwritten on the very next statement).
years = ['2019', '2020', '2021', '2022']

In [None]:
# Open the moral foundation dictionary and index it by word.
# get_emfd_scores / in_emfd_score / out_emfd_score look words up through
# `emfd.index` and `emfd.loc[word, ...]`, while create_emfd_priori reads the
# 'word' column, so the column is kept (drop=False) alongside the word index.
# The index name is cleared so it cannot be confused with the 'word' column.
emfd = (
    pd.read_csv(dataset_folder + 'emfd_amp.csv')
    .set_index('word', drop=False)
    .rename_axis(None)
)

# Outlet-level labels; get_label reads the 'source' and 'bias' columns
labels = pd.read_csv(dataset_folder + 'labels_all_2022.csv')

# Pretrained word2vec vectors, used by out_emfd_score to find the dictionary
# words most similar to a word that is not itself in the dictionary
word2vec_model = KeyedVectors.load_word2vec_format(dataset_folder+'GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
def create_emfd_priori(top_n=27):
    """Build the seed-word -> topic-index mapping used to guide the LDA model.

    For every foundation column in the module-level `scope`, the `top_n`
    highest-scoring words of the module-level `emfd` table are selected and
    mapped to that foundation's position in `scope`. A word that ranks in the
    top `top_n` of several foundations is mapped to all of them.

    Args:
        top_n: Number of seed words taken per foundation. The default of 27
            was chosen by the original author as the value giving the least
            overlap between foundations with the most unique words each.

    Returns:
        dict mapping word -> list of indices into `scope`.
    """
    emfd_priori = {}

    for foundation_index, foundation in enumerate(scope):
        top_rows = emfd.nlargest(top_n, foundation)

        # The words are a regular column on a freshly loaded table but move to
        # the index once the table has been re-indexed by word; accept both.
        if 'word' in top_rows.columns:
            top_words = top_rows['word']
        else:
            top_words = top_rows.index

        # Map each top word to the current foundation index
        for word in top_words:
            emfd_priori.setdefault(word, []).append(foundation_index)

    return emfd_priori

def open_emfd_priori():
    """Return the seed-word mapping, using the pickle cache in dataset_folder.

    If `step4_emfd_priori.pkl` already exists it is unpickled and returned.
    Otherwise the mapping is built with create_emfd_priori(), pickled to that
    path, and returned.
    """
    cache_path = dataset_folder + f'step4_emfd_priori.pkl'

    # Cache miss: build once and persist for later runs
    if not os.path.exists(cache_path):
        priori = create_emfd_priori()
        with open(cache_path, 'wb') as handle:
            pickle.dump(priori, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return priori

    # Cache hit
    with open(cache_path, 'rb') as handle:
        return pickle.load(handle)

In [None]:
def open_bow(year):
    """Unpickle and return `step3_bow_{year}.pkl` from dataset_folder."""
    bow_path = dataset_folder + f'step3_bow_{year}.pkl'
    with open(bow_path, 'rb') as handle:
        return pickle.load(handle)

def open_dict(year):
    """Unpickle and return `step3_dictionary_{year}.pkl` from dataset_folder."""
    dictionary_path = dataset_folder + f'step3_dictionary_{year}.pkl'
    with open(dictionary_path, 'rb') as handle:
        return pickle.load(handle)

In [None]:
def initialize_guidedlda_v2(eta, dictionary, bow, ntopics, year):
    """Load a cached LDA model for `year`, or train a new seeded one.

    A pickled model is looked for first at
    `dataset_folder/{year}_lda_model_gensim.pkl` and then at
    `project_folder/models/{year}_lda_model_gensim.pkl`, which is where
    save_LDA_model writes. Checking both keeps a model saved by
    save_LDA_model from being silently retrained.

    Args:
        eta: Topic-word prior (see create_eta).
        dictionary: id-to-word mapping passed to gensim as `id2word`.
        bow: Bag-of-words corpus to train on.
        ntopics: Number of topics.
        year: Year used to build the cache file name.

    Returns:
        The unpickled or newly trained model. A newly trained model is NOT
        saved here; call save_LDA_model for that.
    """
    # NOTE: pickle.load executes arbitrary code; only load model files you created.
    candidate_paths = [
        dataset_folder + f'{year}_lda_model_gensim.pkl',
        project_folder + f'models/{year}_lda_model_gensim.pkl',
    ]
    for file_path in candidate_paths:
        if os.path.exists(file_path):
            with open(file_path, 'rb') as file:
                model = pickle.load(file)
            print('model loaded')
            return model

    np.random.seed(42) # set the random seed for repeatability
    print('model created')
    with np.errstate(divide='ignore'):  # ignore divide-by-zero warnings
        model = models.ldamulticore.LdaMulticore(
            corpus=bow, id2word=dictionary, num_topics=ntopics,
            random_state=42, eta=eta,
            eval_every=-1,
            passes=150, per_word_topics=False)

    return model

def create_eta(priors, etadict, ntopics):
    """Build the (ntopics, nterms) topic-word prior that seeds the LDA topics.

    Every cell starts at 1. For each prior word found in `etadict`, the cells
    of its seeded topics are set to 1e7. Each column (term) is then normalized
    so it sums to 1 across topics.

    Args:
        priors: dict mapping word -> list of topic indices to seed.
        etadict: Mapping of term id -> term whose `items()` yields
            (id, term) pairs; ids are used directly as column indices.
        ntopics: Number of topics (rows).

    Returns:
        Float ndarray of shape (ntopics, len(etadict)).

    Raises:
        IndexError: if a prior references a topic index >= ntopics.
    """
    # Float matrix from the start so the 1e7 weights are stored as-is
    eta = np.ones((ntopics, len(etadict)), dtype=float)

    # Invert id -> term once instead of scanning the vocabulary per prior word.
    # setdefault keeps the first id for a repeated term, matching a linear scan.
    term_to_index = {}
    for index, term in etadict.items():
        term_to_index.setdefault(term, index)

    for word, topics in priors.items(): # for each word in the list of priors
        index = term_to_index.get(word)
        if index is None: # prior word not in this year's vocabulary
            continue
        for topic in topics:
            eta[topic, index] = 1e7  # put a large number in there

    # normalize so that the probabilities sum to 1 over all topics
    return np.divide(eta, eta.sum(axis=0))

In [None]:
def get_topic_words_and_contributions(model, dictionary, num_top_words=100):
    """Return the top words of every topic with their contribution to it.

    Args:
        model: Object whose `get_topics()` returns one word distribution per
            topic, with columns indexed by term id.
        dictionary: Object exposing `token2id` (token -> term id).
        num_top_words: Number of top words to keep per topic.

    Returns:
        A list with one entry per topic; each entry is the list of
        (word, contribution) pairs produced by get_top_word_distributions.
    """
    topic_word_distributions = model.get_topics()

    # Column i of a topic distribution belongs to term id i, so the names are
    # ordered by id explicitly rather than by token2id's insertion order.
    # Assumes ids are contiguous from 0 — TODO confirm for dictionaries that
    # were filtered without being compacted.
    token2id = dictionary.token2id
    feature_names = sorted(token2id, key=token2id.get)

    return [
        get_top_word_distributions(topic_word_distribution, feature_names, num_top_words)
        for topic_word_distribution in topic_word_distributions
    ]

def get_top_word_distributions(topic_word_distribution, feature_names, n_top_words):
    """Pick the highest-probability words of one topic.

    Args:
        topic_word_distribution: 1-D array of word weights for a single topic.
        feature_names: Sequence of words, positionally aligned with the array.
        n_top_words: How many words to keep.

    Returns:
        List of (word, contribution) pairs in descending weight order, where
        contribution is the word's weight divided by the sum of ALL weights
        in the topic.
    """
    # Ascending argsort, reversed, then truncated -> indices of the largest weights
    descending_indices = topic_word_distribution.argsort()[::-1]
    top_word_indices = descending_indices[:n_top_words]

    # Share of the whole topic mass carried by each selected word
    # TODO: try swapping this for the total probability of top 100 words instead
    topic_total = topic_word_distribution.sum()
    contributions = topic_word_distribution[top_word_indices] / topic_total

    return [
        (feature_names[word_index], contribution)
        for word_index, contribution in zip(top_word_indices, contributions)
    ]

In [None]:
# Calculates the similarity score of a word to the moral foundation dictionary
def _emfd_by_word():
    """Return the module-level `emfd` table with words as its row index.

    A freshly loaded table keeps the words in a 'word' column; in that case a
    word-indexed copy is returned. If the 'word' column is gone, the table is
    taken to be indexed by word already and is returned unchanged.
    """
    if 'word' in emfd.columns:
        return emfd.set_index('word')
    return emfd


def get_emfd_scores(word):
    """Score `word` on every moral-foundation category in `scope`.

    Words present in the dictionary get their dictionary scores; any other
    word gets a similarity-weighted average of its nearest dictionary words
    (see out_emfd_score).

    Returns:
        dict mapping each category in `scope` to the word's score.
    """
    word_scores = {moral_foundation_category: 0 for moral_foundation_category in scope}
    emfd_table = _emfd_by_word()

    # Check if the word is in the moral foundation dictionary
    if word in emfd_table.index:
        return in_emfd_score(word_scores, word, emfd_table)
    return out_emfd_score(word_scores, word, emfd_table)


def in_emfd_score(word_scores, word, emfd_table=None):
    """Add the dictionary scores of `word` to `word_scores` (updated in place).

    Args:
        word_scores: dict keyed by the categories in `scope`.
        word: A word present in the dictionary.
        emfd_table: Word-indexed dictionary table; looked up when omitted.

    Returns:
        The same `word_scores` dict.
    """
    if emfd_table is None:
        emfd_table = _emfd_by_word()

    for moral_foundation_category in scope:
        # Adds the corresponding value for that word in emfd to word_scores
        word_scores[moral_foundation_category] += emfd_table.loc[word, moral_foundation_category]

    return word_scores


def out_emfd_score(word_scores, word, emfd_table=None, top_k=10):
    """Score an out-of-dictionary word from its most similar dictionary words.

    The `top_k` dictionary words with the highest `word2vec_model.similarity`
    to `word` are selected, and their dictionary scores are averaged using the
    similarities as weights. Pairs for which the embedding model raises
    KeyError are skipped. If no positive total similarity is found,
    `word_scores` is returned unchanged.

    Args:
        word_scores: dict keyed by the categories in `scope` (updated in place).
        word: The word to score.
        emfd_table: Word-indexed dictionary table; looked up when omitted.
        top_k: Number of nearest dictionary words to average over.

    Returns:
        The same `word_scores` dict.
    """
    if emfd_table is None:
        emfd_table = _emfd_by_word()

    sims = []
    for emfd_word in emfd_table.index:
        # Only real words can be looked up in the embedding model; a non-string
        # label (e.g. a missing value) must not be passed on as a key.
        if not isinstance(emfd_word, str):
            continue
        try:
            # Calculate similarity between the word and each word in the moral foundation dictionary
            sims.append((emfd_word, word2vec_model.similarity(word, emfd_word)))
        except KeyError:
            # One of the two words has no vector; skip this pair
            continue

    # Keep the most similar dictionary words
    nearest_words = sorted(sims, key=lambda pair: pair[1], reverse=True)[:top_k]
    total_sim_weight = sum(sim for _, sim in nearest_words)

    # If total_sim_weight > 0, then there are valid similarity scores.
    if total_sim_weight > 0:
        for neighbour, sim in nearest_words:
            for moral_foundation_category in scope:
                word_scores[moral_foundation_category] += (
                    emfd_table.loc[neighbour, moral_foundation_category] * sim
                ) / total_sim_weight

    return word_scores

# Builds one row per (topic, top word) pair: the word's contribution to the
# topic followed by its moral foundation scores from get_emfd_scores.

def find_word_emfd_scores_df(topic_top_words_and_contributions):
    """Flatten per-topic (word, contribution) lists into a scored DataFrame.

    Args:
        topic_top_words_and_contributions: One list of (word, contribution)
            pairs per topic; the list position is used as the topic number.

    Returns:
        DataFrame with columns 'Topic', 'Word', 'Contribution' followed by the
        keys returned by get_emfd_scores.
    """
    records = [
        {
            'Topic': topic_number,
            'Word': top_word,
            'Contribution': share,
            **get_emfd_scores(top_word),
        }
        for topic_number, pairs in enumerate(topic_top_words_and_contributions)
        for top_word, share in pairs
    ]
    return pd.DataFrame(records)

In [None]:
# Save LDA model
def save_LDA_model(model, year):
    """Pickle `model` to `project_folder/models/{year}_lda_model_gensim.pkl`.

    The `models/` directory is created when it does not exist yet, so the
    first save into a fresh project folder does not fail.
    """
    model_dir = project_folder + 'models/'
    os.makedirs(model_dir, exist_ok=True)
    with open(model_dir + f'{year}_lda_model_gensim.pkl', 'wb') as file:
        pickle.dump(model, file, protocol=pickle.HIGHEST_PROTOCOL)

def save_doc_topic_distribution_df(doc_topic_dist_df, year):
    """Write the frame to `doc_topic_dist_{year}.csv` in dataset_folder, without the index."""
    csv_path = dataset_folder + f'doc_topic_dist_{year}.csv'
    doc_topic_dist_df.to_csv(csv_path, index=False)

def save_doc_mf_scores_df(doc_mf_scores_df, year):
    """Write the frame to `doc_mf_scores_{year}.csv` in dataset_folder, without the index."""
    csv_path = dataset_folder + f'doc_mf_scores_{year}.csv'
    doc_mf_scores_df.to_csv(csv_path, index=False)

In [None]:
# Collapses the per-word scores into one row of foundation scores per topic.
def get_topic_scores(word_scores_df):
    """Compute contribution-weighted foundation scores for each topic.

    Args:
        word_scores_df: Frame with a 'Topic' column, a 'Contribution' column
            and one column per category in `scope`.

    Returns:
        DataFrame with one row per distinct topic (in order of first
        appearance) and one column per category in `scope`; each cell is the
        average of that category weighted by 'Contribution'. The index is a
        fresh 0..n-1 range.
    """
    def _weighted_row(topic_rows):
        # One weighted average per foundation column for a single topic
        return {
            category: np.average(topic_rows[category], weights=topic_rows['Contribution'])
            for category in scope
        }

    per_topic_rows = [
        _weighted_row(word_scores_df[word_scores_df['Topic'] == topic_id])
        for topic_id in word_scores_df['Topic'].unique()
    ]

    return pd.DataFrame(per_topic_rows).reset_index(drop=True)

In [None]:
def get_topic_dist_df(bow, lda_model=None, data_year=None):
    """Build the document x topic probability table for a corpus.

    Args:
        bow: Iterable of bag-of-words documents.
        lda_model: Model providing `num_topics` and `get_document_topics`.
            Defaults to the module-level `model`.
        data_year: Year used to locate
            `combined_data_preprocessed_{year}_lemma.csv`. Defaults to the
            module-level `year`.

    Returns:
        DataFrame with one integer-named column per topic (0..num_topics-1),
        a 'Filename' column holding the document ids read from the CSV, and a
        'Topic_Distribution' column holding each document's largest topic
        probability.

    Raises:
        ValueError: if the id CSV does not have exactly one row per document
            in `bow` (ids would otherwise be silently misaligned).
    """
    # Fall back to the notebook globals so existing get_topic_dist_df(bow) calls keep working
    if lda_model is None:
        lda_model = model
    if data_year is None:
        data_year = year

    num_topics = lda_model.num_topics

    # Place each probability in the column of its topic id. The sparse output
    # may omit topics, so filling by position would shift the columns.
    topic_matrix = []
    for doc in bow:
        row = [0.0] * num_topics
        for topic_id, prob in lda_model.get_document_topics(doc, minimum_probability=0):
            row[topic_id] = prob
        topic_matrix.append(row)

    doc_topic_dist_df = pd.DataFrame(topic_matrix, columns=range(num_topics))

    # Document ids, in corpus order
    doc_ids = pd.read_csv(dataset_folder+f'combined_data_preprocessed_{data_year}_lemma.csv', usecols=['id'])
    if len(doc_ids) != len(doc_topic_dist_df):
        raise ValueError(
            f"{len(doc_ids)} document ids for {len(doc_topic_dist_df)} documents in year {data_year}"
        )

    # Take the maximum over ALL topic columns, before any non-topic column is added
    dominant_topic_prob = doc_topic_dist_df.max(axis=1)

    doc_topic_dist_df['Filename'] = doc_ids['id']
    doc_topic_dist_df['Topic_Distribution'] = dominant_topic_prob

    return doc_topic_dist_df

In [None]:
def create_doc_mf_scores(doc_topic_dist_df, topic_scores):
    """Compute per-document moral foundation scores and attach outlet labels.

    A document's score for a category is the sum over topics of
    (document's probability for the topic) x (topic's score for the category).

    Args:
        doc_topic_dist_df: Frame with one column per topic, named by the
            integers 0..n_topics-1, plus a 'Filename' column. Any other
            columns are ignored.
        topic_scores: Frame with one row per topic (index 0..n_topics-1) and
            one column per category in `scope`.

    Returns:
        DataFrame with columns 'Filename', the categories in `scope`,
        'outlet' (the part of 'Filename' before the first '--') and 'label'
        (from get_label).
    """
    # Select the topic columns explicitly by label instead of slicing by
    # position, so extra trailing columns cannot leak into the computation.
    topic_ids = list(range(len(topic_scores)))
    topic_probs = doc_topic_dist_df[topic_ids].to_numpy(dtype=float)
    foundation_weights = topic_scores.loc[topic_ids, scope].to_numpy(dtype=float)

    # (n_docs x n_topics) @ (n_topics x n_categories): all weighted sums in one
    # product, replacing a Python loop over documents, categories and topics
    doc_scores = topic_probs @ foundation_weights

    doc_mf_scores_df = pd.DataFrame(doc_scores, columns=scope)
    doc_mf_scores_df.insert(0, 'Filename', doc_topic_dist_df['Filename'].to_numpy())

    doc_mf_scores_df['outlet'] = doc_mf_scores_df['Filename'].str.split('--').str[0]
    doc_mf_scores_df['label'] = doc_mf_scores_df.apply(get_label, axis=1)

    return doc_mf_scores_df

# NOTE(review): same name and behavior as the save_doc_mf_scores_df defined in
# the earlier "Save LDA model" cell; this later definition is the one in effect.
def save_doc_mf_scores_df(doc_mf_scores_df, year):
    """Write the frame to `doc_mf_scores_{year}.csv` in dataset_folder, without the index."""
    target_csv = dataset_folder + f'doc_mf_scores_{year}.csv'
    doc_mf_scores_df.to_csv(target_csv, index=False)

def get_label(x):
    """Look up the bias label of the outlet named in `x['outlet']`.

    Searches the module-level `labels` table for rows whose 'source' equals
    the outlet and returns the 'bias' of the first match, or None when no row
    matches.
    """
    is_outlet = labels['source'] == x['outlet']
    matching_bias = labels.loc[is_outlet, 'bias'].values
    if len(matching_bias) == 0:
        return None
    return matching_bias[0]

In [None]:
# NOTE(review): same name and behavior as the open_dict defined next to
# open_bow in an earlier cell; this later definition is the one in effect.
def open_dict(year):
    """Unpickle and return `step3_dictionary_{year}.pkl` from dataset_folder."""
    pickle_path = dataset_folder + f'step3_dictionary_{year}.pkl'
    with open(pickle_path, 'rb') as source:
        return pickle.load(source)

In [None]:
# Driver: for each year, load (or train) the seeded LDA model, score its topics
# on the moral foundations, and derive per-document foundation scores.

# Overrides the `years` list declared with the labels; only 2019 is processed here.
years = ['2019']

# Seed words -> topic indices, loaded from the pickle cache or built on first use
emfd_priori = open_emfd_priori()
print(emfd_priori)
# NOTE(review): re-indexing emfd by word is disabled on the next line; verify
# that emfd is indexed the way the scoring helpers expect before trusting
# word_scores_df.
# emfd.set_index('word', inplace=True)

# NOTE: the loop names `year` and `model` must not be renamed:
# get_topic_dist_df(bow) picks both up from module scope.
for year in years:
    dictionary = open_dict(year)
    # ntopics=10 matches the ten entries of `scope`, whose positions are the
    # topic indices stored in emfd_priori
    eta = create_eta(emfd_priori, dictionary, ntopics=10)
    bow = open_bow(year)

    # Initialize and train the model
    # (returns a cached pickle instead of training when one exists)

    model = initialize_guidedlda_v2(eta, dictionary, bow, 10, year)

    # save_LDA_model(model, year)

    # Top words of each topic with their share of the topic's probability mass
    topic_words_and_contributions = get_topic_words_and_contributions(model, dictionary)

    # One row per (topic, word) with the word's moral foundation scores
    word_scores_df = find_word_emfd_scores_df(topic_words_and_contributions)

    # Get topic scores
    topic_scores = get_topic_scores(word_scores_df)

    # Per-document topic probabilities plus the document ids
    doc_topic_dist_df = get_topic_dist_df(bow)
    # save_doc_topic_distribution_df(doc_topic_dist_df, year)

    # Create a dataframe of the moral foundation scores for each document
    doc_mf_scores_df = create_doc_mf_scores(doc_topic_dist_df, topic_scores)
    # NOTE: with saving disabled, results live only in these variables and are
    # overwritten on the next loop iteration.
    # save_doc_mf_scores_df(doc_mf_scores_df, year)