In [None]:
# Mount Google Drive at /content/drive; the project and dataset folders
# defined later in this notebook are paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install sentence_transformers

import pandas as pd
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer, util
from transformers import BertModel, BertTokenizer
import torch
import matplotlib.pyplot as plt
from scipy.stats import t, sem

In [None]:
# Project folder and dataset folder on the mounted Drive
project_folder = "/content/drive/MyDrive/2024SUDSProject/Threat/"
dataset_folder = "/content/drive/MyDrive/2024SUDSProject/processedNewsData/"

# Load combined_honor_score.csv from the dataset folder
df = pd.read_csv(f"{dataset_folder}combined_honor_score.csv")

# Preview the first rows of the loaded table
print(df.head())

In [None]:
# Merge datasets and process labels
def process_labels(tokens_full, orientations):
    """Attach a media-bias label to every article row.

    Parameters
    ----------
    tokens_full : pandas.DataFrame
        Article table. Must provide ``content`` and ``id`` columns plus the
        outlet name, either as ``file_name`` (renamed to ``outlet`` here) or
        already as ``outlet``.
    orientations : pandas.DataFrame
        Label table with the columns ``source (Master List)`` (join key) and
        ``Media Bias/Fact Check Label``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``outlet``, ``content``, ``label`` and
        ``id``. Rows whose outlet has no match in ``orientations`` get the
        label ``'Unknown'``.

    Notes
    -----
    Neither input frame is modified; every step builds a new object, which
    also avoids chained in-place assignment on a column slice.
    """
    articles = tokens_full.rename(columns={'file_name': 'outlet'})

    # Left join keeps every article, even when its outlet has no known label.
    merged = pd.merge(
        articles,
        orientations,
        left_on='outlet',
        right_on='source (Master List)',
        how='left',
    )

    tokens = merged[['outlet', 'content', 'Media Bias/Fact Check Label', 'id']].rename(
        columns={'Media Bias/Fact Check Label': 'label'}
    )
    # Plain assignment instead of fillna(inplace=True) on a selected column,
    # so the fill is guaranteed to land in the returned frame.
    tokens['label'] = tokens['label'].fillna('Unknown')
    return tokens


In [None]:
# Extract top words and scores from EMFD
def get_top_words_scores(emfd):
    """Collect, per foundation score column, the words with a non-zero score.

    Parameters
    ----------
    emfd : pandas.DataFrame
        Lexicon table containing ``word``, the ten ``<foundation>.virtue`` /
        ``<foundation>.vice`` score columns and ``foundation``.

    Returns
    -------
    dict
        Maps each score column name to a list of ``(word, score)`` tuples for
        the rows where that column is not equal to zero.
    """
    wanted_columns = [
        'word',
        'care.virtue', 'fairness.virtue', 'loyalty.virtue', 'authority.virtue', 'sanctity.virtue',
        'care.vice', 'fairness.vice', 'loyalty.vice', 'authority.vice', 'sanctity.vice',
        'foundation',
    ]
    scoped = emfd[wanted_columns]

    def nonzero_pairs(score_column):
        # Keep only the rows where this particular score is non-zero.
        mask = scoped[score_column] != 0
        kept = scoped.loc[mask, ['word', score_column]]
        return list(zip(kept['word'], kept[score_column]))

    # Skip the first ('word') and last ('foundation') columns: only the
    # score columns in between become dictionary keys.
    return {name: nonzero_pairs(name) for name in scoped.columns[1:-1]}


In [None]:
# Load threat words from file
def load_threat_words(filepath):
    """Read a threat word list from a text file with one entry per line.

    Parameters
    ----------
    filepath : str or path-like
        Location of the word list.

    Returns
    -------
    list of str
        Entries in file order with surrounding whitespace removed. Lines that
        are empty after stripping are skipped so that the result never
        contains ``''`` as a "word".
    """
    # Explicit encoding keeps the result independent of the platform default.
    with open(filepath, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [word for word in stripped_lines if word]


In [None]:
# Get threat embeddings using BERT
def get_threat_embeddings(threats):
    """Embed each threat entry with ``bert-base-uncased`` using mean pooling.

    Parameters
    ----------
    threats : list of str
        Texts to embed. They are tokenized as one padded batch.

    Returns
    -------
    numpy.ndarray
        One row per entry: the mean of the last-layer hidden states over the
        entry's real tokens. An empty list/tuple yields an array with zero
        rows and ``model.config.hidden_size`` columns.

    Notes
    -----
    The batch is padded to a common length, so a plain mean over the sequence
    axis would also average the vectors at padding positions. The attention
    mask is used to exclude those positions from the mean.
    """
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Nothing to tokenize: return an empty, correctly shaped result.
    if isinstance(threats, (list, tuple)) and len(threats) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    encoded = tokenizer(threats, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # 1 for real tokens, 0 for padding; trailing axis added so it
        # broadcasts against the hidden dimension.
        mask = encoded['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden_states * mask).sum(dim=1) / token_counts
    return pooled.numpy()


In [None]:
# Load datasets
def load_data():
    """Load the shared inputs stored under ``dataset_folder``.

    Reads ``labels.csv`` and ``emfd_amp.csv``, derives the per-column word
    scores from the eMFD table via ``get_top_words_scores`` and reads
    ``threat_corpus.txt`` via ``load_threat_words``. Per-year article data is
    not loaded here.

    Returns
    -------
    tuple
        ``(orientations, emfd, top_words_scores_dict, threats)``.
    """
    labels_path = f"{dataset_folder}labels.csv"
    emfd_path = f"{dataset_folder}emfd_amp.csv"
    threats_path = f"{dataset_folder}threat_corpus.txt"

    orientations = pd.read_csv(labels_path)
    emfd = pd.read_csv(emfd_path)
    top_words_scores_dict = get_top_words_scores(emfd)
    threats = load_threat_words(threats_path)

    return orientations, emfd, top_words_scores_dict, threats


In [None]:
# Group processed text by media bias label
def group_text_by_label(tokens):
    """Collect article contents into one list per media-bias label.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Must contain ``content`` and a label column. The label column may be
        the raw ``Media Bias/Fact Check Label`` or ``label``, the name that
        ``process_labels`` gives it; the raw name wins when both exist.

    Returns
    -------
    pandas.Series
        Indexed by label, each value a list of ``content`` entries. The
        ``'Unknown'`` label is excluded.
    """
    raw_label_column = 'Media Bias/Fact Check Label'
    # process_labels renames the raw column to 'label'; accept both so this
    # function works on its output as well as on an unrenamed frame.
    label_column = raw_label_column if raw_label_column in tokens.columns else 'label'

    grouped = tokens.groupby(label_column)['content'].apply(list)
    return grouped[grouped.index != 'Unknown']


In [None]:
# Process string into a list of words
def process_string(input_string):
    """Convert ``input_string`` to ``str`` and split it on runs of whitespace.

    Returns the resulting list of words (empty for an empty or
    whitespace-only string).
    """
    text = str(input_string)
    words = text.split()
    return words


In [None]:
from collections import Counter

# Compute, for each article, the proportion of its words that appear in a target word list.
# TODO: look at the same frequency measure for the moral-foundation word lists.

def get_overall_proportion(target_list, df):
    """Score each row by the share of its words that belong to ``target_list``.

    Parameters
    ----------
    target_list : iterable of str
        Words to look for (matched exactly against whitespace-split tokens).
    df : pandas.DataFrame
        Must contain a ``content`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with two columns added in place:
        ``threat_proportional_score`` (target-word occurrences divided by the
        row's word count, 0 for rows without words) and ``total_word_count``.

    Notes
    -----
    Missing ``content`` values are treated as empty text. Without that,
    converting them to ``str`` would turn them into a literal token such as
    ``'nan'`` and the row would be counted as a one-word article.
    """
    # Set gives constant-time membership checks.
    target_set = set(target_list)

    contents = df['content']
    missing = contents.isna()
    texts = contents.astype(str).where(~missing, '')

    threat_proportional_scores = []
    total_word_counts = []

    for text in texts:
        counter = Counter(text.split())
        total_words = sum(counter.values())

        # Walk the row's distinct words rather than the whole target set.
        target_occurrences = sum(
            count for word, count in counter.items() if word in target_set
        )

        # Rows without any words get a proportion of 0 instead of dividing by zero.
        proportion = target_occurrences / total_words if total_words > 0 else 0

        threat_proportional_scores.append(proportion)
        total_word_counts.append(total_words)

    df['threat_proportional_score'] = threat_proportional_scores
    df['total_word_count'] = total_word_counts

    return df


In [None]:
# Visualize data
def calculate_average_and_ci(data):
    """Return the mean of ``data`` and its 95% Student-t confidence interval.

    The mean and the standard error are taken along axis 0, and the interval
    uses ``len(data) - 1`` degrees of freedom.

    Returns
    -------
    tuple
        ``(means, (lower, upper))``.
    """
    center = np.mean(data, axis=0)
    standard_error = sem(data, axis=0)
    degrees_of_freedom = len(data) - 1
    bounds = t.interval(0.95, degrees_of_freedom, loc=center, scale=standard_error)
    return center, bounds

def visualize_data(result_dict, label_mapping, year):
    """Draw and save one bar chart with error bars per entry of ``result_dict``.

    Parameters
    ----------
    result_dict : dict
        Maps a graph key to an inner dict. Each inner dict maps a label
        (convertible with ``int``) to the values summarised by
        ``calculate_average_and_ci``.
    label_mapping : mapping
        Translates ``int(label)`` into the text shown on the x axis.
    year : str or int
        Used only in the output file name.

    Each figure is written to
    ``<project_folder>figures/threat_<graph_key>_distribution_<year>.png``,
    shown, and then closed. The output folder is created if it is missing.
    """
    import os  # local import: keeps this cell independent of the import cell

    for graph_key, smaller_dict in result_dict.items():
        x_labels = list(smaller_dict.keys())
        averages = []
        conf_intervals = []
        for values in smaller_dict.values():
            average, ci = calculate_average_and_ci(values)
            averages.append(average)
            # Half the interval width is the symmetric error-bar length.
            conf_intervals.append(np.diff(ci) / 2)
        mapped_x_labels = [label_mapping[int(label)] for label in x_labels]

        fig, ax = plt.subplots()
        ax.bar(mapped_x_labels, averages, yerr=np.array(conf_intervals).T, capsize=5, align='center', alpha=0.7)
        ax.set_xlabel('Mapped Values')
        ax.set_ylabel('Average')
        ax.set_title(f'Bar Plot for Graph: {graph_key}')

        figure_path = project_folder + f'figures/threat_{graph_key}_distribution_{year}.png'
        figure_dir = os.path.dirname(figure_path)
        if figure_dir:
            # savefig does not create missing folders on its own.
            os.makedirs(figure_dir, exist_ok=True)
        fig.savefig(figure_path)

        plt.show()
        # Release the figure so repeated iterations do not accumulate open figures.
        plt.close(fig)


In [None]:
def label_to_text_dict_creator(dataframe):
    """Map each index entry to the whitespace-split words of its value.

    Despite the parameter name, the ``.str`` accessor used here means the
    argument has to be a pandas Series. Every value is converted to ``str``
    before it is split.

    Returns
    -------
    dict
        ``{index_value: [word, ...]}``.
    """
    as_text = dataframe.astype(str)
    word_lists = as_text.str.split()
    return word_lists.to_dict()

In [None]:
# Main function

years = ['2019', '2020', '2021', '2022']


def _word_weighted_threat_share(scored_df, label):
    """Return threat-word occurrences divided by total words for one label.

    Each row's ``threat_proportional_score`` is a per-article proportion, so
    it is multiplied back by the row's ``total_word_count`` before summing.
    Returns 0 when the label has no words at all.
    """
    rows = scored_df[scored_df['label'] == label]
    total_words = rows['total_word_count'].sum()
    if total_words == 0:
        return 0
    threat_words = (rows['threat_proportional_score'] * rows['total_word_count']).sum()
    return threat_words / total_words


def main():
    """Score every year's articles for threat words and save the results.

    For each entry of ``years`` this reads
    ``combined_data_preprocessed_<year>_lemma.csv`` from ``dataset_folder``,
    labels and scores the articles, and writes two files to the same folder:

    * ``threat_data_score_<year>.csv``: the per-article scores.
    * ``orientation_threat_avg_<year>.csv``: one ``average_score`` per
      orientation label, computed as threat-word occurrences divided by the
      total word count of that label's articles.

    The per-label value is word-weighted. Dividing the plain sum of
    per-article proportions by the total word count would mix units and
    shrink the value as articles get longer.
    """
    # Only the label table and the threat word list are needed here.
    orientations, _emfd, _top_words_scores_dict, threats = load_data()

    orientation_labels = [
        'left', 'left_center', 'center', 'right_center', 'right',
        'pro-science', 'conspiracy_pseudoscience', 'questionable_source', 'satire',
    ]

    for year in years:
        year_dataset = pd.read_csv(dataset_folder+f'combined_data_preprocessed_{year}_lemma.csv')

        processed_df = process_labels(year_dataset, orientations)

        final_df = get_overall_proportion(threats, processed_df)

        final_df.to_csv(dataset_folder+f'threat_data_score_{year}.csv', index=True, header=True)

        orientation_averages = {
            label: _word_weighted_threat_share(final_df, label)
            for label in orientation_labels
        }

        averages_df = pd.DataFrame(list(orientation_averages.items()), columns=['label', 'average_score'])

        averages_df.to_csv(dataset_folder+f'orientation_threat_avg_{year}.csv', index=True, header=True)


if __name__ == "__main__":
    main()
