In [None]:
# Make Google Drive reachable from this Colab runtime
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import re
from collections import defaultdict

In [None]:
# Locations on the mounted Drive: project outputs and shared input datasets.
# Both keep their trailing '/', because later cells build paths by plain string concatenation.
_drive_project_root = "/content/drive/MyDrive/2024SUDSProject/"
project_folder = _drive_project_root + "Honor/"
dataset_folder = _drive_project_root + "datasets/"

In [None]:
# Read the honor dictionary file as a list of raw lines (line endings kept)
honor_dict_path = dataset_folder + 'Honor Dictionary English_2017.dic'
with open(honor_dict_path, 'r', encoding='utf-8') as dict_file:
    honor_dict_lines = list(dict_file)

# Table that maps each source to its Media Bias/Fact Check label
labels_path = dataset_folder + 'labels.csv'
labels_df = pd.read_csv(labels_path)

# Register tqdm's progress_apply on pandas objects
tqdm.pandas()

In [None]:
# Declaring labels and variables

# Define categories for analysis.
# count_honor_words() produces one score per entry of this list.
all_categories = [
    'Overall Honor Dictionary', 'Honor Gain', 'Honor gain: Virtuous Behavior',
    'Virtuous Behavior: Moral Integrity', 'Virtuous Behavior: Faith',
    'Virtuous Behavior: Manners', 'Honor Gain: Achieve/Status',
    'A/S: Achieve', 'A/S: Status', 'Honor Protection', 'Honor Protection: Public Image',
    'Honor Protection: Strength/Bravery', 'Honor Protection: Prevention',
    'Honor Protection: Sex', 'Honor Loss', 'Honor Loss: Wrongdoing',
    'Honor Loss: Harm', 'Honor Loss: Aggression', 'Honor Contexts',
    'Honor Contexts: Self', 'Honor Contexts: Military', 'Honor Contexts: Business',
    'Honor Contexts: Ethnic', 'Honor Contexts: Social', 'Honor Contexts: Gender',
    'Honor Contexts: Family', 'Honor Contexts: Government', 'General Honor'
]

# Order in which the political-leaning labels are drawn on the x axis.
# label_political_leaning_dataframe() replaces every '-' in a label with '_',
# so every entry here must use underscores (hence 'pro_science', not
# 'pro-science'); a hyphenated entry could never match a normalized label.
label_order = [
    'left', 'left_center', 'center', 'right_center', 'right',
    'conspiracy_pseudoscience', 'questionable_source', 'satire', 'pro_science'
]

years = ['2017', '2018', '2019', '2020', '2021', '2022']

In [None]:
# Pass 1: category definitions.
# Tab-separated lines whose first field is all digits map a category number
# to its name; lines starting with '%' are skipped.
category_definitions = {}
for raw_line in honor_dict_lines:
    fields = raw_line.split('\t')
    if not raw_line.startswith('%') and fields[0].isdigit():
        category_definitions[fields[0]] = fields[1].strip()

# Pass 2: dictionary entries.
# Every remaining non-'%' line is "<entry>\t<number>\t<number>...". Any '*' is
# removed from the entry, and numbers without a known definition are ignored.
honor_words = {}
for raw_line in honor_dict_lines:
    fields = raw_line.split('\t')
    if raw_line.startswith('%') or fields[0].isdigit():
        continue
    entry = fields[0].strip().replace('*', '')
    honor_words[entry] = [
        category_definitions[code.strip()]
        for code in fields[1:]
        if code.strip() in category_definitions
    ]

In [None]:
from collections import Counter
def count_honor_words(article):
    """Score one article against every honor category.

    The article is split on whitespace. Each distinct token found in
    ``honor_words`` adds its relative frequency (occurrences divided by the
    total number of tokens) to every category listed for that token.

    Parameters
    ----------
    article : str or any
        Article text. Any non-string value (e.g. NaN) yields all-zero scores.

    Returns
    -------
    pandas.Series
        Indexed by the names in ``all_categories``.
    """
    scores = dict.fromkeys(all_categories, 0)

    # Missing / non-text content: nothing to count
    if not isinstance(article, str):
        return pd.Series(scores)

    token_counts = Counter(article.split())
    n_tokens = sum(token_counts.values(), 0.0)  # float start value -> true division below

    for token, occurrences in token_counts.items():
        matched_categories = honor_words.get(token)
        if matched_categories is None:
            continue
        share = occurrences / n_tokens
        for category in matched_categories:
            scores[category] += share

    return pd.Series(scores)


In [None]:
def label_political_leaning_dataframe(temp_df):
    """Attach a normalized political-leaning label to every row, in place.

    Steps, all applied to the DataFrame object that was passed in:

    1. Rename the ``file_name`` column to ``outlet``.
    2. Look each outlet up in ``labels_df`` (``source (Master List)`` ->
       ``Media Bias/Fact Check Label``) and store the result in ``label``.
    3. Drop rows whose outlet has no label.
    4. Normalize the label text (strip the ``-bias`` suffixes handled below and
       replace every remaining ``-`` with ``_``).

    The frame is modified in place, so callers that ignore the return value
    still see the result. A previous version rebound the local name on
    ``dropna`` and so lost steps 3 and 4.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Must contain a ``file_name`` (or already-renamed ``outlet``) column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenient chaining.
    """
    temp_df.rename(columns={'file_name': 'outlet'}, inplace=True)  # Rename file_name column to outlet

    outlet_to_label = labels_df.set_index('source (Master List)')['Media Bias/Fact Check Label']
    temp_df['label'] = temp_df['outlet'].map(outlet_to_label)

    # Drop rows where label is NaN -- in place, so the caller's frame is affected
    temp_df.dropna(subset=['label'], inplace=True)

    # regex=False: every pattern is a plain substring, independent of the
    # pandas-version-specific default for `regex`.
    # NOTE(review): 'left-center' and 'right-center-bias' are handled
    # asymmetrically, as in the original -- confirm against the raw label values.
    temp_df['label'] = (
        temp_df['label']
        .str.replace('left-bias', 'left', regex=False)
        .str.replace('right-bias', 'right', regex=False)
        .str.replace('left-center', 'left_center', regex=False)
        .str.replace('right-center-bias', 'right_center', regex=False)
        .str.replace('-', '_', regex=False)
    )

    return temp_df


In [None]:
def visualize_data(temp_df, year, save_dir=None):
    """Draw one bar chart per honor category, grouped by political-leaning label.

    Each chart is a seaborn ``catplot`` of the category column against
    ``label``, with bars ordered by ``label_order`` and 95% confidence-interval
    error bars. Every figure is shown and then closed, so repeated calls do not
    accumulate open matplotlib figures.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Must contain a ``label`` column and one column per entry of
        ``all_categories``.
    year : str
        Used in the figure title (and in the file name when saving).
    save_dir : str or path-like, optional
        When given, each figure is also written to this existing directory as
        ``{year}_{category}.png``. By default nothing is written to disk.
    """
    import os  # local import: used only for the optional save path

    for category in all_categories:
        grid = sns.catplot(x='label', y=category, data=temp_df, kind='bar',
                           order=label_order, errorbar=('ci', 95))
        plt.title(f'{year} {category}')
        plt.xticks(rotation=90)

        if save_dir is not None:
            # Category names contain '/' and ':', which are not safe in file names
            category_safe = category.replace('/', '-').replace(':', '-')
            grid.savefig(os.path.join(save_dir, f'{year}_{category_safe}.png'))

        plt.show()
        # Release the figure; one is created per category on every call
        plt.close(grid.figure)

In [None]:
# Score every preprocessed article, year by year, and write one CSV per year.
# Note: this reassigns `years`, replacing the longer list declared earlier.
years = ['2019', '2020', '2021', '2022']
for year in years:
    input_csv = dataset_folder + f'combined_data_preprocessed_{year}_stem.csv'
    output_csv = dataset_folder + f'honor_score_{year}.csv'

    temp_df = pd.read_csv(input_csv)

    # One row of per-category honor scores for each article's text
    honor_score_series = temp_df['content'].progress_apply(count_honor_words)

    # Keep only the identifying columns next to the scores
    temp_df = pd.concat([temp_df[['file_name', 'id']], honor_score_series], axis=1)

    # Called for its effect on temp_df; the return value is not used
    label_political_leaning_dataframe(temp_df)

    # visualize_data(temp_df, year)

    # Persist the scored frame without the index column
    temp_df.to_csv(output_csv, index=False)