In [None]:
# Connecting Google Drive
# Mounts Google Drive at /content/drive; the project and dataset paths
# configured later in this notebook live under /content/drive/MyDrive.
# The google.colab module is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Importing all libraries
import pandas as pd
from tqdm import tqdm
from nltk.corpus import words
import pickle
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Root of the project on the mounted Google Drive, and the datasets
# sub-folder inside it. Both keep a trailing slash because the rest of the
# notebook builds file paths by plain string concatenation.
project_folder = "/content/drive/MyDrive/2024SUDSProject/"
dataset_folder = project_folder + "datasets/"

In [None]:
def save_tokens(year, all_tokens):
    """Pickle ``all_tokens`` to ``step3_all_tokens_{year}.pkl``.

    Args:
        year: Label interpolated into the file name.
        all_tokens: Object to serialize.

    The file is written inside ``dataset_folder`` and overwritten if it
    already exists.
    """
    target_path = dataset_folder + f'step3_all_tokens_{year}.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(all_tokens, handle)

In [None]:
def save_seed_topics(year, seed_topics):
    """Pickle ``seed_topics`` to ``step3_seed_topics_{year}.pkl``.

    Args:
        year: Label interpolated into the file name.
        seed_topics: Object to serialize.

    The file is written inside ``dataset_folder`` and overwritten if it
    already exists.
    """
    target_path = dataset_folder + f'step3_seed_topics_{year}.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(seed_topics, handle)

In [None]:
def save_vectorizer(year, vectorizer):
    """Pickle ``vectorizer`` to ``step3_vectorizer_{year}.pkl``.

    Args:
        year: Label interpolated into the file name.
        vectorizer: Object to serialize.

    The file is written inside ``dataset_folder`` and overwritten if it
    already exists.
    """
    target_path = dataset_folder + f'step3_vectorizer_{year}.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(vectorizer, handle)

In [None]:
def save_data_vectorized(year, data_vectorized):
    """Pickle ``data_vectorized`` to ``step3_data_vectorized_{year}.pkl``.

    Args:
        year: Label interpolated into the file name.
        data_vectorized: Object to serialize.

    The file is written inside ``dataset_folder`` and overwritten if it
    already exists.
    """
    target_path = dataset_folder + f'step3_data_vectorized_{year}.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(data_vectorized, handle)

In [None]:
def save_dictionary(year, dictionary):
    """Pickle ``dictionary`` to ``step3_dictionary_{year}.pkl``.

    Args:
        year: Label interpolated into the file name.
        dictionary: Object to serialize.

    The file is written inside ``dataset_folder`` and overwritten if it
    already exists; ``open_dict`` reads the same file name back.
    """
    target_path = dataset_folder + f'step3_dictionary_{year}.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(dictionary, handle)

In [None]:
def save_bow(year, bow):
    """Pickle ``bow`` (the bag of words) to ``step3_bow_{year}.pkl``.

    Args:
        year: Label interpolated into the file name.
        bow: Object to serialize.

    The file is written inside ``dataset_folder`` and overwritten if it
    already exists.
    """
    target_path = dataset_folder + f'step3_bow_{year}.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(bow, handle)

In [None]:
# Load the Moral Foundation Dictionary. Per the original author's note it
# holds words together with their scores and foundation labels.
emfd = pd.read_csv(dataset_folder + 'emfd_amp.csv')

# Distinct labels from the "foundation" column (e.g. care.virtue, care.vice,
# loyalty.vice). Author's note: 5 moral foundation categories, each with a
# virtue and a vice branch.
unique_foundations = emfd["foundation"].unique().tolist()

# Maps each foundation label to its seed words: for every column whose name
# contains the label, the 10 rows with the largest value in that column
# contribute their 'word'. Words can repeat when several columns match.
# TODO (from original author): consider including more words to improve accuracy?
moral_foundation_seed_dict = {}

for foundation in unique_foundations:
    # Substring match against the column names, not an exact match.
    columns = [name for name in emfd.columns if foundation in name]
    top_words = [
        word
        for col in columns
        for word in emfd.nlargest(10, col)['word'].tolist()
    ]
    moral_foundation_seed_dict[foundation] = top_words


In [None]:
def create_id2word(all_tokens):
    """Build a gensim ``corpora.Dictionary`` mapping words to integer ids.

    Args:
        all_tokens: Iterable of tokenized documents; it is passed straight
            to ``corpora.Dictionary``.

    Returns:
        The resulting dictionary. A bag-of-words corpus is not built here;
        ``create_bow`` does that from the same dictionary.
    """
    return corpora.Dictionary(all_tokens)

def create_bow(dictionary, all_tokens):
    """Convert every tokenized document into its bag-of-words form.

    Args:
        dictionary: Object exposing ``doc2bow``.
        all_tokens: Iterable of tokenized documents; consumed once.

    Returns:
        A list holding ``dictionary.doc2bow(text)`` for each document, in
        input order. Progress is reported through tqdm.
    """
    progress = tqdm(all_tokens, desc="Creating Bag of Words")
    bow = []
    for text in progress:
        bow.append(dictionary.doc2bow(text))
    return bow

In [None]:
def create_vectorizers(docs_strings):
    """Fit a ``CountVectorizer`` on the documents and vectorize them.

    Args:
        docs_strings: Iterable of documents, one string each.

    Returns:
        ``(vectorizer, data_vectorized)``: the fitted vectorizer and the
        result of ``fit_transform`` on the documents.
    """
    # Word-level analysis; lowercase=False keeps the original casing.
    vectorizer = CountVectorizer(lowercase=False, analyzer='word')
    progress = tqdm(docs_strings, desc="Vectorizing")
    doc_term_matrix = vectorizer.fit_transform(progress)
    return vectorizer, doc_term_matrix

def create_seed_topics(id2word):
    """Map seed-word ids to the index of their moral foundation.

    Walks ``moral_foundation_seed_dict`` in insertion order; the position of
    each foundation in that order is used as its topic id. Seed words that
    are absent from ``id2word.token2id`` are skipped.

    Args:
        id2word: Object exposing a ``token2id`` mapping (word -> integer id).

    Returns:
        dict of ``{word_id: topic_id}``. If a word is a seed for more than
        one foundation, the foundation visited last wins.

    NOTE(review): the ids come from ``id2word``; they are not guaranteed to
    match the column indices of the ``CountVectorizer`` matrix built by
    ``create_vectorizers`` -- confirm which id space the downstream model
    expects.
    """
    seed_topics = {}
    # Wrap the sized values view (not enumerate) so tqdm can display a total.
    foundations = tqdm(moral_foundation_seed_dict.values(), desc="Seed Topics")
    for topic_id, seed_words in enumerate(foundations):
        for word in seed_words:
            # .get returns None for words missing from the dictionary.
            word_id = id2word.token2id.get(word)
            if word_id is not None:
                seed_topics[word_id] = topic_id

    return seed_topics

def open_dict(year):
    """Load the dictionary pickled as ``step3_dictionary_{year}.pkl``.

    Args:
        year: Label interpolated into the file name.

    Returns:
        The unpickled object read from ``dataset_folder``.
    """
    source_path = dataset_folder + f'step3_dictionary_{year}.pkl'
    # pickle can execute code on load; only read files this project wrote.
    with open(source_path, 'rb') as handle:
        return pickle.load(handle)


In [None]:
# Dataset suffixes to process. Swap in one of the lists below for a full run:
# years = ['2017', '2018', '2019', '2020', '2021', '2022']
# years = ['2019', '2020', '2021', '2022']
years = ['sample']


def doc_generator(docs):
    """Lazily yield each document as a list of whitespace-separated tokens.

    Generating the token lists on demand avoids holding every tokenized
    document in memory at once.
    """
    for doc in docs:
        yield str(doc).split()


# For every dataset: build and pickle the gensim dictionary, the bag of
# words, the seed topics, and the CountVectorizer with its matrix.
for year in years:
    df = pd.read_csv(dataset_folder + f'combined_data_preprocessed_{year}_lemma.csv')

    # One string per row, taken straight from the column (no iterrows, which
    # is slow and may upcast dtypes). Cells that read_csv parsed as missing
    # become empty documents instead of the literal token 'nan'.
    doc_strings = ["" if pd.isna(doc) else str(doc) for doc in df['content']]

    # The generator is single-use, so a fresh one is created for each pass.
    id2word = create_id2word(doc_generator(doc_strings))

    bow = create_bow(id2word, doc_generator(doc_strings))

    save_dictionary(year, id2word)
    save_bow(year, bow)

    seed_topics = create_seed_topics(id2word)

    vectorizer, data_vectorized = create_vectorizers(doc_strings)

    save_seed_topics(year, seed_topics)
    save_data_vectorized(year, data_vectorized)
    save_vectorizer(year, vectorizer)
    # save_tokens(year, all_tokens) removed, b/c not memory efficient
