In [144]:
data_directory = "../data/"

In [145]:
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')  
nltk.download('omw-1.4')  
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


# remove stopwords, punctuation, and normalize the corpus
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def preprocess(transcript):
    stop_free = " ".join([i for i in transcript.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [146]:
import gensim
from gensim import corpora


def lsa_modeling(data, num_topics=10):
    dictionary = corpora.Dictionary(data)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in data]
    lsa = gensim.models.LsiModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary)
    return lsa

In [147]:
def lda_modeling(data, num_topics=10):
    dictionary = corpora.Dictionary(data)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in data]
    lda = gensim.models.LdaModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary, passes=15)
    return lda

In [148]:
def topic_modeling(transcript, num_topics=10):
    data = [transcript.split()]
    lsa = lsa_modeling(data, num_topics)
    lda = lda_modeling(data, num_topics)
    return lsa, lda

In [149]:
import json


def get_transcripts(file):
    with open(file, 'r') as f:
        data = json.load(f)
    transcript = data['video_transcript']
    return transcript

In [150]:
import os

for file in os.listdir(data_directory):
    if file.endswith(".json"):
        transcript = get_transcripts(data_directory + file)
        transcript = preprocess(transcript)
        print(transcript)
        lsi, lda = topic_modeling(transcript)
        video_id = file.split('.')[0]
        print(video_id)
        print(lsi.print_topics())

find president libel well see mean martha thanks eric decision were waiting content it 92 page long price judge engoron found former president donald trump libel case civil case money criminality rewind little bit here there lot different case keep track on case letitia james said running attorney general going get trump well play said decision in 92 page long james asked 370 million damage want prevented business new york state wait download get look number soon get it well tell is something former president watching closely layer judicial date appointment decision coming middle south carolina primary coming february 24 super tuesday interwoven former president trump want bring ari fleischer former white house press secretary fox news contributor shannon breen chief legal correspondent shannon let start you surprise obviously found libel judge found libel case began correct correct jetted bottom see could find there several big number there judge talk finding donald trump donald trump

In [155]:
from gensim.models import LsiModel
import pandas as pd

def get_comments_corpus(file, cluster):
    df = pd.read_csv(file)
    comments_corpus = []
    for index, row in df.iterrows():
        if row['cluster'] == cluster:
            comments_corpus.append(preprocess(row['comment']).split())  # Preprocess the comment and append to corpus
    return comments_corpus

cluster_directory = "../sentiment_data/sentiment_cluster/"
for file in os.listdir(cluster_directory):
    # if file.endswith("positive.csv") or file.endswith("negative.csv"):
    df = pd.read_csv(cluster_directory + file)
    for cluster in df['cluster'].unique():
        comments_corpus = get_comments_corpus(cluster_directory + file, cluster)
        
        # Create dictionary and corpus
        dictionary = corpora.Dictionary(comments_corpus)
        corpus = [dictionary.doc2bow(comment) for comment in comments_corpus]
        
        # Train LSI model
        lsi = LsiModel(corpus, id2word=dictionary, num_topics=1)  # Adjust num_topics as needed
        
        # Print topics for each cluster
        print("Topics for File:", file, "Cluster:", cluster)
        print(lsi.print_topics(num_words=5))
        

Topics for File: CNN-Judge orders Trump to pay $355 million in civil fraud trial_gemini_2_cluster_positive.csv Cluster: 0
[(0, '-0.562*"karma" + -0.275*"existence" + -0.275*"fate" + -0.141*"person" + -0.138*"hinduism"')]
Topics for File: CNN-Judge orders Trump to pay $355 million in civil fraud trial_gemini_2_cluster_positive.csv Cluster: 1
[(0, '-0.430*"world" + -0.410*"light" + -0.356*"god" + -0.248*"come" + -0.246*"son"')]
Topics for File: Fox News-Trump ordered to pay $364M, found liable in civil fraud trial_gemini_clustered.csv Cluster: 0
[(0, '0.587*"trump" + 0.492*"judge" + 0.239*"people" + 0.165*"many" + 0.158*"court"')]
Topics for File: Fox News-Trump ordered to pay $364M, found liable in civil fraud trial_gemini_clustered.csv Cluster: 1
[(0, '0.860*"trump" + 0.216*"president" + 0.149*"people" + 0.115*"2024" + 0.091*"get"')]
Topics for File: Fox News-Trump ordered to pay $364M, found liable in civil fraud trial_gemini_clustered.csv Cluster: 2
[(0, '0.831*"trump" + 0.215*"milli