In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Tokenize text
    text = str(text)
    tokens = word_tokenize(text.lower())
    # Remove punctuation and stopwords, and lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

In [5]:
worldnews = pd.read_csv('comments_filtered/worldnews_comments.csv')

# Extract the specific part of the URL from the 'link' column
pattern = r'/r/worldnews/comments/\w+/([^/]+/\w+)'

worldnews['og_post'] = worldnews['link'].str.extract(pattern, expand=False)
worldnews['og_post'] = worldnews['og_post'].str.replace('_', ' ')

# Remove rows where no match was found
worldnews = worldnews.dropna(subset=['og_post'])

worldnews['full_text'] = worldnews['og_post'] + ' ' + worldnews['body']

# Apply preprocessing to the dataset
worldnews['full_text'] = worldnews['full_text'].apply(preprocess)

  worldnews = pd.read_csv('comments_filtered/worldnews_comments.csv')


In [None]:
from gensim.corpora import Dictionary

# Create a dictionary from the processed text
dictionary = Dictionary(worldnews['full_text'])

# Convert the text to a bag-of-words format (word counts)
corpus = [dictionary.doc2bow(text) for text in worldnews['full_text']]

print(corpus)

In [7]:
from gensim.models import LdaModel

# Train the LDA model with 50 topics
lda_model = LdaModel(
    corpus,
    num_topics=50,
    id2word=dictionary,
    passes=10,
)

# Print the top 10 words for each topic
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.087*"russian" + 0.084*"live" + 0.069*"thread" + 0.067*"invasion" + 0.065*"rworldnews"')
(1, '0.100*"x" + 0.069*"refuse" + 0.067*"woman" + 0.034*"survivor" + 0.034*"auschwitz"')
(2, '0.123*"removed" + 0.064*"israel" + 0.047*"hamas" + 0.034*"jew" + 0.029*"gaza"')
(3, '0.029*"attack" + 0.017*"u" + 0.016*"air" + 0.015*"missile" + 0.015*"mongolia"')
(4, '0.077*"brazil" + 0.057*"ban" + 0.039*"japan" + 0.031*"china" + 0.017*"alone"')
(5, '0.022*"\'s" + 0.020*"n\'t" + 0.018*"ukraine" + 0.017*"russia" + 0.011*"would"')
(6, '0.075*"holocaust" + 0.031*"france" + 0.030*"germany" + 0.019*"eu" + 0.015*"state"')
(7, '0.141*"musk" + 0.063*"korean" + 0.061*"memorial" + 0.059*"defaced" + 0.039*"korea"')
(8, '0.070*"berlin" + 0.068*"http" + 0.034*"president" + 0.024*"reddit" + 0.023*"vladimir"')
(9, '0.086*"’" + 0.014*"people" + 0.012*"“" + 0.012*"”" + 0.011*"right"')


In [17]:
# write topics to txt file
import re
topics = lda_model.print_topics(num_words=10)
with open('topics.txt', 'w') as f:
    for topic_id, topic in topics:
        words = [re.sub(r'\d+\.\d+\*?', '', word).strip() for word in re.split(r'[+*]', topic) if word.strip()]
        f.write(f"{topic_id}, {', '.join(words)}\n")
