In [59]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from bertopic import BERTopic

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [16]:
# Load sample dataset
newsgroups_data = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
documents = newsgroups_data.data[:500]  # Limit to 500 documents for faster computation

In [19]:
df = pd.DataFrame(documents, columns=["text"])

## Comparing Sentiment

### Vader

In [20]:
# Initialize VADER Sentiment Analyzer
vader_analyzer = SentimentIntensityAnalyzer()
df["vaderSent"] = df.text.apply(lambda x: vader_analyzer.polarity_scores(x))

### BERT

In [23]:
df["trunc_text"] = df.text.apply(lambda x: x[:500])

In [25]:
bert_classifier = pipeline('sentiment-analysis')
bert_scores = bert_classifier(df.trunc_text.tolist())

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [26]:
df["bertSent"] =  bert_scores

### Check the correlation between the two of these sentiment analyses pipelines

In [None]:
# your code

Find an examples where the two sentiment classifications are opposing. Why might this be the case?

# Topic Modeling

### Latent Dirichlet Allocation

In [None]:
num_topics = 10

In [46]:
list_stopwords = stopwords.words("english")
porter = PorterStemmer()
def process_step(one_str):   
    nostop_listing = [word for word in wordpunct_tokenize(one_str)
                      if word not in list_stopwords]
    clean_listing = [porter.stem(word) for word in nostop_listing
                    if word.isalpha() 
                    and len(word) > 3]
    clean_listing_str = " ".join(clean_listing)
    return(clean_listing_str)
    
df["text_proc"] = df.text.apply(process_step)

In [49]:
# Preprocess data
documents_clean = [gensim.utils.simple_preprocess(doc) for doc in df.text_proc]
dictionary = corpora.Dictionary(documents_clean)
corpus = [dictionary.doc2bow(doc) for doc in documents_clean]

In [51]:
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)

In [52]:
topics = lda_model.print_topics(num_words = 10)

for topic in topics:
    print(topic)


(0, '0.009*"would" + 0.004*"year" + 0.004*"also" + 0.004*"know" + 0.003*"think" + 0.003*"includ" + 0.003*"they" + 0.003*"right" + 0.003*"time" + 0.003*"mean"')
(1, '0.007*"would" + 0.005*"peopl" + 0.004*"use" + 0.004*"well" + 0.004*"argument" + 0.004*"like" + 0.004*"thi" + 0.004*"know" + 0.003*"scsi" + 0.003*"look"')
(2, '0.006*"would" + 0.005*"like" + 0.005*"peopl" + 0.004*"armenian" + 0.004*"time" + 0.003*"system" + 0.003*"think" + 0.003*"use" + 0.003*"know" + 0.003*"make"')
(3, '0.005*"use" + 0.004*"know" + 0.004*"would" + 0.003*"thi" + 0.003*"find" + 0.003*"come" + 0.003*"time" + 0.003*"father" + 0.003*"want" + 0.003*"window"')
(4, '0.004*"year" + 0.004*"know" + 0.004*"want" + 0.004*"would" + 0.004*"use" + 0.003*"time" + 0.003*"like" + 0.003*"thing" + 0.003*"window" + 0.003*"even"')
(5, '0.006*"would" + 0.005*"think" + 0.004*"know" + 0.004*"peopl" + 0.004*"like" + 0.004*"much" + 0.004*"thing" + 0.004*"thi" + 0.003*"true" + 0.003*"armenian"')
(6, '0.006*"use" + 0.005*"anyon" + 0.004

In [61]:
### visualize
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)







## BERTopic

In [65]:
documents = newsgroups_data.data[:2000]

In [66]:
%%time
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents)



  self._all_finite = is_finite(X)


CPU times: user 5min 59s, sys: 26.1 s, total: 6min 25s
Wall time: 1min 2s


In [67]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,631,-1_the_to_of_and,"[the, to, of, and, is, in, for, it, that, maxa...",[The following are my thoughts on a meeting th...
1,0,190,0_the_he_in_to,"[the, he, in, to, team, and, game, was, his, p...",[1992-93 Los Angeles Kings notes.\n-----------...
2,1,144,1_is_of_the_that,"[is, of, the, that, to, not, and, it, in, jesus]","[My last article included this quote:\n\n ""If ..."
3,2,105,2_dos_for_good_excellent,"[dos, for, good, excellent, offer, the, 150, w...",[Forsale:\n\nProton P1100 preamplifier\n======...
4,3,88,3_of_the_it_is,"[of, the, it, is, to, in, that, and, are, you]",[\nAll true. And all good points.\n\n\n\nWell...
5,4,70,4_the_of_israel_to,"[the, of, israel, to, in, that, and, is, it, by]","[\nSize of armies, duration, numbers of casual..."
6,5,61,5_the_to_clipper_encryption,"[the, to, clipper, encryption, chip, be, key, ...","[After reading the debate over the Clipper, I ..."
7,6,61,6_deletion_huh_david_,"[deletion, huh, david, , , , , , , ]","[David\n\n\n, \nHuh?, \n(Deletion)\n ]"
8,7,52,7_space_the_and_nasa,"[space, the, and, nasa, of, for, to, shuttle, ...",[Archive-name: space/new_probes\nLast-modified...
9,8,48,8_scsi_tape_the_with,"[scsi, tape, the, with, drive, this, adaptec, ...","[From article <1qq7i1INNdqc@dns1.NMSU.Edu>, by..."


## Your turn

#### Load up one of the text documents from the shared class folder
#### Consider sampling to reduce the size of the dataset
#### Apply the four techniques to the dataset, and comment on what you find!