## Topic Modeling (after 2022)

In [53]:
import os
import pandas as pd
import pickle

In [3]:
!pwd

/Users/nazmulahasan/Desktop/Professional/Data Science Course/Data-Science-Social-Justice-main/notebooks/module02


In [4]:
os.chdir("../../data")

In [54]:
# Import dataset before 2022
df = pd.read_csv('df_a2022.csv')
print(df.shape)

(6515, 15)


In [55]:
len(df)

6515

In [56]:
import spacy
nlp = spacy.load('en_core_web_sm')
from gensim.models.phrases import Phrases, Phraser

In [57]:
def clean(token):
    """Helper function that specifies whether a token is:
        - punctuation
        - space
        - digit
    """
    return token.is_punct or token.is_space or token.is_digit

def line_read(df, text_col='selftext'):
    """Generator function to read in text from df and get rid of line breaks."""    
    for text in df[text_col]:
        yield text.replace('\n', '')

def preprocess(df, text_col='selftext', allowed_postags=['NOUN', 'ADJ']):
    """Preprocessing function to apply to a dataframe."""
    for parsed in nlp.pipe(line_read(df, text_col), batch_size=1000, disable=["tok2vec", "ner"]):
        # Gather lowercased, lemmatized tokens
        tokens = [token.lemma_.lower() if token.lemma_ != '-PRON-'
                  else token.lower_ 
                  for token in parsed if not clean(token)]
        # Remove specific lemmatizations, and words that are not nouns or adjectives
        tokens = [lemma
                  for lemma in tokens
                  if not lemma in ["'s",  "’s", "’"] and not lemma in allowed_postags]
        # Remove stop words
        tokens = [token for token in tokens if token not in spacy.lang.en.stop_words.STOP_WORDS]
        yield tokens

In [58]:
docs = [line for line in preprocess(df, text_col='selftext')]

In [59]:
# Create bigram model: pass docs into Phrases class
bigrams = Phrases(docs, min_count=20, threshold=300)
# Create a "frozen" bigram model using the Phraser class
bigram_phraser = Phraser(bigrams)
# Now, create bigrams 
docs_bigrams = [bigram_phraser[doc] for doc in docs]

In [60]:
trigrams = Phrases(bigrams[docs], min_count=20, threshold=100)  
trigram_phraser = Phraser(trigrams)
docs_trigrams = [trigram_phraser[doc] for doc in docs_bigrams]

In [61]:
# Let's save the data to an external JSON file:

import json
with open('a2022.json', 'w') as write:
    json.dump(docs_trigrams, write)
# Opening the same file works as follows:
with open("a2022.json") as f:
    trigrams = json.load(f)

### Constructing a Word2Vec Model

In [63]:
from gensim.models import Word2Vec
import multiprocessing

In [64]:
# Count the number of cores you have at your disposal
cores = multiprocessing.cpu_count()
# Word vector dimensionality (how many features each word will be given)
n_features = 300
# Minimum word count to be taken into account
min_word_count = 10
# Number of threads to run in parallel (equal to your amount of cores)
n_workers = cores
# Context window size
window = 5
# Downsample setting for frequent words
downsampling = 1e-2
# Seed for the random number generator (to create reproducible results)
seed = 1 
# Skip-gram = 1, CBOW = 0
sg = 1
epochs = 20

model = Word2Vec(
    sentences=trigrams,
    workers=n_workers,
    vector_size=n_features,
    min_count=min_word_count,
    window=window,
    sample=downsampling,
    seed=seed,
    sg=sg)

In [65]:
model.train(trigrams, total_examples=model.corpus_count, epochs=10)        
model.save('a2022.emb')
model = Word2Vec.load('a2022.emb')

In [66]:
len(model.wv)

3393

In [69]:
model.wv.index_to_key[0]

'ukraine'

In [70]:
model.wv.vectors[0]

array([-0.11286732,  0.02318297,  0.16201086,  0.19806772, -0.2607417 ,
       -0.21206419,  0.17596273,  0.51064426,  0.07546037, -0.0386712 ,
       -0.10286175, -0.26760715,  0.1908857 ,  0.05626771, -0.21060555,
       -0.2567893 , -0.17143895,  0.11890506,  0.21027036, -0.1573991 ,
        0.253609  ,  0.03955222,  0.09953902, -0.09201608,  0.23690325,
       -0.2747531 ,  0.19681962,  0.2854803 , -0.20427658, -0.18852924,
       -0.0805032 ,  0.07863539,  0.26740685, -0.09275786, -0.09555019,
       -0.20133194,  0.14035143, -0.37230876,  0.05244995, -0.08375604,
       -0.17710797, -0.00849129, -0.20222142, -0.18366799,  0.01186204,
        0.10820254,  0.00147515,  0.15124762,  0.03632816,  0.11624506,
        0.0678018 ,  0.05620463,  0.12625216, -0.08363765, -0.00323798,
        0.12605946,  0.24680418,  0.01325781,  0.01866217,  0.09327708,
        0.12750626,  0.12991562, -0.14469546, -0.12818141, -0.2508934 ,
       -0.05595501,  0.00527939,  0.13695192, -0.06274804,  0.01

### Word Similarity

In [71]:
def get_most_similar_terms(model, token, topn=20):
    """Look up the top N most similar terms to the token."""
    for word, similarity in model.wv.most_similar(positive=[token], topn=topn):
        print(f"{word}: {round(similarity, 3)}")

In [100]:
get_most_similar_terms(model, 'russia')

wwiii: 0.515
economically: 0.515
halt: 0.506
ideology: 0.499
deleted][view: 0.497
achieve: 0.485
smart: 0.483
sees: 0.479
comparison: 0.466
cease: 0.465
pathetic: 0.463
committing: 0.462
draw: 0.46
sanction: 0.459
baltic: 0.456
genocide: 0.455
annexation: 0.455
annexed: 0.454
favor: 0.454
iran: 0.454


In [74]:
get_most_similar_terms(model, 'ukraine')

ukraina: 0.523
moves: 0.489
refusing: 0.488
justify: 0.479
inspired: 0.477
unprovoked: 0.476
temporarily: 0.458
alongside: 0.457
favor: 0.455
wanting: 0.453
pretext: 0.453
mercenaries: 0.453
donbas: 0.452
promise: 0.451
express: 0.45
ukrain: 0.448
justice: 0.448
aggressor: 0.448
capable: 0.447
sees: 0.446


In [77]:
get_most_similar_terms(model, 'crimea')

donbass: 0.748
annexation: 0.73
dnr: 0.694
annexed: 0.681
donbas: 0.664
territories: 0.625
luhansk: 0.62
attempted: 0.607
separatist: 0.602
occupied: 0.6
recognize: 0.587
republics: 0.587
donetsk_luhansk: 0.574
seize: 0.574
eastern: 0.57
separatists: 0.562
sea: 0.543
rebel: 0.534
region: 0.527
southern: 0.52


In [78]:
get_most_similar_terms(model, 'putin')

lunatic: 0.556
coup: 0.551
insane: 0.533
madness: 0.532
lie: 0.53
mad: 0.527
regime: 0.524
bs: 0.521
dictator: 0.512
justification: 0.511
genocide: 0.505
escalating: 0.504
dictatorship: 0.502
rhetoric: 0.5
dictators: 0.498
fascists: 0.497
wwiii: 0.495
nuke: 0.495
delusional: 0.492
excuse: 0.486


In [103]:
get_most_similar_terms(model, 'travel')

booked: 0.743
residence: 0.719
portugal: 0.695
flights: 0.687
persons: 0.683
permit: 0.675
lawyer: 0.674
certificate: 0.673
tourists: 0.667
hotels: 0.663
volunteering: 0.661
accommodation: 0.651
refuge: 0.651
visa: 0.647
pets: 0.646
traveling: 0.645
trains: 0.645
zhytomyr: 0.642
entry: 0.64
ireland: 0.64


In [81]:
get_most_similar_terms(model, 'zelenskyy')

volodymyr: 0.733
jewish: 0.703
elected: 0.681
politician: 0.675
speech: 0.645
vladimir: 0.639
prime_minister: 0.639
inspiring: 0.635
zelensky: 0.632
president: 0.63
trump: 0.618
holocaust: 0.617
votes: 0.614
spelling: 0.608
nazi: 0.606
supporters: 0.603
rising: 0.601
neo: 0.595
leader: 0.585
referring: 0.584


### Visualizing High Dimensional Spaces with 𝑡 -SNE

In [82]:
from sklearn.manifold import TSNE

In [83]:
# Create some filepaths to save our model
tsne_path = 'tsne_model'
tsne_vectors_path = 'tsne_vectors.pkl'

In [84]:
tsne = TSNE(init='pca', learning_rate='auto')
tsne_vectors = tsne.fit_transform(model.wv.vectors)



In [85]:
# Store the t-SNE vectors
tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(model.wv.index_to_key),
                            columns=['x', 'y'])

In [86]:
with open(tsne_path, 'wb') as f:
    pickle.dump(tsne, f)

tsne_vectors.to_pickle(tsne_vectors_path)

In [87]:
with open(tsne_path, 'rb') as f:
    tsne = pickle.load(f)
    
tsne_vectors = pd.read_pickle(tsne_vectors_path)

In [88]:
import bokeh
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource

output_notebook()
bokeh.io.output_notebook()

In [89]:
# Add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# Create the plot and configure the title, dimensions, and tools
tsne_plot = figure(title='t-SNE Word Embeddings',
                   plot_width=800,
                   plot_height=800)

# Add a hover tool to display words on roll-over
tsne_plot.add_tools(HoverTool(tooltips='@index') )

# Draw the words as circles on the plot
tsne_plot.circle('x', 'y',
                 source=plot_data,
                 color='blue',
                 line_alpha=0.2,
                 fill_alpha=0.1,
                 size=10,
                 hover_line_color='black')

# Configure visual elements of the plot
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Engage!
show(tsne_plot)