## Topic Modeling (before 2022)

In [2]:
import os
import pandas as pd
import pickle

In [3]:
!pwd

/Users/nazmulahasan/Desktop/Professional/Data Science Course/Data-Science-Social-Justice-main/notebooks/module02


In [4]:
os.chdir("../../data")

In [6]:
# Import dataset before 2022
df = pd.read_csv('df_b2022.csv')
print(df.shape)

(11846, 15)


In [7]:
len(df)

11846

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')
from gensim.models.phrases import Phrases, Phraser

In [9]:
def clean(token):
    """Helper function that specifies whether a token is:
        - punctuation
        - space
        - digit
    """
    return token.is_punct or token.is_space or token.is_digit

def line_read(df, text_col='selftext'):
    """Generator function to read in text from df and get rid of line breaks."""    
    for text in df[text_col]:
        yield text.replace('\n', '')

def preprocess(df, text_col='selftext', allowed_postags=['NOUN', 'ADJ']):
    """Preprocessing function to apply to a dataframe."""
    for parsed in nlp.pipe(line_read(df, text_col), batch_size=1000, disable=["tok2vec", "ner"]):
        # Gather lowercased, lemmatized tokens
        tokens = [token.lemma_.lower() if token.lemma_ != '-PRON-'
                  else token.lower_ 
                  for token in parsed if not clean(token)]
        # Remove specific lemmatizations, and words that are not nouns or adjectives
        tokens = [lemma
                  for lemma in tokens
                  if not lemma in ["'s",  "’s", "’"] and not lemma in allowed_postags]
        # Remove stop words
        tokens = [token for token in tokens if token not in spacy.lang.en.stop_words.STOP_WORDS]
        yield tokens

In [10]:
docs = [line for line in preprocess(df, text_col='selftext')]

In [11]:
# Create bigram model: pass docs into Phrases class
bigrams = Phrases(docs, min_count=20, threshold=300)
# Create a "frozen" bigram model using the Phraser class
bigram_phraser = Phraser(bigrams)
# Now, create bigrams 
docs_bigrams = [bigram_phraser[doc] for doc in docs]

In [12]:
trigrams = Phrases(bigrams[docs], min_count=20, threshold=100)  
trigram_phraser = Phraser(trigrams)
docs_trigrams = [trigram_phraser[doc] for doc in docs_bigrams]

In [13]:
# Let's save the data to an external JSON file:

import json
with open('b2022.json', 'w') as write:
    json.dump(docs_trigrams, write)
# Opening the same file works as follows:
with open("b2022.json") as f:
    trigrams = json.load(f)

### Constructing a Word2Vec Model

In [14]:
from gensim.models import Word2Vec
import multiprocessing

In [15]:
# Count the number of cores you have at your disposal
cores = multiprocessing.cpu_count()
# Word vector dimensionality (how many features each word will be given)
n_features = 300
# Minimum word count to be taken into account
min_word_count = 10
# Number of threads to run in parallel (equal to your amount of cores)
n_workers = cores
# Context window size
window = 5
# Downsample setting for frequent words
downsampling = 1e-2
# Seed for the random number generator (to create reproducible results)
seed = 1 
# Skip-gram = 1, CBOW = 0
sg = 1
epochs = 20

model = Word2Vec(
    sentences=trigrams,
    workers=n_workers,
    vector_size=n_features,
    min_count=min_word_count,
    window=window,
    sample=downsampling,
    seed=seed,
    sg=sg)

In [16]:
model.train(trigrams, total_examples=model.corpus_count, epochs=10)        
model.save('b2022.emb')
model = Word2Vec.load('b2022.emb')

In [17]:
len(model.wv)

4909

In [18]:
model.wv.index_to_key[0]

'ukraine'

In [19]:
model.wv.vectors[0]

array([ 0.15403882,  0.04031067,  0.02568011,  0.22978565, -0.13992351,
       -0.03571025,  0.20262171,  0.25462577, -0.11586468,  0.00857312,
        0.12236355, -0.1304633 ,  0.04779146,  0.00175683, -0.32894126,
        0.09279057,  0.13196717, -0.11381891, -0.02869826,  0.0037834 ,
        0.23155825,  0.18776171,  0.11029074, -0.01062969,  0.24932438,
        0.00096621,  0.01708085,  0.07962421, -0.1291796 , -0.12869659,
       -0.08378004,  0.08210211,  0.01773398,  0.0579184 ,  0.03020946,
        0.10272977,  0.2602234 , -0.18516208,  0.06021745,  0.08220214,
       -0.13176696,  0.0999034 ,  0.28701183, -0.12761204, -0.3432032 ,
        0.0447849 , -0.13595884,  0.03219929, -0.03109097, -0.04504774,
       -0.02317126,  0.01670879, -0.13747138, -0.01937179, -0.06910601,
        0.1151434 ,  0.10549378, -0.1641999 ,  0.02708152,  0.1063932 ,
       -0.0574516 ,  0.14121765, -0.03975619,  0.08553116, -0.07378018,
       -0.13301857, -0.18220747, -0.01013382, -0.08628427, -0.10

### Word Similarity

In [21]:
def get_most_similar_terms(model, token, topn=20):
    """Look up the top N most similar terms to the token."""
    for word, similarity in model.wv.most_similar(positive=[token], topn=topn):
        print(f"{word}: {round(similarity, 3)}")

In [37]:
get_most_similar_terms(model, 'russia')

wwiii: 0.506
pathetic: 0.492
escalating: 0.49
sanctioned: 0.49
invading: 0.488
threaten: 0.481
establish: 0.48
aggressor: 0.475
invades: 0.475
halt: 0.472
agrees: 0.468
iran: 0.465
objectives: 0.463
powers: 0.461
merely: 0.461
demands: 0.46
imposed: 0.459
stance: 0.458
republics: 0.455
refusal: 0.454


In [37]:
get_most_similar_terms(model, 'ukraine')

aggressor: 0.547
weaken: 0.543
pretext: 0.542
justify: 0.541
promises: 0.526
view_poll: 0.526
threaten: 0.525
interfere: 0.523
ukraine?i: 0.521
liberate: 0.52
unprovoked: 0.514
punish: 0.512
imperialism: 0.507
sovereignty: 0.506
lunatic: 0.505
palestine: 0.5
facto: 0.495
alongside: 0.493
peacekeeping: 0.493
peacekeepers: 0.493


In [38]:
get_most_similar_terms(model, 'crimea')

annexation: 0.633
annexed: 0.598
dnr: 0.57
crimean: 0.557
donbass: 0.523
peninsula: 0.519
separatist: 0.511
tatars: 0.507
donetsk_luhansk: 0.496
territories: 0.484
luhansk: 0.478
seize: 0.468
occupied: 0.466
rebel: 0.458
donbas: 0.452
territory: 0.45
recognize: 0.448
bridge: 0.444
boris: 0.439
black_sea: 0.438


In [39]:
get_most_similar_terms(model, 'putin')

bs: 0.506
escalating: 0.504
justification: 0.501
dare: 0.5
lunatic: 0.498
madness: 0.495
committing: 0.492
bully: 0.491
dictatorship: 0.489
russophobia: 0.487
wwiii: 0.483
winning: 0.477
nuke: 0.47
dictator: 0.468
kgb: 0.468
coup: 0.462
lie: 0.462
presidents: 0.458
cronies: 0.457
fascist: 0.455


In [40]:
get_most_similar_terms(model, 'migration')

czech_republic: 0.674
eligible: 0.635
iceland: 0.634
slovenia: 0.622
galicia: 0.621
bulgarian: 0.616
applies: 0.615
emigrated: 0.615
freely: 0.614
estonia: 0.611
reside: 0.609
bachelor: 0.605
slovakia: 0.604
armenia: 0.601
romanians: 0.593
marriage: 0.591
socialist: 0.59
biometric: 0.588
papers: 0.588
basis: 0.585


In [42]:
get_most_similar_terms(model, 'zelenskyy')

volodymyr: 0.694
comedian: 0.66
elected: 0.634
actor: 0.63
rant: 0.618
inspiring: 0.612
politician: 0.606
zelensky: 0.592
tymoshenko: 0.571
communists: 0.571
presidential: 0.566
votes: 0.566
fascism: 0.562
vladimir: 0.556
speech: 0.544
warrior: 0.541
delusional: 0.54
presidency: 0.539
president: 0.539
imminent: 0.538


In [40]:
get_most_similar_terms(model, 'travel')

questions:1: 0.532
travelled: 0.493
pfizer: 0.492
bukovel: 0.486
vaccine: 0.486
biometric: 0.486
schengen: 0.48
chisinau: 0.479
travelers: 0.472
traveling: 0.468
duration: 0.466
documentation: 0.455
antigen: 0.454
vinnytsia: 0.45
renew: 0.449
itinerary: 0.446
residence: 0.443
motorcycle: 0.441
layover: 0.439
solo: 0.438


### Visualizing High Dimensional Spaces with 𝑡 -SNE

In [25]:
from sklearn.manifold import TSNE

In [26]:
# Create some filepaths to save our model
tsne_path = 'tsne_model'
tsne_vectors_path = 'tsne_vectors.pkl'

In [27]:
tsne = TSNE(init='pca', learning_rate='auto')
tsne_vectors = tsne.fit_transform(model.wv.vectors)



In [29]:
# Store the t-SNE vectors
tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(model.wv.index_to_key),
                            columns=['x', 'y'])

In [30]:
with open(tsne_path, 'wb') as f:
    pickle.dump(tsne, f)

tsne_vectors.to_pickle(tsne_vectors_path)

In [31]:
with open(tsne_path, 'rb') as f:
    tsne = pickle.load(f)
    
tsne_vectors = pd.read_pickle(tsne_vectors_path)

In [32]:
import bokeh
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource

output_notebook()
bokeh.io.output_notebook()

In [33]:
# Add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# Create the plot and configure the title, dimensions, and tools
tsne_plot = figure(title='t-SNE Word Embeddings',
                   plot_width=800,
                   plot_height=800)

# Add a hover tool to display words on roll-over
tsne_plot.add_tools(HoverTool(tooltips='@index') )

# Draw the words as circles on the plot
tsne_plot.circle('x', 'y',
                 source=plot_data,
                 color='blue',
                 line_alpha=0.2,
                 fill_alpha=0.1,
                 size=10,
                 hover_line_color='black')

# Configure visual elements of the plot
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Engage!
show(tsne_plot)