In [1]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [2]:
"""
    Example of embedding's data structure
    embs = {
        a: [0,0,1,....,0],
        b: [1,0,0,....,0],
        ...
    }

"""

import numpy as np


def align_two_embs(emb_to_align, emb_base, common_keys=None):
    """
    :param emb_to_align: embedding vectors to be align
    :param emb_base: base embedding vectors
    :return:
        aligned_embeddings of emb_to_align
    """
    if not common_keys:
        common_keys = list(set(emb_to_align.keys()).intersection(set(emb_base.keys())))

    A = np.array([emb_to_align[key] for key in common_keys]).T
    B = np.array([emb_base[key] for key in common_keys]).T
    M = B.dot(A.T)
    u, sigma, v_t = np.linalg.svd(M)
    rotation_matrix = u.dot(v_t)
    aligned_embedding = {k: rotation_matrix.dot(v) for k, v in emb_to_align.items()}

    return aligned_embedding


def align_list_of_embs(emb_list, emb_base):
    """
    :param emb_list: list of embedding vectors to be align
    :param emb_base: base embedding vectors
    :return:
        list of aligned_embeddings
    """
    common_keys = set.intersection(*[set(emb.keys()) for emb in emb_list])
    common_keys = list(common_keys.intersection(set(emb_base.keys())))

    aligned_embeddings = []
    for emb_to_align in emb_list:
        aligned_emb = align_two_embs(emb_to_align, emb_base, common_keys)
        aligned_embeddings.append(aligned_emb)

    return aligned_embeddings

In [3]:
import numpy as np
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
        
    First, intersect the vocabularies (see `intersection_align_gensim` documentation).
    Then do the alignment on the other_embed model.
    Replace the other_embed model's syn0 and syn0norm numpy matrices with the aligned version.
    Return other_embed.
    If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
    """

    # patch by Richard So [https://twitter.com/richardjeanso) (thanks!) to update this code for new version of gensim
    # base_embed.init_sims(replace=True)
    # other_embed.init_sims(replace=True)

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # re-filling the normed vectors
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs) 
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # another matrix operation
    ortho = u.dot(v) 
    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)    
    
    return other_embed

def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2.
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words: common_vocab &= set(words)

    # If no alignment necessary because vocab is identical...
    if not vocab_m1 - common_vocab and not vocab_m2 - common_vocab:
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"), reverse=True)
    # print(len(common_vocab))

    # Then for each model...
    for m in [m1, m2]:
        # Replace old syn0norm array with new one (with common vocab)
        indices = [m.wv.key_to_index[w] for w in common_vocab]
        old_arr = m.wv.vectors
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.vectors = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        new_key_to_index = {}
        new_index_to_key = []
        for new_index, key in enumerate(common_vocab):
            new_key_to_index[key] = new_index
            new_index_to_key.append(key)
        m.wv.key_to_index = new_key_to_index
        m.wv.index_to_key = new_index_to_key
        
        print(len(m.wv.key_to_index), len(m.wv.vectors))
        
    return (m1,m2)

In [4]:
import pandas as pd
pd.options.display.max_colwidth = -1

  pd.options.display.max_colwidth = -1


In [5]:
df = pd.read_csv("../data/final_twitter_data.csv", sep='\t', lineterminator='\n', error_bad_lines=False)



  df = pd.read_csv("../data/final_twitter_data.csv", sep='\t', lineterminator='\n', error_bad_lines=False)
  df = pd.read_csv("../data/final_twitter_data.csv", sep='\t', lineterminator='\n', error_bad_lines=False)


In [6]:
df.created_at = pd.to_datetime(df.created_at)
#df.created_at = df.created_at.dt.date
df['year'] = df.created_at.dt.year
df['month'] = df.created_at.dt.month
df['day'] = df.created_at.dt.day
df['month'] = df.month.apply(lambda x: int((x-1)//3)+1)
df['date'] = df.apply(lambda x: "{}/{}/{}".format(str(x.day),str(x.month),str(x.year)),axis=1)
df['date2'] = df.apply(lambda x: "{}/{}".format(str(x.month),str(x.year)),axis=1)

df = df.sort_values(['date']).reset_index(drop=True)

In [7]:
import re

def clean_tweets(text):
    text = text.lower()
    text = re.sub(r'@\w+','',text)
    text = re.sub(r'http\S+','',text)
    text = re.sub(r'://\S+','',text)
    text = re.sub(r'#\w+','',text)
    text = re.sub(r'\d+','',text)
    return text.strip()

def remove_html(text):
    text = text.replace("\n"," ")
    pattern = re.compile('<.*?>') #all the HTML tags
    return pattern.sub(r'', text)

def remove_email(text):
    text = re.sub(r'[\w.<>]*\w+@\w+[\w.<>]*', " ", text)
    return text

def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

In [8]:
final_texts={}
for i in range(2014,2023):
    final_texts[i] = list(map(lambda x: clean_tweets(remove_html(remove_email(remove_emojis(x.lower())))), \
                              df[df.year <= i].text.values.tolist()))

In [9]:
from gensim.test.utils import common_texts

In [10]:
len(final_texts[2020])

76962

In [11]:
from tqdm import tqdm

In [12]:
models={}
for i in tqdm(final_texts.keys()):
    model = Word2Vec(sentences=[i.split() for i in final_texts[i]], window=5, min_count=10, workers=4)
    models[i] = model

100%|█████████████████████████████████████████████| 9/9 [00:21<00:00,  2.37s/it]


In [49]:
models[2014].vector_size

100

In [13]:
models[2014].wv.key_to_index

{'hai': 0,
 'की': 1,
 'के': 2,
 'ki': 3,
 'में': 4,
 'ke': 5,
 'salman': 6,
 'ko': 7,
 'से': 8,
 'है': 9,
 'se': 10,
 'bhi': 11,
 'को': 12,
 'to': 13,
 'का': 14,
 'nahi': 15,
 'srk': 16,
 'ka': 17,
 ':': 18,
 'aur': 19,
 'ने': 20,
 'पर': 21,
 'me': 22,
 'ho': 23,
 '-': 24,
 'और': 25,
 'bhai': 26,
 'तो': 27,
 'khan': 28,
 'मोदी': 29,
 'ne': 30,
 'kya': 31,
 'नहीं': 32,
 'ek': 33,
 'hi': 34,
 'hain': 35,
 'main': 36,
 '.': 37,
 'via': 38,
 'and': 39,
 'kar': 40,
 'in': 41,
 'भी': 42,
 'mein': 43,
 'koi': 44,
 'ye': 45,
 'सरकार': 46,
 'कर': 47,
 'na': 48,
 'liye': 49,
 'toh': 50,
 'k': 51,
 'rt': 52,
 'एक': 53,
 'kuch': 54,
 '!': 55,
 'par': 56,
 'aap': 57,
 'kabhi': 58,
 '?': 59,
 'politics': 60,
 'hai.': 61,
 '":': 62,
 'is': 63,
 'ही': 64,
 'indian': 65,
 'हैं': 66,
 'jo': 67,
 'हो': 68,
 'aaj': 69,
 'raha': 70,
 'the': 71,
 '…': 72,
 'ab': 73,
 'क्या': 74,
 'लिए': 75,
 'क्रिकेटरों': 76,
 ',': 77,
 'sab': 78,
 'of': 79,
 'pe': 80,
 'yeh': 81,
 'सच': 82,
 'on': 83,
 'baat': 84,
 'अब': 8

In [14]:
for year in range(2014,2023):
    #print (year, models[year].wv.similar_by_vector(models[year].wv['government']))
    print (year, models[year].wv.most_similar('government', topn=10))

2014 [('ki', 0.9978377223014832), ('की', 0.9978335499763489), ('hai', 0.9978311061859131), ('ke', 0.9977849721908569), ('को', 0.9977843165397644), ('salman', 0.9977703094482422), ('par', 0.9977450370788574), ('aur', 0.9977361559867859), ('se', 0.9977262020111084), ('bhai', 0.9977203011512756)]
2015 [('jab', 0.9995773434638977), ('karte', 0.9994240999221802), ('tu', 0.9993805289268494), ('har', 0.9993798136711121), ('h', 0.9993774890899658), ('din', 0.9993398785591125), ('abhi', 0.9993375539779663), ('saal', 0.9993009567260742), ('he', 0.9992976188659668), ('aa', 0.9992762207984924)]
2016 [('mujhe', 0.9995773434638977), ('sirf', 0.9995262622833252), ('mai', 0.9995003342628479), ('apne', 0.9994505643844604), ('gaya', 0.9994416832923889), ('aaj', 0.9994167685508728), ('jab', 0.9994086623191833), ('kiya', 0.999405026435852), ('hota', 0.9993815422058105), ('tum', 0.9993136525154114)]
2017 [('hai.', 0.9995414614677429), ('hai,', 0.9994375705718994), ('jo', 0.9994053840637207), ('rahi', 0.999

In [15]:
from pyvis.network import Network
import networkx as nx

In [16]:
seed_word = 'government'

all_nets = {}

for i, year in enumerate([2018,2019,2020,2021,2022]):
    
    net = Network(notebook=True)
    G = nx.Graph()

    similar_words = [i[0] for i in models[year].wv.most_similar(seed_word, topn=5)]

    G.add_node(seed_word)

    for word in similar_words:
        if word not in G.nodes():
            G.add_node(word)
            G.add_edge(seed_word,word)

        similar_words2 = [i[0] for i in models[year].wv.most_similar(word, topn=5)]
        for word2 in similar_words2:
            if word2 in G.nodes():
                G.add_edge(word, word2)
    
    all_nets[i] = G
    
    if i == 0:
        for edge in G.edges():
            net.add_node(edge[0], color='blue')
            net.add_node(edge[1], color='blue')

            net.add_edge(edge[0], edge[1])
    else:
        for node in all_nets[i-1].nodes():
            if node not in G.nodes():
                pass
                #net.add_node(node, color='red')
            else:
                net.add_node(node, color='blue')
        
        for edge in G.edges():
            if edge[0] in all_nets[i-1].nodes():
                net.add_node(edge[0], color='blue')
            else:
                net.add_node(edge[0], color='green')
            if edge[1] in all_nets[i-1].nodes():
                net.add_node(edge[1], color='blue')
            else:
                net.add_node(edge[1], color='green')

            net.add_edge(edge[0], edge[1])
    
    net.show('nx_{}.html'.format(year))

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


In [17]:
df.head()

Unnamed: 0,created_at,author_id,lang,text,id,public_metrics,withheld,script_lang,translated_text,POS,LID,CMI,Langugage_hueristic,follower_count,year,month,day,date,date2
0,2014-03-01 10:28:08+00:00,2371767000.0,hi,अपन देशक नेता जी : कविता: चोर उचक्का नेता बनी केघोटाला पर जे घोटाला करतघैयला म जे भुर रहत ततहन पैइन कोना क भरत... http://t.co/V4RC5FUSbx,4.397087e+17,"{'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}",,hindi,apan deshak netaa jee : kavita: chor uchakka netaa bunny keghotala parr jey ghotala karatghaiyala maam jey bhur rahat tatahan paine cona kaa bharat... http://t.co/v4rc5fusbx,"{""apan"": ""ADV"", ""deshak"": ""NOUN"", ""netaa"": ""VERB"", ""jee"": ""VERB"", "":"": ""X"", ""kavita"": ""NOUN"", ""chor"": ""NOUN"", ""uchakka"": ""VERB"", ""bunny"": ""NOUN"", ""keghotala"": ""VERB"", ""parr"": ""PART"", ""jey"": ""PRON"", ""ghotala"": ""VERB"", ""karatghaiyala"": ""VERB"", ""maam"": ""ADP"", ""bhur"": ""VERB"", ""rahat"": ""VERB"", ""tatahan"": ""NOUN"", ""paine"": ""VERB"", ""cona"": ""VERB"", ""kaa"": ""VERB"", ""bharat"": ""PROPN"", ""."": ""X"", ""http"": ""X"", ""/"": ""X"", ""t"": ""X"", ""co"": ""X"", ""v4rc5fusbx"": ""X""}","{""apan"": ""hin"", ""deshak"": ""hin"", ""netaa"": ""hin"", ""jee"": ""hin"", "":"": ""other"", ""kavita"": ""hin"", ""chor"": ""hin"", ""uchakka"": ""hin"", ""bunny"": ""hin"", ""keghotala"": ""hin"", ""parr"": ""hin"", ""jey"": ""hin"", ""ghotala"": ""hin"", ""karatghaiyala"": ""hin"", ""maam"": ""hin"", ""bhur"": ""hin"", ""rahat"": ""hin"", ""tatahan"": ""hin"", ""paine"": ""hin"", ""cona"": ""hin"", ""kaa"": ""hin"", ""bharat"": ""ne"", ""."": ""other"", ""http"": ""other"", ""/"": ""other"", ""t"": ""en"", ""co"": ""en"", ""v4rc5fusbx"": ""other""}",0.714286,Code-mixed,1.0,2014,1,1,1/1/2014,1/2014
1,2014-03-01 20:37:07+00:00,131116500.0,hi,Shri. Pranab Mukherjee: Do not change the name of Shankaracharya Hill in Kashmir http://t.co/NV6fL7SdEH via @Change4India,4.398619e+17,"{'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}",,,Shri. Pranab Mukherjee: Do not change the name of Shankaracharya Hill in Kashmir http://t.co/NV6fL7SdEH via @Change4India,"{""Shri"": ""PROPN"", ""."": ""X"", ""Pranab"": ""PROPN"", ""Mukherjee"": ""PROPN"", "":"": ""X"", ""Do"": ""VERB"", ""not"": ""PART_NEG"", ""change"": ""VERB"", ""the"": ""DET"", ""name"": ""NOUN"", ""of"": ""ADP"", ""Shankaracharya"": ""PROPN"", ""Hill"": ""PROPN"", ""in"": ""ADP"", ""Kashmir"": ""PROPN"", ""http"": ""X"", ""/"": ""X"", ""t"": ""X"", ""co"": ""X"", ""NV6fL7SdEH"": ""X"", ""via"": ""ADP"", ""@"": ""X"", ""Change4India"": ""X""}","{""Shri"": ""ne"", ""."": ""other"", ""Pranab"": ""ne"", ""Mukherjee"": ""ne"", "":"": ""other"", ""Do"": ""en"", ""not"": ""en"", ""change"": ""en"", ""the"": ""en"", ""name"": ""en"", ""of"": ""en"", ""Shankaracharya"": ""ne"", ""Hill"": ""ne"", ""in"": ""en"", ""Kashmir"": ""ne"", ""http"": ""other"", ""/"": ""other"", ""t"": ""en"", ""co"": ""en"", ""NV6fL7SdEH"": ""en"", ""via"": ""en"", ""@"": ""other"", ""Change4India"": ""ne""}",0.478261,English,,2014,1,1,1/1/2014,1/2014
2,2014-01-01 17:37:06+00:00,89146340.0,hi,ae dil kis baat par khush hae tu? Din to badal jaate hae...dil nahi ...a very happy 2014..cheers!,4.184357e+17,"{'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}",,,ae dil kis baat par khush hae tu? Din to badal jaate hae...dil nahi ...a very happy 2014..cheers!,"{""ae"": ""DET"", ""dil"": ""NOUN"", ""kis"": ""ADJ"", ""baat"": ""NOUN"", ""par"": ""ADP"", ""khush"": ""VERB"", ""hae"": ""VERB"", ""tu"": ""PRON"", ""?"": ""X"", ""Din"": ""NOUN"", ""to"": ""PART"", ""badal"": ""VERB"", ""jaate"": ""VERB"", ""."": ""X"", ""nahi"": ""PART_NEG"", ""a"": ""DET"", ""very"": ""ADV"", ""happy"": ""ADJ"", ""2014"": ""NUM"", ""cheers"": ""VERB"", ""!"": ""X""}","{""ae"": ""hin"", ""dil"": ""hin"", ""kis"": ""hin"", ""baat"": ""hin"", ""par"": ""hin"", ""khush"": ""hin"", ""hae"": ""hin"", ""tu"": ""hin"", ""?"": ""other"", ""Din"": ""hin"", ""to"": ""hin"", ""badal"": ""hin"", ""jaate"": ""hin"", ""."": ""other"", ""nahi"": ""hin"", ""a"": ""en"", ""very"": ""en"", ""happy"": ""en"", ""2014"": ""other"", ""cheers"": ""en"", ""!"": ""other""}",0.619048,Code-mixed,1.0,2014,1,1,1/1/2014,1/2014
3,2014-03-01 01:46:15+00:00,2371767000.0,hi,"बाबा भूतनाथ क होइत अछि एतय पूजा: बेनीपुर, महिनाम : क्षेत्र के महिनाम-पोहद्द गांव स्थित कमला नदी क तट पर दू सौ ... http://t.co/syrVgOF1UC",4.395773e+17,"{'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}",,hindi,"baaba bhutnath kaa hoit achhi etay pooja: benipur, mahinam : kshetra key mahinam-pohadda gaanv sthit kamala nadee kaa tata parr doo sau ... http://t.co/syrvgof1uc","{""baaba"": ""NOUN"", ""bhutnath"": ""PROPN"", ""kaa"": ""PART"", ""hoit"": ""VERB"", ""achhi"": ""ADJ"", ""etay"": ""ADJ"", ""pooja"": ""NOUN"", "":"": ""X"", ""benipur"": ""PROPN"", "","": ""X"", ""mahinam"": ""PROPN"", ""kshetra"": ""PART"", ""key"": ""ADP"", ""-"": ""X"", ""pohadda"": ""PART"", ""gaanv"": ""PART"", ""sthit"": ""VERB"", ""kamala"": ""VERB"", ""nadee"": ""PRON"", ""tata"": ""NOUN"", ""parr"": ""VERB"", ""doo"": ""VERB"", ""sau"": ""VERB"", ""."": ""X"", ""http"": ""X"", ""/"": ""X"", ""t"": ""X"", ""co"": ""X"", ""syrvgof1uc"": ""X""}","{""baaba"": ""hin"", ""bhutnath"": ""ne"", ""kaa"": ""hin"", ""hoit"": ""hin"", ""achhi"": ""hin"", ""etay"": ""hin"", ""pooja"": ""hin"", "":"": ""other"", ""benipur"": ""ne"", "","": ""other"", ""mahinam"": ""ne"", ""kshetra"": ""hin"", ""key"": ""hin"", ""-"": ""other"", ""pohadda"": ""fw"", ""gaanv"": ""ne"", ""sthit"": ""fw"", ""kamala"": ""hin"", ""nadee"": ""hin"", ""tata"": ""fw"", ""parr"": ""fw"", ""doo"": ""fw"", ""sau"": ""fw"", ""."": ""other"", ""http"": ""other"", ""/"": ""other"", ""t"": ""en"", ""co"": ""en"", ""syrvgof1uc"": ""other""}",0.344828,Code-mixed,1.0,2014,1,1,1/1/2014,1/2014
4,2015-02-01 09:07:31+00:00,1722622000.0,hi,"Salman Khan , Kapil sharma , Karan Johar , Sidharth Malhotra and Deepika Padukone @ #FilmfareAwards2015 http://t.co/ag1S0kO1PD",5.618131e+17,"{'retweet_count': 5, 'reply_count': 1, 'like_count': 7, 'quote_count': 0}",,,"Salman Khan , Kapil sharma , Karan Johar , Sidharth Malhotra and Deepika Padukone @ #FilmfareAwards2015 http://t.co/ag1S0kO1PD","{""Salman"": ""PROPN"", ""Khan"": ""PROPN"", "","": ""X"", ""Kapil"": ""PROPN"", ""sharma"": ""PROPN"", ""Karan"": ""PROPN"", ""Johar"": ""PROPN"", ""Sidharth"": ""PROPN"", ""Malhotra"": ""PROPN"", ""and"": ""CONJ"", ""Deepika"": ""PROPN"", ""Padukone"": ""PROPN"", ""@"": ""X"", ""#"": ""X"", ""FilmfareAwards2015"": ""X"", ""http"": ""X"", "":"": ""X"", ""/"": ""X"", ""t"": ""X"", ""."": ""X"", ""co"": ""X"", ""ag1S0kO1PD"": ""X""}","{""Salman"": ""ne"", ""Khan"": ""ne"", "","": ""other"", ""Kapil"": ""ne"", ""sharma"": ""ne"", ""Karan"": ""ne"", ""Johar"": ""ne"", ""Sidharth"": ""ne"", ""Malhotra"": ""ne"", ""and"": ""en"", ""Deepika"": ""ne"", ""Padukone"": ""ne"", ""@"": ""other"", ""#"": ""ne"", ""FilmfareAwards2015"": ""other"", ""http"": ""other"", "":"": ""other"", ""/"": ""other"", ""t"": ""en"", ""."": ""other"", ""co"": ""en"", ""ag1S0kO1PD"": ""en""}",0.181818,English,,2015,1,1,1/1/2015,1/2015


In [18]:
import ast

In [19]:
words = []
poses = []

j = 0

for i in tqdm(range(df.shape[0])):
    dict_ = ast.literal_eval(df.POS.iloc[i])
    for word, pos in dict_.items():
        words.append(word)
        poses.append(pos)

100%|████████████████████████████████| 262578/262578 [00:19<00:00, 13499.11it/s]


In [20]:
word_pos = pd.DataFrame()
word_pos['Word'] = words
word_pos['POS'] = poses

In [21]:
word_pos.head()

Unnamed: 0,Word,POS
0,apan,ADV
1,deshak,NOUN
2,netaa,VERB
3,jee,VERB
4,:,X


In [22]:
word_pos['id'] = word_pos.index

In [23]:
word_pos = word_pos.groupby(['Word','POS'])['id'].count().sort_values()[::-1].reset_index(drop=False).drop_duplicates(['Word']).reset_index(drop=True)


In [24]:
word_pos.head()

Unnamed: 0,Word,POS,id
0,.,X,228451
1,:,X,175457
2,/,X,173081
3,t,X,171180
4,co,X,170980


In [25]:
word_pos[word_pos.Word == 'government']

Unnamed: 0,Word,POS,id
510,government,NOUN,1089


In [26]:
words = []
word_retain_perc = []
years = []

for year in tqdm([2016,2017,2018,2019,2020,2021]):
    vocab = models[year].wv.index_to_key
    
    for word in vocab:
        similar_words_base = set([i[0] for i in models[year].wv.most_similar(word, topn=25)])
        similar_words_future = set([i[0] for i in models[year+1].wv.most_similar(word, topn=25)])
    
        word_retain_perc.append(len(similar_words_base.intersection(similar_words_future))/len(similar_words_base))
        words.append(word)
        years.append(year)

100%|█████████████████████████████████████████████| 6/6 [01:25<00:00, 14.19s/it]


In [27]:
word_retain_perc_df = pd.DataFrame()
word_retain_perc_df['Word'] = words
word_retain_perc_df['Retain_Perc'] = word_retain_perc
word_retain_perc_df['Year'] = years

In [28]:
word_retain_perc_df = word_retain_perc_df.groupby(['Word','Year'])['Retain_Perc'].mean().reset_index(drop=False)
word_retain_perc_df2 = word_retain_perc_df.groupby(['Word'])['Retain_Perc'].mean().reset_index(drop=False)


In [29]:
word_retain_perc_df = pd.merge(word_retain_perc_df, word_pos, how='inner')
word_retain_perc_df2 = pd.merge(word_retain_perc_df2, word_pos, how='inner')

In [30]:
word_retain_perc_df.head()

Unnamed: 0,Word,Year,Retain_Perc,POS,id
0,!,2016,0.0,X,30885
1,!,2017,0.0,X,30885
2,!,2018,0.08,X,30885
3,!,2019,0.56,X,30885
4,!,2020,0.36,X,30885


In [31]:
word_retain_perc_df2.head()

Unnamed: 0,Word,Retain_Perc,POS,id
0,!,0.293333,X,30885
1,"""",0.166667,X,10782
2,#,0.193333,X,89092
3,$,0.08,X,86
4,%,0.193333,X,1619


In [32]:
word_retain_perc_df = word_retain_perc_df.sort_values(['Retain_Perc'], ascending=[False]).reset_index(drop=True)

In [33]:
word_retain_perc_df.head(10)

Unnamed: 0,Word,Year,Retain_Perc,POS,id
0,हो,2021,0.92,VERB,4
1,रहा,2021,0.92,VERB,3
2,लिए,2021,0.92,ADP,1
3,नमन,2020,0.88,PROPN,4
4,alia,2021,0.88,PROPN,3
5,करने,2021,0.84,VERB,1
6,बाद,2021,0.84,ADV,1
7,जाये,2021,0.84,VERB,1
8,की,2021,0.84,ADP,5
9,।,2021,0.84,X,5


In [34]:
word_retain_perc_df2 = word_retain_perc_df2.sort_values(['Retain_Perc'], ascending=[False]).reset_index(drop=True)

In [35]:
word_retain_perc_df2.head(10)

Unnamed: 0,Word,Retain_Perc,POS,id
0,shahrukh,0.666667,PART,152
1,navi,0.653333,NOUN,39
2,kapoor,0.646667,NOUN,175
3,radhey,0.64,VERB,5
4,rukh,0.64,VERB,102
5,deepika,0.626667,ADJ,35
6,bhutan,0.626667,PROPN,23
7,shared,0.62,VERB,97
8,ranveer,0.62,NOUN,55
9,khan,0.613333,PROPN,502


In [47]:
word_retain_perc_df2[word_retain_perc_df2.POS == 'DET']

Unnamed: 0,Word,Retain_Perc,POS,id
21,a,0.566667,DET,885
47,the,0.520000,DET,1673
101,एक,0.446667,DET,2
114,unmesh,0.440000,DET,7
128,इस,0.433333,DET,1
...,...,...,...,...
4020,iit,0.020000,DET,55
4260,andhe,0.000000,DET,45
4273,isne,0.000000,DET,9
4514,koy,0.000000,DET,47


In [36]:
word_retain_perc_df3 = word_retain_perc_df.groupby(['POS'])['Retain_Perc'].mean().reset_index(drop=False).sort_values(['Retain_Perc']).reset_index(drop=True)

In [37]:
word_retain_perc_df4 = word_retain_perc_df.groupby(['POS','Year'])['Retain_Perc'].mean().reset_index(drop=False)

In [38]:
word_retain_perc_df3

Unnamed: 0,POS,Retain_Perc
0,NOUN,0.142551
1,PART,0.148052
2,VERB,0.153838
3,ADJ,0.166202
4,X,0.176681
5,ADV,0.178882
6,NUM,0.19
7,ADP,0.192777
8,PRON,0.200354
9,PRON_WH,0.222105


In [39]:
word_retain_perc_df4

Unnamed: 0,POS,Year,Retain_Perc
0,ADJ,2016,0.120909
1,ADJ,2017,0.124444
2,ADJ,2018,0.142174
3,ADJ,2019,0.140155
4,ADJ,2020,0.166355
...,...,...,...
79,X,2017,0.122500
80,X,2018,0.162759
81,X,2019,0.164156
82,X,2020,0.182222


In [45]:
word_retain_perc_df3.groupby(['POS'])['Retain_Perc'].mean().sort_values()[::-1]

POS
PART_NEG    0.269545
CONJ        0.239683
PROPN       0.224822
DET         0.224487
PRON_WH     0.222105
PRON        0.200354
ADP         0.192777
NUM         0.190000
ADV         0.178882
X           0.176681
ADJ         0.166202
VERB        0.153838
PART        0.148052
NOUN        0.142551
Name: Retain_Perc, dtype: float64

In [40]:
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
import math

In [41]:
output_notebook()

In [42]:
p = figure(height=400, width=1000, x_range=word_retain_perc_df3.POS.unique())

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.xaxis.axis_label = 'PoS Tag'
p.yaxis.axis_label = 'Probability of retaining meaning'
#

p.vbar(x=word_retain_perc_df3.POS, top=word_retain_perc_df3.Retain_Perc, bottom=0, width=0.5, color='orange')

#p.add_layout(p.legend[0], 'center')

p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"


show(p)

In [47]:
import colorcet as cc
from bokeh.transform import dodge, factor_cmap
from bokeh.io import export_png, export_svg
import random
from bokeh.models import TickFormatter
from bokeh.core.properties import Dict, Int, String

class FixedTickFormatter(TickFormatter):
    """
    Class used to allow custom axis tick labels on a bokeh chart
    Extends bokeh.model.formatters.TickFormatte
    """

    JS_CODE =  """
        import {Model} from "model"
        import * as p from "core/properties"

        export class FixedTickFormatter extends Model
          type: 'FixedTickFormatter'
          doFormat: (ticks) ->
            labels = @get("labels")
            return (labels[tick] ? "" for tick in ticks)
          @define {
            labels: [ p.Any ]
          }
    """

    labels = Dict(Int, String, help="""
    A mapping of integer ticks values to their labels.
    """)

    __implementation__ = JS_CODE
    
palette = random.sample(cc.rainbow,20)

In [48]:
label_dict = {}
for i in years:
    if i in [2016,2021]:
        label_dict[i] = str(i)
    else:
        label_dict[i] = ""

In [102]:
#p.vbar_stack(word_retain_perc_df4.Year.unique(), x=word_retain_perc_df4.POS, top=word_retain_perc_df4.Retain_Perc, bottom=0, width=0.5, legend=)

poses = word_retain_perc_df4.POS.unique()
years = word_retain_perc_df4.Year.unique()

x = [ (pos, str(year)) for pos in poses for year in years]

vals = [word_retain_perc_df4.loc[(word_retain_perc_df4.Year == int(i[1])) & \
                                 (word_retain_perc_df4.POS == i[0]), "Retain_Perc"].iloc[0] for i in x]

source = ColumnDataSource(data=dict(x=x, counts=vals))

p = figure(x_range=FactorRange(*x), height=600, width=1400, title="")


p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

p.yaxis.axis_label = 'Probability of retaining meaning'
p.xaxis.axis_label = 'PoS categories over the years'

p.vbar(x='x', top='counts', width=0.5, source=source, bottom=0, fill_color=factor_cmap('x', palette=palette, factors=[str(i) for i in years], start=1, end=2))

#p.add_layout(p.legend[0], 'center')

p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"

p.xaxis.major_label_orientation = math.pi/4

p.xaxis.axis_label_text_font_size = "20pt"
p.yaxis.axis_label_text_font_size = "20pt"

p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"

#p.xaxis.axis_label_text_font_size = "12pt"
#p.yaxis.axis_label_text_font_size = "12pt"

p.legend.title_text_font_size = '12pt'
 
# Increasing the labels of the box
# to 30pt
p.legend.label_text_font_size = "12pt"

p.output_backend = "svg"
p.below[0].group_text_font_size = '12pt'
#p.xaxis[0].formatter = FixedTickFormatter(labels=label_dict)

export_svg(p, filename="/Users/victor/Downloads/figure5.svg")

show(p)

You are attempting to set `plot.legend.title_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

  p.legend.title_text_font_size = '12pt'
You are attempting to set `plot.legend.label_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

  p.legend.label_text_font_size = "12pt"


In [50]:
word_pos[word_pos.POS == 'PROPN']

Unnamed: 0,Word,POS,id
64,modi,PROPN,7361
81,congress,PROPN,6265
84,narendramodi,PROPN,6075
153,bjp,PROPN,3484
161,pradhaanmantri,PROPN,3280
...,...,...,...
411266,legendactordilipkumar,PROPN,1
411508,likhimpur,PROPN,1
411523,lightwala,PROPN,1
411559,lijjatpapad,PROPN,1


In [107]:
seed_word = 'khan'

all_nets = {}

for i, year in enumerate([2018,2019,2020,2021,2022]):
    
    net = Network(notebook=True)
    G = nx.Graph()

    similar_words = [i[0] for i in models[year].wv.most_similar(seed_word, topn=5)]

    G.add_node(seed_word)

    for word in similar_words:
        if word not in G.nodes():
            G.add_node(word)
            G.add_edge(seed_word,word)

        similar_words2 = [i[0] for i in models[year].wv.most_similar(word, topn=5)]
        for word2 in similar_words2:
            if word2 in G.nodes():
                G.add_edge(word, word2)
    
    all_nets[i] = G
    
    if i == 0:
        for edge in G.edges():
            net.add_node(edge[0], color='blue')
            net.add_node(edge[1], color='blue')

            net.add_edge(edge[0], edge[1])
    else:
        for node in all_nets[i-1].nodes():
            if node not in G.nodes():
                pass
                #net.add_node(node, color='red')
            else:
                net.add_node(node, color='blue')
        
        for edge in G.edges():
            if edge[0] in all_nets[i-1].nodes():
                net.add_node(edge[0], color='blue')
            else:
                net.add_node(edge[0], color='green')
            if edge[1] in all_nets[i-1].nodes():
                net.add_node(edge[1], color='blue')
            else:
                net.add_node(edge[1], color='green')

            net.add_edge(edge[0], edge[1])
    
    net.show('nx2_{}.html'.format(year))

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 


In [52]:
word_retain_perc_df2[word_retain_perc_df2.POS == 'PROPN']

Unnamed: 0,Word,Retain_Perc,POS,id
1,ali,0.656000,PROPN,26
4,arjun,0.640000,PROPN,127
10,khan,0.606667,PROPN,502
14,purab,0.600000,PROPN,38
15,bhutan,0.600000,PROPN,23
...,...,...,...,...
4564,feb,0.000000,PROPN,14
4565,taliban,0.000000,PROPN,265
4581,bharatiya,0.000000,PROPN,19
4587,vivo,0.000000,PROPN,9


In [67]:
dict(models[2018].wv.most_similar('khan', topn=20))

{'khurshid': 0.9835996031761169,
 'sohail': 0.9648109674453735,
 'kapoor': 0.9621816873550415,
 'shahrukh': 0.9596378803253174,
 'deepika': 0.9551153182983398,
 'katrina': 0.9548155665397644,
 'ranveer': 0.9537448287010193,
 'arjun': 0.9531123042106628,
 'shah': 0.9505767226219177,
 'kareena': 0.9502365589141846,
 '&amp;': 0.9484456777572632,
 'rukh': 0.9472284317016602,
 'singh': 0.9464266896247864,
 'and': 0.9454678297042847,
 'bhatt': 0.9454249739646912,
 'sultan': 0.941861629486084,
 'priyanka': 0.9384653568267822,
 'khan,': 0.9360941648483276,
 'for': 0.9347765445709229,
 'alia': 0.9333627223968506}

In [104]:
seed_word = 'government'
similar_word_count = 20

x_label = [2022,2021,2020,2019,2018,2017,2016]
similarity_matrix = np.zeros((len(x_label),similar_word_count))
y_label = [i[0] for i in models[x_label[0]].wv.most_similar(seed_word, topn=similar_word_count)]
similarity_matrix[0,:] = [i[1] for i in models[x_label[0]].wv.most_similar(seed_word, topn=similar_word_count)]

for j, year in enumerate(x_label[1:]):
    similar_words = [i[0] for i in models[year].wv.most_similar(seed_word, topn=similar_word_count)]
    for i, word in enumerate(y_label):
        if word in similar_words:
            similarity_matrix[j+1,i] = dict(models[year].wv.most_similar(seed_word, topn=similar_word_count))[word]
        else:
            similarity_matrix[j+1,i] = 0
            
data = pd.DataFrame(similarity_matrix.T)
data.columns = [str(i) for i in x_label]
data.index = y_label

data['Name'] = data.index.astype(str)
data = data.set_index('Name')
data.columns.name = 'Year'

names = list(data.index)
years = list(data.columns)

# reshape to 1D array or rates with a month and year for each row.
df = pd.DataFrame(data.stack(), columns=['rate']).reset_index()

# this is the colormap from the original NYTimes plot
#colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
#mapper = LinearColorMapper(palette=colors, low=df.rate.min(), high=df.rate.max())

p = figure(title="",
           x_range=names, y_range=years,
           x_axis_location="above", width=900, height=400)

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "7px"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 5

p.rect(x="Name", y="Year", width=1, height=1,
       source=df,
       fill_color={'field': 'rate', 'transform': mapper},
       line_color=None)

color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="12px",
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     label_standoff=6, border_line_color='black')
p.add_layout(color_bar, 'right')

p.xaxis.axis_label_text_font_size = "20pt"
p.yaxis.axis_label_text_font_size = "20pt"

p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"

#p.xaxis.axis_label_text_font_size = "12pt"
#p.yaxis.axis_label_text_font_size = "12pt"

p.legend.title_text_font_size = '12pt'
 
# Increasing the labels of the box
# to 30pt
p.legend.label_text_font_size = "12pt"

#p.output_backend = "svg"

#export_png(p, filename="/Users/victor/Downloads/figure5a.png")

show(p)

You are attempting to set `plot.legend.title_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

  p.legend.title_text_font_size = '12pt'
You are attempting to set `plot.legend.label_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

  p.legend.label_text_font_size = "12pt"


In [105]:
seed_word = 'khan'
similar_word_count = 20

x_label = [2022,2021,2020,2019,2018,2017,2016]
similarity_matrix = np.zeros((len(x_label),similar_word_count))
y_label = [i[0] for i in models[x_label[0]].wv.most_similar(seed_word, topn=similar_word_count)]
similarity_matrix[0,:] = [i[1] for i in models[x_label[0]].wv.most_similar(seed_word, topn=similar_word_count)]

for j, year in enumerate(x_label[1:]):
    similar_words = [i[0] for i in models[year].wv.most_similar(seed_word, topn=similar_word_count)]
    for i, word in enumerate(y_label):
        if word in similar_words:
            similarity_matrix[j+1,i] = dict(models[year].wv.most_similar(seed_word, topn=similar_word_count))[word]
        else:
            similarity_matrix[j+1,i] = 0
            
data = pd.DataFrame(similarity_matrix.T)
data.columns = [str(i) for i in x_label]
data.index = y_label

data['Name'] = data.index.astype(str)
data = data.set_index('Name')
data.columns.name = 'Year'

names = list(data.index)
years = list(data.columns)

# reshape to 1D array or rates with a month and year for each row.
df = pd.DataFrame(data.stack(), columns=['rate']).reset_index()

# this is the colormap from the original NYTimes plot
#colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
#mapper = LinearColorMapper(palette=colors, low=df.rate.min(), high=df.rate.max())

p = figure(title="",
           x_range=names, y_range=years,
           x_axis_location="above", width=900, height=400)

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "7px"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 5

p.rect(x="Name", y="Year", width=1, height=1,
       source=df,
       fill_color={'field': 'rate', 'transform': mapper},
       line_color=None)

color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="12px",
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     label_standoff=6, border_line_color='black')
p.add_layout(color_bar, 'right')

p.xaxis.axis_label_text_font_size = "20pt"
p.yaxis.axis_label_text_font_size = "20pt"

p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"

#p.xaxis.axis_label_text_font_size = "12pt"
#p.yaxis.axis_label_text_font_size = "12pt"

p.legend.title_text_font_size = '12pt'
 
# Increasing the labels of the box
# to 30pt
p.legend.label_text_font_size = "12pt"

#p.output_backend = "svg"

export_png(p, filename="/Users/victor/Downloads/figure5b.png")

show(p)

You are attempting to set `plot.legend.title_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

  p.legend.title_text_font_size = '12pt'
You are attempting to set `plot.legend.label_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

  p.legend.label_text_font_size = "12pt"
