Name: Suman Roy

Roll Number: MDS202041

NLP Assignment 02

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import numpy.matlib as mat
import scipy
from scipy.sparse.linalg import svds
from scipy import linalg
from gensim.models import Word2Vec, word2vec
import logging
import gensim

### Importing the words 

Here I have used top 14000 words(in terms of frequency) as described in the paper for calculating the word vectors. The words are being imported from the vocabulary-frequency-rank table that was generated in Assignment 1. After importing the top 14000 words, we have sorted them alphabetically. 

In [None]:
def import_vocab(file_path, top_words):
  df=pd.read_csv(file_path)
  words=list(df['Word'].astype(str)[0:top_words])
  words.sort()
  return words


In [None]:
file_path='/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 1/frequency.csv'
top_words=14000
vocab_list= import_vocab(file_path,top_words)

The preprocessed and lemmatized corpus (whole) is being imported. The corpus is then tokenized for further use. 

In [None]:
def import_token(file_path):
  with open(file_path,'r') as f:
    text=f.read()
  text_list=text.split("\n")
  token=[]
  for text in text_list:
    token.extend(text.split(" "))
  return token


In [None]:
file_path='/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 1/corpus_english_lemma.txt'
token_list=import_token(file_path)

Total number of words/tokens present in the corpus

In [None]:
len(token_list)

114619560

### Implementing the COALS algorithm

**Cooccurrence Matrix** 

Gathering co-occurrence counts, typically ignoring
closed-class neighbors and using a ramped, size 4
window

In [None]:
def cooccurrence_matrix(vocab_list, token_list, window=4):
    idx={}
    i=0
    for vocab in vocab_list:
      idx[vocab]=i
      i=i+1
    mat=np.zeros((len(vocab_list),len(vocab_list)), np.float32)
    for i in tqdm(range(len(token_list))):
      try:
        x = idx[token_list[i]]
        count = window
        if i>(len(token_list)-window):  #when the word is near the end(window size=4) of the token list
          for word in token_list[i+1:len(token_list)]:
              try:
                  y = idx[word]
                  mat[x, y] +=count     #incrementing the corresponding element of cooccurrence matrix
                  mat[y, x] +=count
              except:
                  continue
              count -= 1                #decreasing the count value for next iteration
        else:
          for word in token_list[i+1:i+1+window]:
              try:
                  y = idx[word]
                  mat[x, y] +=count
                  mat[y, x] +=count
              except:
                  continue
              count -= 1
      except:
          continue
    return mat

In [None]:
matrix=cooccurrence_matrix(vocab_list,token_list)

100%|██████████| 114619560/114619560 [44:47<00:00, 42643.98it/s]


Saving the matrix for future use

In [None]:
np.save('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/coocc', matrix)

Loading the cooccurrence matrix and converting into numpy array for further operation

In [None]:
matrix=np.load('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/coocc.npy')

**Correlation Matrix**

Converting counts to word pair correlations, setting negative
values to 0, and taking square roots of positive ones. 

In [None]:
def correlation_matrix(matrix):
  sum=np.sum(matrix)
  a=mat.repmat(np.sum(matrix, axis=1), matrix.shape[0], 1).T       #row wise sum
  b=mat.repmat(np.sum(matrix, axis=1), matrix.shape[0], 1)         #column wise sum
  #calculating the correlation values
  matrix_2 = (sum * matrix - a * b) / np.sqrt(a * (sum - a) * b * (sum - b))    
  matrix_2[np.where(matrix_2<0)]=0      #making the negative values 0
  matrix_2= np.sqrt(matrix_2)           #Taking square roots of the positive values
  np.save('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/corr', matrix_2)
  return matrix_2

In [None]:
matrix2=correlation_matrix(matrix)

  


Saving the correlation matrix for future use

In [None]:
np.save('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/corel', matrix2)

### Using SVD to reduce word vector size to 50

In [None]:
def singular_value_decomposition(array, k=50):
  u, sigma, v = svds(matrix, k=k)
  i=np.linalg.inv(np.diag(sigma))
  svd = array @ v.T @ i
  np.save('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/svd', svd)
  return svd

In [None]:
svd_matrix=singular_value_decomposition(matrix2)

### Computing Word vectors

In [None]:
def word_2_vec(vocab,svd_matrix,corpus_file_path, frequency_dict, no_of_tokens, size=50, epochs=20):
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  text = word2vec.Text8Corpus(corpus_file_path)
  vocab_list = {k:v for v,k in enumerate(vocab)}
  model = word2vec.Word2Vec(vector_size=size, window=5, min_count=1, workers=10)
  model.build_vocab_from_freq(frequency_dict)  #building the vocab from frequency dictionary of top 14000 words
  i = []
  for x in model.wv.key_to_index :
    i.append(x)
  model.wv.vectors = svd_matrix[i,:]           #taking the corresponding word vector
  model.train(text, total_words = no_of_tokens, epochs = epochs)
  model.save('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/word2vec.bin')

For building the word2vec model we will need the frequency of the vocabulary. For that we are importing the frequencies of the top 14000 words from the vocabulary-frequency table generated in Assignment 1. 

In [None]:
def import_freq(file_path, top_words):
  df=pd.read_csv(file_path)
  df=df.drop(['Unnamed: 0','Rank'],axis=1)
  words=list(df['Word'].astype(str)[0:top_words])
  frequency=list(df['Frequency'].astype(int)[0:top_words])
  res = dict(zip(words, frequency))
  return res

In [None]:
file_path='/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 1/frequency.csv'
top_words=14000

In [None]:
vocab= import_vocab(file_path,top_words)
svd_matrix=np.load('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/svd.npy')
corpus_file_path='/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 1/corpus_english_lemma.txt'
frequency_dict=import_freq(file_path,top_words)
token_list=import_token(corpus_file_path)
no_of_tokens=len(token_list)

In [None]:
word_2_vec(vocab,svd_matrix,corpus_file_path, frequency_dict, no_of_tokens)

2022-05-18 11:03:39,822 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=50, alpha=0.025)', 'datetime': '2022-05-18T11:03:39.822810', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2022-05-18 11:03:39,828 : INFO : Processing provided word frequencies
2022-05-18 11:03:39,832 : INFO : collected 13999 unique word types, with total frequency of 107653420
2022-05-18 11:03:39,836 : INFO : Creating a fresh vocabulary
2022-05-18 11:03:39,900 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 13999 unique words (100.0%% of original 13999, drops 0)', 'datetime': '2022-05-18T11:03:39.900655', 'gensim': '4.1.2', 'python': '3.7.13 (default, Apr 24 2022, 01:04:09) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2022-05-18 11:03:39,902 : INFO : Word2Vec lifecycle event {'msg': '

### Showing similarities of different words

Cosine distance score is also shown along with the word.

In [3]:
loaded_model = word2vec.Word2Vec.load('/content/drive/MyDrive/CMI/Semester 4/NLP/Assignment 2/word2vec.bin')

In [4]:
loaded_model.wv.most_similar('heart')

[('cardiac', 0.783564567565918),
 ('congestive', 0.7562394738197327),
 ('kidney', 0.7221207618713379),
 ('renal', 0.6990090012550354),
 ('diastolic', 0.6984332203865051),
 ('systolic', 0.6946483254432678),
 ('cardiovascular', 0.6914125084877014),
 ('ventricular', 0.6861984133720398),
 ('coronary', 0.6860154867172241),
 ('cerebrovascular', 0.6758089661598206)]

In [5]:
loaded_model.wv.most_similar('viral')

[('virus', 0.72967529296875),
 ('hcv', 0.6757947206497192),
 ('hcmv', 0.6516889929771423),
 ('iav', 0.6453876495361328),
 ('hrsv', 0.6335877180099487),
 ('hdv', 0.6121329665184021),
 ('viruses', 0.6093374490737915),
 ('coronaviral', 0.5980815887451172),
 ('hbv', 0.5945049524307251),
 ('hsv', 0.5907920002937317)]

In [6]:
loaded_model.wv.most_similar('covid')

[('pandemic', 0.6143938899040222),
 ('confirmed', 0.5901299715042114),
 ('ncov', 0.5846263766288757),
 ('epicenter', 0.5828682780265808),
 ('evd', 0.5725947618484497),
 ('hospitalize', 0.5537525415420532),
 ('hospitalise', 0.5531996488571167),
 ('case', 0.548512876033783),
 ('midst', 0.5446000099182129),
 ('patient', 0.5445428490638733)]

In [7]:
loaded_model.wv.most_similar('cold')

[('chill', 0.7110450863838196),
 ('cough', 0.6437053680419922),
 ('winter', 0.614120602607727),
 ('urticaria', 0.609681248664856),
 ('warm', 0.6082454919815063),
 ('rhinorrhea', 0.5972345471382141),
 ('agitation', 0.5842854380607605),
 ('ailment', 0.5827311873435974),
 ('rhinitis', 0.5825480222702026),
 ('sore', 0.5798322558403015)]

In [8]:
loaded_model.wv.most_similar('lockdown')

[('confinement', 0.8739829063415527),
 ('lockdowns', 0.8575223684310913),
 ('shutdown', 0.8300986886024475),
 ('curfew', 0.8289132118225098),
 ('reopen', 0.8229773640632629),
 ('lift', 0.7972257137298584),
 ('lock', 0.7767547369003296),
 ('quarantine', 0.7496818900108337),
 ('imposition', 0.7469546794891357),
 ('npis', 0.7274476289749146)]

In [9]:
loaded_model.wv.most_similar('lung')

[('pulmonary', 0.8010280728340149),
 ('lungs', 0.7855331897735596),
 ('alveolar', 0.7081800103187561),
 ('intrapulmonary', 0.6906548142433167),
 ('airways', 0.6842151284217834),
 ('parenchymal', 0.6834618449211121),
 ('airway', 0.6828495860099792),
 ('bronchial', 0.682693362236023),
 ('bronchi', 0.6560800075531006),
 ('liver', 0.6528838872909546)]

In [10]:
loaded_model.wv.most_similar('immunity')

[('humoral', 0.8019682168960571),
 ('reinfection', 0.7462826371192932),
 ('immune', 0.7384793758392334),
 ('response', 0.706425666809082),
 ('protection', 0.6853342056274414),
 ('defence', 0.672511637210846),
 ('innate', 0.6704187393188477),
 ('responses', 0.6579158902168274),
 ('defense', 0.6517399549484253),
 ('immunologically', 0.6497045159339905)]

In [11]:
loaded_model.wv.most_similar('mortality')

[('incidence', 0.7872765064239502),
 ('fatality', 0.7764528393745422),
 ('lethality', 0.7539899945259094),
 ('morbidity', 0.7529518008232117),
 ('death', 0.7231834530830383),
 ('hospitalisation', 0.7110221982002258),
 ('hospitalization', 0.6924735903739929),
 ('readmission', 0.6820756793022156),
 ('survival', 0.625756025314331),
 ('cfr', 0.6166366338729858)]

In [12]:
loaded_model.wv.most_similar('virus')

[('viruses', 0.8439912796020508),
 ('paramyxovirus', 0.7730075120925903),
 ('reovirus', 0.7610743641853333),
 ('poliovirus', 0.7383607625961304),
 ('viral', 0.7296753525733948),
 ('arenavirus', 0.7191992402076721),
 ('flavivirus', 0.7011666893959045),
 ('filovirus', 0.693818986415863),
 ('coronaviruses', 0.6937196254730225),
 ('iav', 0.6817483305931091)]

In [13]:
loaded_model.wv.most_similar('economic')

[('economy', 0.8404459357261658),
 ('societal', 0.7840046286582947),
 ('livelihood', 0.759036660194397),
 ('financial', 0.7444256544113159),
 ('tourism', 0.7330431938171387),
 ('capital', 0.7220333814620972),
 ('globalization', 0.7201444506645203),
 ('downturn', 0.7171493172645569),
 ('poverty', 0.712391197681427),
 ('political', 0.7024533152580261)]