# Getting The Corpus

In [1]:
import numpy as np
import pandas as pd

In [2]:
import sklearn as sk

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
ng = fetch_20newsgroups(subset='all',remove=('headers','footers','quotes'))
documents = ng.data[:10000]

In [5]:
type(documents)

list

In [6]:
len(documents)

10000

In [7]:
import pandas as pd

In [8]:
df = pd.DataFrame(documents)
df.columns = ["text"]

In [9]:
df.head()

Unnamed: 0,text
0,\n\nI am sure some bashers of Pens fans are pr...
1,My brother is in the market for a high-perform...
2,\n\n\n\n\tFinally you said what you dream abou...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,1) I have an old Jasmine drive which I cann...


# Preprocessing Pipeline

## Normalization

In [10]:
from concurrent.futures import ThreadPoolExecutor
import os

In [11]:
def normalize_text(text):
  return text.lower()

In [12]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
  df["text"] = list(pool.map(normalize_text,df["text"]))


In [13]:
df["text"]

Unnamed: 0,text
0,\n\ni am sure some bashers of pens fans are pr...
1,my brother is in the market for a high-perform...
2,\n\n\n\n\tfinally you said what you dream abou...
3,\nthink!\n\nit's the scsi card doing the dma t...
4,1) i have an old jasmine drive which i cann...
...,...
9995,gk>i hear that tires for this car can get real...
9996,"\ntod, i think you've misspoke. if they're ba..."
9997,\n\n\n\n\n\nyep! sounds good to me. suggestion...
9998,anyone familiar with this video card? what chi...


## Fixing Contractions

In [14]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [15]:
import contractions
import re

In [16]:
def fix_contractions(text):
  return contractions.fix(text)

In [17]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
  df["text"] = list(pool.map(fix_contractions,df["text"]))

# Removing Noisy Tokens

In [18]:
def remove_noisy_tokens(text):

    return re.sub(pattern=r'@[a-zA-Z0-9 ]+|#[a-zA-Z0-9 ]+|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+',
                 string=text,repl=" ")


In [19]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
  df["text"] = list(pool.map(remove_noisy_tokens,df["text"]))

In [20]:
df["text"]

Unnamed: 0,text
0,i am sure some bashers of pens fans are prett...
1,my brother is in the market for a high perform...
2,finally you said what you dream about mediter...
3,think it is the scsi card doing the dma trans...
4,i have an old jasmine drive which i cannot u...
...,...
9995,gk i hear that tires for this car can get real...
9996,tod i think you have misspoke if they are ban...
9997,yep sounds good to me suggestion sci electron...
9998,anyone familiar with this video card what chip...


## **Tokenization**

In [21]:
import nltk



In [22]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [23]:
from nltk import word_tokenize,sent_tokenize

In [24]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

  df["text"] = list(pool.map(word_tokenize,df["text"]))

In [25]:
df["text"]

Unnamed: 0,text
0,"[i, am, sure, some, bashers, of, pens, fans, a..."
1,"[my, brother, is, in, the, market, for, a, hig..."
2,"[finally, you, said, what, you, dream, about, ..."
3,"[think, it, is, the, scsi, card, doing, the, d..."
4,"[i, have, an, old, jasmine, drive, which, i, c..."
...,...
9995,"[gk, i, hear, that, tires, for, this, car, can..."
9996,"[tod, i, think, you, have, misspoke, if, they,..."
9997,"[yep, sounds, good, to, me, suggestion, sci, e..."
9998,"[anyone, familiar, with, this, video, card, wh..."


In [26]:
print(len(df["text"][0]))
print(type(df["text"][0]))

138
<class 'list'>


## **Removing StopWords**

In [27]:
import spacy

In [28]:
from spacy.lang.en import STOP_WORDS as spacy_st
from nltk.corpus import stopwords

In [29]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [30]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [31]:
nlp = spacy.load("en_core_web_md",disable=["ner","parser"])


In [32]:
combined_st = set(stopwords.words('english')).union(set(spacy_st))

In [33]:
len(combined_st)

401

In [34]:
def is_stopword(token):

  return token not in combined_st


In [35]:
def remove_stopwords(tokenized_text):

  return [token for token  in tokenized_text if is_stopword(token)]

In [36]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
  df["text"] = list(pool.map(remove_stopwords,df["text"]))

In [37]:
df['text']

Unnamed: 0,text
0,"[sure, bashers, pens, fans, pretty, confused, ..."
1,"[brother, market, high, performance, video, ca..."
2,"[finally, said, dream, mediterranean, new, are..."
3,"[think, scsi, card, dma, transfers, disks, scs..."
4,"[old, jasmine, drive, use, new, system, unders..."
...,...
9995,"[gk, hear, tires, car, expensive, gk, currentl..."
9996,"[tod, think, misspoke, banking, owning, motorc..."
9997,"[yep, sounds, good, suggestion, sci, electroni..."
9998,"[familiar, video, card, chipset, winjet, use, ..."


## **Lemmatization**

In [38]:
import tqdm

In [39]:
def lemmatize_text(tokenized_text):

  raw_text = " ".join(tokenized_text)
  doc = nlp(raw_text)
  lemmatized_text = []

  for token in doc:
    lemmatized_text.append(token.lemma_)

  return lemmatized_text

In [40]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    df["text"] = list(pool.map(lemmatize_text, (df["text"])))

In [41]:
df['text']

Unnamed: 0,text
0,"[sure, basher, pen, fan, pretty, confused, lac..."
1,"[brother, market, high, performance, video, ca..."
2,"[finally, say, dream, mediterranean, new, area..."
3,"[think, scsi, card, dma, transfer, disk, scsi,..."
4,"[old, jasmine, drive, use, new, system, unders..."
...,...
9995,"[gk, hear, tire, car, expensive, gk, currently..."
9996,"[tod, think, misspoke, banking, own, motorcycl..."
9997,"[yep, sound, good, suggestion, sci, electronic..."
9998,"[familiar, video, card, chipset, winjet, use, ..."


In [42]:
converted_raw_text = list(df['text'].apply(lambda x:" ".join(x)))

In [43]:
converted_raw_text  = list(filter(lambda x: len(x) >0 , converted_raw_text))

In [44]:
len(converted_raw_text)

9705

In [45]:
converted_raw_text

['sure basher pen fan pretty confused lack kind post recent pen massacre devil actually bit puzzled bit relieved go end non pittsburgher relief bit praise pen man kill devil worse think jagr show well regular season stat lot fo fun watch playoff bowman let jagr lot fun couple game pen go beat pulp jersey disappointed islander lose final regular season game pen rule',
 'brother market high performance video card support vesa local bus mb ram suggestion idea diamond stealth pro local bus orchid farenheit ati graphic ultra pro high performance vlb card post email thank matt',
 'finally say dream mediterranean new area great year like holocaust number july usa sweden april cold change calendar mention true let true shall azeri woman child go pay price rape kill torture armenians hearde call geneva convention facist ohhh forget armenians fight forget killing raping torture kurds turks time ohhhh swedish redcross worker lie regional killer like person shoot policy l confuse search turkish pl

In [46]:
vocab = set()

for cleaned_text in converted_raw_text:
  vocab.update(set(cleaned_text.split(" ")))

In [47]:
len(vocab)

57634

In [48]:
vocab_list = list(vocab)

for i in range(len((vocab_list))):

  if i > 500:

    break

  print(vocab_list[i])

unheated
mason
dique
toy
cxhh
wavedit
supervise
nonintuitive
ofm
athula
moniker
paulson
uoveid
uwc
slickster
enlite
renegotiate
sakic
ephemeral
dns
vek
tue
ux
wizvax
sharpen
ridicule
saddam
sldenton
highend
nat
clumsily
catalina
qualifier
unison
cobra
racecar
hons
entanglements
friction
lyddy
brad
endpoint
cryptologia
class
vjjjj
qjw
herbison
dawe
purile
mandir
shingle
mvps
scoop
kbriggs
mjjjjjjjjjjjjjjj
w
ahlund
juelich
furriner
planitia
precious
exercise
methodological
bullpen
pronouns
poisonous
reluctance
jackass
ryxi
sla
mbgf
teriyaki
jzpkjz
rull
retaliate
stuppid
architeture
bitzm
dx
gsm
skyrocket
itti
pluto
thuy
unattacked
scucsy
spidery
grich
pornographer
phenomenom
melchizedek
mbecsbn
kohlmaas
accsys
ministries
bhjnuy
nslpi
astrup
vanbiesbrouk
visch
isreal
fph
footage
uptodate
qog
alternately
pawn
eup
daydream
casette
snell
reportedly
ethan
mowtu
alfonso
pinko
barnaby
czar
samx
rekindle
sisrfw
shoebox
segate
edd
xellf
preconfigured
cyclist
landlord
lj
pollute
intersecting
bottl

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer()
vectorized_text = vec.fit_transform(converted_raw_text)

In [50]:
vectorized_text.shape

(9705, 57611)

In [51]:
from collections import Counter

In [52]:
cumulative_tfs = Counter()
for cleaned_text in df["text"]:

    cumulative_tfs.update(cleaned_text)

In [53]:
most_frequent_tokens = cumulative_tfs.most_common(5000)
most_frequent_tokens = dict(most_frequent_tokens)
truncated_vocab = list(most_frequent_tokens.keys())

truncated_vocab2idx = dict(zip(truncated_vocab,range(len(truncated_vocab))))

In [54]:
vec = TfidfVectorizer(vocabulary=truncated_vocab2idx)
vectorized_text = vec.fit_transform(converted_raw_text)



In [55]:
vectorized_text.shape

(9705, 5000)

In [56]:
vectorized_text[:500,1:5]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 0 stored elements and shape (500, 4)>

In [57]:
import numpy as np

In [58]:
E = np.matmul(((vectorized_text.toarray()).T),(vectorized_text.toarray()))

In [59]:
E.shape

(5000, 5000)

In [61]:
from sklearn.decomposition import TruncatedSVD
import numpy as np

svd = TruncatedSVD(n_components=5000)
E_reduced = svd.fit_transform(E)


singular_values = svd.singular_values_
explained_variance_ratio = svd.explained_variance_ratio_



In [63]:
print(len(singular_values))

5000


In [64]:
print(explained_variance_ratio)

[2.51637429e-01 4.44162839e-02 1.96197137e-02 ... 1.58807898e-31
 1.89257773e-32 2.26056893e-31]


In [65]:

cumulative_variance = np.cumsum(explained_variance_ratio)

num_components = np.searchsorted(cumulative_variance, 0.95) + 1

print(f"Number of components to retain 95% variance: {num_components}")


Number of components to retain 95% variance: 1120


In [66]:
svd = TruncatedSVD(n_components=num_components)
E_reduced = svd.fit_transform(E)

In [71]:
E_reduced.shape

(5000, 1120)

In [69]:
singular_values = svd.singular_values_

explained_variance_ratio = svd.explained_variance_ratio_

cumulative_variance = explained_variance_ratio.cumsum()

In [70]:
print(cumulative_variance)

[0.25163743 0.29605371 0.31567343 ... 0.94945133 0.94952318 0.94959477]


## **Getting The Word Embeddings**

## Vocabulary Alignment

In [72]:
words = list(truncated_vocab2idx.keys())

In [73]:
words

['ax',
 'x',
 'w',
 'q',
 'f',
 'v',
 'g',
 'c',
 'p',
 'know',
 'r',
 'b',
 'people',
 'like',
 'e',
 'think',
 'max',
 'time',
 'use',
 'k',
 'l',
 'n',
 'z',
 'good',
 'say',
 'h',
 'work',
 'new',
 'year',
 'go',
 'file',
 'u',
 'way',
 'edu',
 'system',
 'want',
 'come',
 'right',
 'look',
 'j',
 'get',
 'thing',
 'find',
 'problem',
 'need',
 'program',
 'try',
 'god',
 'run',
 'include',
 'point',
 'question',
 'mean',
 'post',
 'number',
 'tell',
 'window',
 'believe',
 'read',
 'drive',
 'well',
 'bit',
 'help',
 'com',
 'day',
 'start',
 'ask',
 'case',
 'state',
 'key',
 'give',
 'write',
 'long',
 'follow',
 'information',
 'support',
 'line',
 'image',
 'send',
 'see',
 'government',
 'take',
 'thank',
 'list',
 'set',
 'let',
 'available',
 'man',
 'mail',
 'etc',
 'change',
 'law',
 'power',
 'game',
 'group',
 'high',
 'call',
 'fact',
 'version',
 'sure',
 'base',
 'lot',
 'book',
 'word',
 'source',
 'hear',
 'space',
 'end',
 'old',
 'second',
 'software',
 'place',


In [74]:
word_embeddings = {word: E_reduced[i] for i , word in enumerate(words)}

In [75]:
word_embeddings

{'ax': array([ 0.04449443,  0.04641434,  0.00029172, ..., -0.00305546,
        -0.00250259,  0.00845448]),
 'x': array([0., 0., 0., ..., 0., 0., 0.]),
 'w': array([0., 0., 0., ..., 0., 0., 0.]),
 'q': array([0., 0., 0., ..., 0., 0., 0.]),
 'f': array([0., 0., 0., ..., 0., 0., 0.]),
 'v': array([0., 0., 0., ..., 0., 0., 0.]),
 'g': array([0., 0., 0., ..., 0., 0., 0.]),
 'c': array([0., 0., 0., ..., 0., 0., 0.]),
 'p': array([0., 0., 0., ..., 0., 0., 0.]),
 'know': array([ 2.92857178e+01,  1.20965832e-02, -2.06086761e+00, ...,
        -1.73294265e-03,  7.16829752e-03,  9.52674491e-03]),
 'r': array([0., 0., 0., ..., 0., 0., 0.]),
 'b': array([0., 0., 0., ..., 0., 0., 0.]),
 'people': array([ 2.38229976e+01, -1.03000190e+01, -2.57209489e+00, ...,
        -6.93837345e-03,  1.47236182e-03,  1.75318629e-02]),
 'like': array([ 2.61654330e+01,  5.25011575e-02,  1.59304293e+00, ...,
         2.94476475e-03, -2.02676666e-02,  9.10948476e-03]),
 'e': array([0., 0., 0., ..., 0., 0., 0.]),
 'think'

## Retrieving Embeddings for any word in vocabulary

In [79]:
input_word = input()

if input_word in word_embeddings:

  embedding = word_embeddings.get(input_word,"Word not found")

print(f"Embedding for {input_word} : {embedding}")

tend
Embedding for tend : [ 2.17435617 -0.57647185  0.16325718 ...  0.03891914  0.01458892
 -0.0107057 ]


## **Checking for Similar Words Using Cosine Similarity**

In [80]:
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
def get_similar_words(target:str,top_n=5)->str:

  if target not in word_embeddings:

    return "Word not found in vocabulary"

  target_vector = word_embeddings[target].reshape(1,-1)
  similarities = cosine_similarity(target_vector,E_reduced)[0]

  sorted_indices = similarities.argsort()[::-1][1:top_n+1]
  similar_words = [words[i] for i in sorted_indices]

  return similar_words

## Testing the function

In [86]:
print(get_similar_words("conclusion"))

['context', 'argument', 'conclude', 'present', 'truth']


In [None]:
I