# One Hot Encodings

In [2]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample categorical data
data = np.array([['Red'], ['Green'], ['Blue'], ['Red'], ['Blue']])

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)  # sparse_output=False returns an array

# Fit and transform the data
encoded_data = encoder.fit_transform(data)

In [3]:
encoded_data

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

# Bag of Words (BOW)

In [4]:
import numpy as np
import pandas as pd

In [5]:
df = pd.DataFrame({"text":["people watch dswithbappy",
                         "dswithbappy watch dswithbappy",
                         "people write comment",
                          "dswithbappy write comment"],"output":[1,1,0,0]})

df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [7]:
bow = cv.fit_transform(df['text'])

In [8]:
bow

<4x5 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [9]:
bow.toarray()

array([[0, 1, 1, 1, 0],
       [0, 2, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [1, 1, 0, 0, 1]])

In [10]:
#vocabulary
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'dswithbappy': 1, 'write': 4, 'comment': 0}


# N-Grams

In [11]:
df = pd.DataFrame({"text":["people watch dswithbappy",
                         "dswithbappy watch dswithbappy",
                         "people write comment",
                          "dswithbappy write comment"],"output":[1,1,0,0]})

df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [12]:
# BI grams
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

In [13]:
bow = cv.fit_transform(df['text'])

In [14]:
bow.toarray()

array([[0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1]])

In [15]:
#vocabulary
print(cv.vocabulary_)

{'people watch': 2, 'watch dswithbappy': 4, 'dswithbappy watch': 0, 'people write': 3, 'write comment': 5, 'dswithbappy write': 1}


In [16]:
#Ti gram
# BI grams
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(3,3))

In [17]:
bow = cv.fit_transform(df['text'])

In [18]:
bow.toarray()

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0]])

In [19]:
print(cv.vocabulary_)

{'people watch dswithbappy': 2, 'dswithbappy watch dswithbappy': 0, 'people write comment': 3, 'dswithbappy write comment': 1}


# TF-IDF (Term frequency- Inverse document frequency)

In [20]:
df = pd.DataFrame({"text":["people watch dswithbappy",
                         "dswithbappy watch dswithbappy",
                         "people write comment",
                          "dswithbappy write comment"],"output":[1,1,0,0]})

df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid= TfidfVectorizer()

In [22]:
arr = tfid.fit_transform(df['text']).toarray()

In [23]:
arr

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

# word2vec

data link: https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

gdrive link: https://drive.google.com/file/d/1lbtAwzE7l0otXYFDtGUKKWzI83bD5D5H/view

In [24]:
import numpy as np
import pandas as pd
import gensim
import os

In [25]:
!pip install --upgrade gensim --user



In [26]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [27]:
story = []
for filename in os.listdir('data'):
    if filename == '.ipynb_checkpoints':
      pass
    f = open(os.path.join('data',filename))
    corpus = f.read()
    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [28]:
story

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [29]:
len(story)

8602

In [30]:
story[0]

['game',
 'of',
 'thrones',
 'book',
 'one',
 'of',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'by',
 'george',
 'martin',
 'prologue',
 'we',
 'should',
 'start',
 'back',
 'gared',
 'urged',
 'as',
 'the',
 'woods',
 'began',
 'to',
 'grow',
 'dark',
 'around',
 'them']

In [31]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [32]:
model.build_vocab(story)

In [33]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(322323, 447775)

In [34]:
vec = model.wv.get_normed_vectors()

In [35]:
vec

array([[-0.12185927,  0.0551987 ,  0.1073291 , ..., -0.09734634,
         0.08576265,  0.03854094],
       [-0.11551895,  0.05542772,  0.09900957, ..., -0.09485498,
         0.0822598 ,  0.02856984],
       [-0.07837901,  0.03892113,  0.07392693, ..., -0.08584101,
         0.07599054, -0.000911  ],
       ...,
       [-0.06175379,  0.12318765,  0.11097521, ..., -0.11080048,
         0.07667295,  0.05482703],
       [-0.07496568,  0.07608829,  0.04792746, ..., -0.11651568,
         0.05547886, -0.0139791 ],
       [-0.00883692,  0.10211421,  0.00364725, ..., -0.07165466,
         0.08514991, -0.00611831]], dtype=float32)

In [36]:
len(vec[0])

100

In [37]:
model.wv.most_similar('daenerys')

[('an', 0.999190628528595),
 ('other', 0.9991788268089294),
 ('three', 0.9991729855537415),
 ('dothraki', 0.9991684556007385),
 ('two', 0.9991657137870789),
 ('still', 0.9991631507873535),
 ('while', 0.9991628527641296),
 ('all', 0.9991617798805237),
 ('man', 0.9991515874862671),
 ('bran', 0.9991497993469238)]

In [38]:
model.wv.similarity('arya','sansa')

0.9997466

In [39]:

y = model.wv.index_to_key

In [47]:
y

['the',
 'and',
 'to',
 'he',
 'of',
 'his',
 'was',
 'her',
 'it',
 'in',
 'you',
 'had',
 'she',
 'that',
 'him',
 'with',
 'as',
 'said',
 'not',
 'at',
 'for',
 'on',
 'is',
 'they',
 'but',
 'from',
 'no',
 'were',
 'have',
 'my',
 'jon',
 'all',
 'be',
 'would',
 'them',
 'will',
 'there',
 'ned',
 'bran',
 'your',
 'when',
 'lord',
 'so',
 'are',
 'up',
 'me',
 'could',
 'man',
 'what',
 'king',
 'one',
 'if',
 'out',
 'this',
 'arya',
 'like',
 'been',
 'eyes',
 'back',
 'their',
 'did',
 'looked',
 'ser',
 'tyrion',
 'hand',
 'we',
 'then',
 'do',
 'than',
 'now',
 'down',
 'never',
 'page',
 'by',
 'see',
 'told',
 'boy',
 'father',
 'here',
 'only',
 'sansa',
 'brother',
 'an',
 'catelyn',
 'robb',
 'men',
 'or',
 'into',
 'robert',
 'old',
 'over',
 'even',
 'face',
 'long',
 'more',
 'stark',
 'away',
 'can',
 'black',
 'know',
 'too',
 'off',
 'before',
 'night',
 'time',
 'lady',
 'maester',
 'who',
 'lannister',
 'come',
 'dany',
 'where',
 'how',
 'still',
 'sword',
 '

In [40]:
from sklearn.decomposition import PCA

In [41]:
pca = PCA(n_components=3)

In [42]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [43]:
X

array([[-0.0068028 , -0.20882602, -0.00515228],
       [-0.02204734, -0.1589402 , -0.00405462],
       [-0.05297661,  0.03704819, -0.00201785],
       ...,
       [ 0.07673913,  0.04551347,  0.01411766],
       [-0.00165391, -0.07976355, -0.04995713],
       [ 0.02504677,  0.09981679,  0.02788476]], dtype=float32)

In [45]:
len(X[0])

3

In [46]:
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()