<a href="https://colab.research.google.com/github/tarang1998/ML-AI-DL/blob/main/gen-ai/text_representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Bag of Words**

In [1]:
import numpy as np
import pandas as pd

In [9]:
df = pd.DataFrame({"text":["Tarang is really really awesome person",
                         "He is a awesome and great person",
                         "He is a great pile of shit",
                          "Do not attend his funeral"],"output":[1,1,0,0]})

df


Unnamed: 0,text,output
0,Tarang is really really awesome person,1
1,He is a awesome and great person,1
2,He is a great pile of shit,0
3,Do not attend his funeral,0


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [11]:
# Map each unique word in the corpus to an integer
bow = cv.fit_transform(df['text'])
print(cv.vocabulary_)


{'tarang': 15, 'is': 8, 'really': 13, 'awesome': 2, 'person': 11, 'he': 6, 'and': 0, 'great': 5, 'pile': 12, 'of': 10, 'shit': 14, 'do': 3, 'not': 9, 'attend': 1, 'his': 7, 'funeral': 4}


In [12]:
# Row represents the sentences in the corpus
# Colums represent each unique word
# Better than one-hot encoding - less no of dimensions
# Does not capture the semantic meaning of the text
bow.toarray()


array([[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0, 1],
       [1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
       [0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]])

In [13]:
cv.transform(['Tarang is a beast']).toarray()


array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]])

**N-Grams**

In [14]:
# BI grams
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))
bow = cv.fit_transform(df['text'])
print(cv.vocabulary_)


{'tarang is': 17, 'is really': 11, 'really really': 16, 'really awesome': 15, 'awesome person': 3, 'he is': 7, 'is awesome': 9, 'awesome and': 2, 'and great': 0, 'great person': 5, 'is great': 10, 'great pile': 6, 'pile of': 14, 'of shit': 13, 'do not': 4, 'not attend': 12, 'attend his': 1, 'his funeral': 8}


In [15]:
bow.toarray()


array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1],
       [1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

**TF-IDF (Term frequency- Inverse document frequency)**

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid= TfidfVectorizer()
arr = tfid.fit_transform(df['text']).toarray()
arr

# No zeros are generated in the output - avoiding extra computation


array([[0.        , 0.        , 0.30571917, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.24750601, 0.        ,
        0.        , 0.30571917, 0.        , 0.775532  , 0.        ,
        0.387766  ],
       [0.5067739 , 0.        , 0.39954636, 0.        , 0.        ,
        0.39954636, 0.39954636, 0.        , 0.32346721, 0.        ,
        0.        , 0.39954636, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.36559366, 0.36559366, 0.        , 0.29597957, 0.        ,
        0.46370919, 0.        , 0.46370919, 0.        , 0.46370919,
        0.        ],
       [0.        , 0.4472136 , 0.        , 0.4472136 , 0.4472136 ,
        0.        , 0.        , 0.4472136 , 0.        , 0.4472136 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

**Word2Vec**

In [24]:
import kagglehub
import os
import pandas as pd
import gensim
import os

!pip install --upgrade gensim --user

from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
nltk.download('punkt_tab')




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [29]:
# Download latest version
path = kagglehub.dataset_download("khulasasndh/game-of-thrones-books")
print("Path to dataset files:", path)

files = os.listdir(path)
print("Files in the dataset folder:", files)

story = []

for filename in os.listdir(path):

    if filename.endswith(".txt"):  # Only process text files

        file_path = os.path.join(path, filename)
        print(f"Processing file: {filename}")

        try:
            # Open the file with a specific encoding and handle errors
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                corpus = f.read()
                raw_sent = sent_tokenize(corpus)
                for sent in raw_sent:
                  story.append(simple_preprocess(sent))
        except Exception as e:
            print(f"Error reading file {filename}: {e}")






Path to dataset files: /root/.cache/kagglehub/datasets/khulasasndh/game-of-thrones-books/versions/1
Files in the dataset folder: ['001ssb.txt', '004ssb.txt', '005ssb.txt', '003ssb.txt', '002ssb.txt']
Processing file: 001ssb.txt
Processing file: 004ssb.txt
Processing file: 005ssb.txt
Processing file: 003ssb.txt
Processing file: 002ssb.txt


In [31]:
len(story)

158874

In [30]:
story[1]

['the', 'wildlings', 'are', 'dead']

In [32]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)
model.build_vocab(story)
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)


(6575286, 8625265)

In [33]:
model.wv.most_similar('daenerys')


[('stormborn', 0.8071023225784302),
 ('targaryen', 0.8032286167144775),
 ('unburnt', 0.7723831534385681),
 ('viserys', 0.7209525108337402),
 ('princess', 0.7195443511009216),
 ('myrcella', 0.7098774313926697),
 ('rhaegar', 0.7016276717185974),
 ('elia', 0.692600667476654),
 ('queen', 0.6907302141189575),
 ('aegon', 0.6860072016716003)]

In [34]:
model.wv.similarity('arya','sansa')


0.85599554

In [35]:
model.wv['deep'].shape


(100,)

In [36]:
vec = model.wv.get_normed_vectors()
vec


array([[-0.02807936, -0.012061  , -0.02625382, ..., -0.15034764,
         0.03640831,  0.06736896],
       [ 0.07035901,  0.010056  ,  0.03296717, ..., -0.02854405,
        -0.11097072,  0.12453714],
       [ 0.03313189, -0.11080637, -0.00193292, ...,  0.0191323 ,
         0.13232839, -0.10092854],
       ...,
       [ 0.09787254,  0.15496238, -0.05561205, ...,  0.01260513,
        -0.09236485, -0.00178123],
       [ 0.05950408,  0.14360756,  0.03176214, ..., -0.01065816,
        -0.06090218,  0.01455566],
       [-0.03135958,  0.09034059,  0.00168601, ...,  0.04622844,
         0.02633695, -0.01683958]], dtype=float32)

In [43]:
model.wv.get_normed_vectors().shape


(17869, 100)

In [45]:
y = model.wv.index_to_key
len(y)

17869

In [46]:
# Reduce the dimensions from 100 to 3
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
X = pca.fit_transform(model.wv.get_normed_vectors())
X

array([[-0.20991799,  0.6025897 ,  0.0454001 ],
       [-0.21676531,  0.3469038 , -0.0499628 ],
       [ 0.26117516,  0.5618674 , -0.24978514],
       ...,
       [ 0.36718932, -0.16675675, -0.23843375],
       [ 0.17085674, -0.16734403,  0.17819218],
       [ 0.19571398, -0.23619044,  0.13211982]], dtype=float32)

In [47]:
X.shape


(17869, 3)

In [48]:
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()
