# **Tokenization**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy corpus: four three-word documents, each paired with a 0/1 `output` value.
df = pd.DataFrame(
    data={
        'text': [
            'people watch campusx',
            'campusx watch campusx',
            'people write comment',
            'campusx write comment',
        ],
        'output': [1, 1, 0, 0],
    }
)

In [None]:
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [None]:
import nltk
# Fetch the `punkt_tab` tokenizer data into the local nltk_data directory
# (presumably what sent_tokenize / word_tokenize below rely on — confirm).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# One list of sentences per document, in the same order as df['text'].
sents = [sent_tokenize(document) for document in df['text']]

In [None]:
sents

[['people watch campusx'],
 ['campusx watch campusx'],
 ['people write comment'],
 ['campusx write comment']]

In [None]:
# Flat list of word tokens across every document (document boundaries are dropped).
words = [token for document in df['text'] for token in word_tokenize(document)]

In [None]:
words

['people',
 'watch',
 'campusx',
 'campusx',
 'watch',
 'campusx',
 'people',
 'write',
 'comment',
 'campusx',
 'write',
 'comment']

# **Lemmatization**

In [None]:
# Corpus for the lemmatization demo: the sentences mix inflected word forms
# (watching/watched, write/wrote, dancing/dance, comments/comment).
df = pd.DataFrame(
    data={
        'text': [
            'people watching campusx watched',
            'campusx running saw campusx',
            'people write wrote comments',
            'campusx dancing dance liked comment',
        ],
        'output': [1, 1, 0, 0],
    }
)

In [None]:
df

Unnamed: 0,text,output
0,people watching campusx watched,1
1,campusx running saw campusx,1
2,people write wrote comments,0
3,campusx dancing dance liked comment,0


In [None]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; `lemma_words` below calls its `lemmatize` method.
lemma = WordNetLemmatizer()
# Download the `wordnet` package (presumably the data the lemmatizer reads — confirm).
# Kept as the last expression so the cell displays the download's return value.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def lemma_words(text, pos='v'):
  """Lemmatize every whitespace-separated word of ``text``.

  Parameters
  ----------
  text : str
      Input sentence; it is split on whitespace, so runs of spaces collapse.
  pos : str, optional
      Part-of-speech tag forwarded to ``lemma.lemmatize`` for every word.
      Defaults to ``'v'``, the value that was previously hard-coded, so
      existing calls such as ``df['text'].apply(lemma_words)`` are unchanged.

  Returns
  -------
  str
      The lemmatized words joined back together with single spaces.
  """
  # The same tag is applied to every word; no per-word POS tagging is done.
  return " ".join(lemma.lemmatize(word, pos=pos) for word in text.split())

In [None]:
# Add a `lemma_text` column holding lemma_words(text) for each row (modifies df in place).
df['lemma_text'] = df['text'].apply(lemma_words)

In [None]:
df

Unnamed: 0,text,output,lemma_text
0,people watching campusx watched,1,people watch campusx watch
1,campusx running saw campusx,1,campusx run saw campusx
2,people write wrote comments,0,people write write comment
3,campusx dancing dance liked comment,0,campusx dance dance like comment


# **Bag of Words**

In [None]:
# Re-create the toy corpus for this section. `df` was rebound to the
# lemmatization example above (different sentences plus a `lemma_text` column),
# so on a fresh top-to-bottom run the vectorizers in the following sections
# would otherwise be fitted on that data instead of the corpus shown here.
df = pd.DataFrame({
    'text': [
        'people watch campusx',
        'campusx watch campusx',
        'people write comment',
        'campusx write comment',
    ],
    'output': [1, 1, 0, 0],
})
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count vectorizer with all-default settings; it is fitted on df['text'] in the next cell.
cv = CountVectorizer()

In [None]:
bow = cv.fit_transform(df['text'])

In [None]:
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [None]:
# Dense count vectors of the first two documents, one per line.
for row_idx in range(2):
    print(bow[row_idx].toarray())

[[1 0 1 1 0]]
[[2 0 0 1 0]]


In [None]:
# Encode a new sentence with the already-fitted vectorizer (transform only, no refit).
# "and" is not among the five terms in cv.vocabulary_, so it adds no count.
cv.transform(["campusx watch and write comment"]).toarray()

array([[1, 1, 0, 1, 1]])

# **N-grams or Bag of N-grams**

In [None]:
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


### **Bi-gram**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2, 2): features are two-word sequences only.
# Rebinds `cv`, replacing the vectorizer created in the Bag of Words section.
cv = CountVectorizer(ngram_range = (2,2))

In [None]:
bi_gram = cv.fit_transform(df['text'])

In [None]:
print(cv.vocabulary_)

{'people watch': 2, 'watch campusx': 4, 'campusx watch': 0, 'people write': 3, 'write comment': 5, 'campusx write': 1}


In [None]:
print(bi_gram[0].toarray())

[[0 0 1 0 1 0]]


### **Bi-gram and uni-gram**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(1, 2): features are single words plus two-word sequences.
# Rebinds `cv` again, replacing the bi-gram vectorizer.
cv = CountVectorizer(ngram_range = (1,2))

In [None]:
gram = cv.fit_transform(df['text'])

In [None]:
print(cv.vocabulary_)

{'people': 4, 'watch': 7, 'campusx': 0, 'people watch': 5, 'watch campusx': 8, 'campusx watch': 1, 'write': 9, 'comment': 3, 'people write': 6, 'write comment': 10, 'campusx write': 2}


In [None]:
print(gram[0].toarray())

[[1 0 0 0 1 1 0 1 1 0 0]]


### **Tri-gram**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(3, 3): features are three-word sequences only; each document in
# this corpus has exactly three words, so each yields a single feature.
cv = CountVectorizer(ngram_range = (3,3))

In [None]:
tri_gram = cv.fit_transform(df['text'])

In [None]:
print(cv.vocabulary_)

{'people watch campusx': 2, 'campusx watch campusx': 0, 'people write comment': 3, 'campusx write comment': 1}


In [None]:
# Dense tri-gram vector of each of the four documents, one per line.
for row_idx in range(4):
    print(tri_gram[row_idx].toarray())

[[0 0 1 0]]
[[1 0 0 0]]
[[0 0 0 1]]
[[0 1 0 0]]


# **Tf-idf**

In [None]:
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with all-default settings; it is fitted on df['text'] in the next cell.
tfidf = TfidfVectorizer()

In [None]:
tf = tfidf.fit_transform(df['text'])

In [None]:
print(tfidf.get_feature_names_out())

['campusx' 'comment' 'people' 'watch' 'write']


In [None]:
print(tfidf.idf_)

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]


In [None]:
# Dense TF-IDF vector of each of the four documents, one per line.
for row_idx in range(4):
    print(tf[row_idx].toarray())

[[0.49681612 0.         0.61366674 0.61366674 0.        ]]
[[0.8508161  0.         0.         0.52546357 0.        ]]
[[0.         0.57735027 0.57735027 0.         0.57735027]]
[[0.49681612 0.61366674 0.         0.         0.61366674]]


In [None]:
# Whole TF-IDF matrix as a dense array. Reuses `tf`, the result of the earlier
# tfidf.fit_transform(df['text']) call, instead of fitting the vectorizer a second time.
tf.toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

# **Word2Vec**

In [None]:
# Toy corpus for the Word2Vec section: four documents and their 0/1 `output` value.
df = pd.DataFrame(
    dict(
        text=[
            'people watch campusx',
            'campusx watch campusx',
            'people write comment',
            'campusx write comment',
        ],
        output=[1, 1, 0, 0],
    )
)

In [None]:
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [None]:
# Plain Python list of the documents, in row order. Taking the column directly
# avoids building a Series per row with iterrows() just to read one field.
text = df['text'].tolist()

In [None]:
text

['people watch campusx',
 'campusx watch campusx',
 'people write comment',
 'campusx write comment']

In [None]:
import gensim

from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [None]:
# Tokenize each document with gensim's simple_preprocess: one token list per document.
story = [simple_preprocess(document) for document in text]

In [None]:
story

[['people', 'watch', 'campusx'],
 ['campusx', 'watch', 'campusx'],
 ['people', 'write', 'comment'],
 ['campusx', 'write', 'comment']]

In [None]:
len(story)

4

In [None]:
# Word2Vec model created without a corpus; the vocabulary is built and the
# model trained in the following cells. Only `window` and `min_count` are set
# explicitly, every other option keeps its gensim default.
w2v_kwargs = dict(window=1, min_count=2)
model = gensim.models.Word2Vec(**w2v_kwargs)

In [None]:
model.build_vocab(story)

In [None]:
# Train on the tokenized documents, passing the corpus size and epoch count
# already stored on the model. The displayed tuple is train()'s return value
# (presumably effective vs. raw word counts — confirm against the gensim docs).
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(3, 60)

In [None]:
# Total unique words in trained model

len(model.wv.index_to_key)

5

In [None]:
model.wv['people']

array([-0.00713902,  0.00124103, -0.00717672, -0.00224462,  0.0037193 ,
        0.00583312,  0.00119818,  0.00210273, -0.00411039,  0.00722533,
       -0.00630704,  0.00464722, -0.00821997,  0.00203647, -0.00497705,
       -0.00424769, -0.00310898,  0.00565521,  0.0057984 , -0.00497465,
        0.00077333, -0.00849578,  0.00780981,  0.00925729, -0.00274233,
        0.00080022,  0.00074665,  0.00547788, -0.00860608,  0.00058446,
        0.00686942,  0.00223159,  0.00112468, -0.00932216,  0.00848237,
       -0.00626413, -0.00299237,  0.00349379, -0.00077263,  0.00141129,
        0.00178199, -0.0068289 , -0.00972481,  0.00904058,  0.00619805,
       -0.00691293,  0.00340348,  0.00020606,  0.00475375, -0.00711994,
        0.00402695,  0.00434743,  0.00995737, -0.00447374, -0.00138926,
       -0.00731732, -0.00969783, -0.00908026, -0.00102275, -0.00650329,
        0.00484973, -0.00616403,  0.00251919,  0.00073944, -0.00339215,
       -0.00097922,  0.00997913,  0.00914589, -0.00446183,  0.00

In [None]:
model.wv['people'].shape

(100,)

# **Average Word2Vec**

In [None]:
def document_vector(doc):
    """Return the mean Word2Vec embedding of the in-vocabulary words of ``doc``.

    Parameters
    ----------
    doc : str
        Document text; it is split on whitespace and words missing from the
        trained model's vocabulary are ignored.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per embedding dimension. When ``doc`` is
        empty or none of its words are in the vocabulary, a zero vector of
        the same length is returned instead of raising on an empty selection.
    """
    # NOTE(review): the training corpus was tokenized with simple_preprocess,
    # while this uses a plain split(); tokens may not match for text containing
    # capitals or punctuation — confirm inputs are already normalized.
    vocab = model.wv.key_to_index  # dict: O(1) membership instead of scanning a list
    known_words = [word for word in doc.split() if word in vocab]
    if not known_words:
        # Nothing to average: fall back to zeros with the embeddings' size and dtype.
        return np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)
    return np.mean(model.wv[known_words], axis=0)

In [None]:
from tqdm import tqdm

# One averaged embedding per document, with a progress bar over the corpus.
X = [document_vector(document) for document in tqdm(df['text'].values)]

100%|██████████| 4/4 [00:00<00:00, 3197.49it/s]


In [None]:
X = np.array(X)

In [None]:
X[0].shape

(100,)