# One Hot Encoding

In [1]:
# corpus ----------------> all the words in the dataset (every document combined)
# vocabulary ------------> unique words
# document --------------> individual review
# word-------------------> individual word in a document

In [31]:
# Four toy documents, each a short three-word review
d1, d2, d3, d4 = (
    "people watch campusx",
    "campusx watch campusx",
    "people write comment",
    "campusx write comment",
)

In [32]:
import pandas as pd

In [33]:
# Toy labelled dataset: one row per document, with its label in `output`
df = pd.DataFrame(
    {
        "text": [
            "people watch campusx",
            "campusx watch campusx",
            "people write comment",
            "campusx write comment",
        ],
        "output": [1, 1, 0, 1],
    }
)

In [34]:
# Display the toy dataset (text column plus its label column)
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,1


In [35]:
# One indicator column per distinct space-separated token in each review
df_expanded = df["text"].str.get_dummies(sep=" ")

# Put the label column to the right of the token indicator columns
df_final = pd.concat(
    [df_expanded, df["output"]],
    axis=1,
)

print(df_final)

   campusx  comment  people  watch  write  output
0        1        0       1      1      0       1
1        1        0       0      1      0       1
2        0        1       1      0      1       0
3        1        1       0      0      1       1


# Bag of words

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer created with its default settings (no arguments passed)
cv = CountVectorizer()

In [37]:
# Learn the vocabulary from the review texts and encode each review against it
bow = cv.fit_transform(df["text"])

In [38]:
# Mapping from each learned token to its column index in the encoded matrix
cv.vocabulary_

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}

In [39]:
# Show the encoding as a plain array: one row per document, one column per vocabulary token
bow.toarray()

array([[1, 0, 1, 1, 0],
       [2, 0, 0, 1, 0],
       [0, 1, 1, 0, 1],
       [1, 1, 0, 0, 1]])

# N-grams

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2,2): the vocabulary fitted from this vectorizer is made of two-word sequences.
# NOTE(review): this rebinds `cv`, replacing the vectorizer created in the Bag of words section.
cv = CountVectorizer(ngram_range=(2,2))

In [54]:
# Fit the current vectorizer on the review texts and encode them (rebinds `bow`)
bow = cv.fit_transform(df["text"])

In [55]:
# Mapping from each learned n-gram to its column index in the encoded matrix
cv.vocabulary_

{'people watch': 2,
 'watch campusx': 4,
 'campusx watch': 0,
 'people write': 3,
 'write comment': 5,
 'campusx write': 1}

In [56]:
# Show the encoding as a plain array: one row per document, one column per vocabulary entry
bow.toarray()

array([[0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1]])

# Tf-Idf (Term frequency - Inverse document frequency)

##### tf(t,d) = (Number of occurrences of term t in document d) / (total number of terms in document d) 
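##### idf(t) = log((total number of documents in the corpus) / (number of documents containing term t))  

Note: scikit-learn's `TfidfVectorizer` (used below) does not use these textbook formulas as-is. By default it uses the raw term count as tf, a smoothed idf(t) = ln((1 + n) / (1 + df(t))) + 1, and then L2-normalizes each document row, so its output will not match hand calculations made with the formulas above.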

In [64]:
# people watch campusx. ----> tf(people,d1) = 1/3
# campusx watch campusx ----> tf(campusx,d2) = 2/3
# people write comment
# campusx write comment

# idf(campusx) = log(4/3)

In [65]:
# Display the toy dataset again for reference before vectorizing
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,1


In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer created with its default settings (no arguments passed)
tfidf = TfidfVectorizer()

In [67]:
# Fit the TF-IDF vectorizer on the review texts and encode each review
vector = tfidf.fit_transform(df["text"])

In [68]:
# Show the TF-IDF weights as a plain array: one row per document, one column per vocabulary token
vector.toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [69]:
# Mapping from each learned token to its column index in the TF-IDF matrix
tfidf.vocabulary_

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}