# Bag of words and TF-IDF

In [37]:
# pip install 

In [38]:
# importing libraries
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
import string

In [39]:
data=pd.read_csv('Toxic comment data kaggle/train.csv/train.csv')

In [40]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


#lets take only comments_text and toxic for now and implement our bags of word and TFIDF

In [41]:
data=data.drop(columns=['id','severe_toxic','obscene','threat','insult','identity_hate'],axis=1)

In [42]:
data= data[:10000]


In [43]:
data.shape

(10000, 2)

In [44]:
data.head()

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [45]:
nlp=spacy.load("en_core_web_sm")
stop_words=nlp.Defaults.stop_words
print(stop_words)

{'whereby', 'please', 'whence', 'neither', 'while', 'those', 'much', 'through', 'her', 'when', 'wherein', 'move', 'yourself', 'for', 'regarding', 'latterly', 'that', 'although', 'twelve', 'which', 'show', 'the', 'bottom', 'in', 'none', 'among', 'they', 'call', 'doing', 'seem', 'hundred', 'due', 'former', 'was', "'ll", 'onto', 'how', 'whereafter', 'every', 'nor', 'he', 'give', '‘ll', '’s', 'became', 'across', 'just', 'therein', 'after', 'many', 'between', 'always', 'several', 'where', 'next', '’ve', 'together', 'anyone', 'as', 'whose', 'an', 'three', 'into', 'were', 'full', 'alone', 'but', 'anything', 'from', 'with', 'mostly', 'my', 'down', 'behind', 'these', 'over', "'ve", 'side', 'noone', 'forty', 'formerly', 'by', 'seeming', 'often', 'more', '‘d', 'everywhere', 'i', 'n’t', 'else', 'rather', 'so', 'a', 'already', 'via', 'less', 'thereafter', 'whenever', 'back', 'however', 'had', 'is', 'own', 'itself', 'about', 'two', 'your', 'seems', 'take', 'thru', 'toward', 'above', 'nevertheless', 

In [46]:
Punctuations=string.punctuation
print(Punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [47]:
# Creating a function that will capture the exclude the stop word and punctutaions from our data
def spacy_fun(sentence):
    doc=nlp(sentence)#creating our token object which is used to create documents with linguistic component
    mytokens=[word.lemma_.lower().strip() for word in doc]# lemitazation each word and converting each token into lowercase
    mytokens=[word for word in mytokens if word not in stop_words and word not in Punctuations]# Removing stops words and punctuations
    return mytokens

In [48]:
# lets check our fucnction is working fine or not in a simple model to get understanding
sentence="I am learning Natural language processing"
spacy_fun(sentence)

['learn', 'natural', 'language', 'processing']

In [49]:
# It seems like it is working fine

In [50]:
count_vector=CountVectorizer(tokenizer=spacy_fun)# passing our function into countvectorizer

In [51]:
# Lets cee how countVectorizer works in a simple text

In [52]:
count_vector.fit_transform([" I am learning natural language processing" ,"I am enjoying it"]).toarray()

array([[0, 1, 1, 1, 1],
       [1, 0, 0, 0, 0]], dtype=int64)

In [53]:
count_vector.get_feature_names_out() # gives the list of feature of out text

array(['enjoy', 'language', 'learn', 'natural', 'processing'],
      dtype=object)

In [54]:
count_vector.vocabulary_

{'learn': 2, 'natural': 3, 'language': 1, 'processing': 4, 'enjoy': 0}

# Machine learning model

In [55]:
from sklearn.model_selection import train_test_split
X=data['comment_text']
y=data['toxic']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y)

In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
classifier=LogisticRegression()

In [58]:
X_train_vectors=count_vector.fit_transform(X_train)
X_test_vectors=count_vector.transform(X_test)

In [59]:
X_train_vectors.shape
X_test_vectors.shape

(2000, 29351)

In [60]:
X_train_vectors.toarray()
X_test_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [61]:
classifier.fit(X_train_vectors,y_train)

LogisticRegression()

In [62]:
predicted=classifier.predict(X_test_vectors)

In [63]:
print("Accuracy",metrics.accuracy_score(y_test, predicted))
print("Precision",metrics.precision_score(y_test, predicted))
print("Recall",metrics.recall_score(y_test, predicted))

Accuracy 0.9445
Precision 0.816793893129771
Recall 0.5515463917525774


# TF-IDF

In [64]:
tfidf_vector=TfidfVectorizer(tokenizer=spacy_fun)

In [65]:
X_train_vectors=tfidf_vector.fit_transform(X_train)
X_test_vectors=tfidf_vector.transform(X_test)

In [66]:
classifier=LogisticRegression()

In [67]:
classifier.fit(X_train_vectors,y_train)

LogisticRegression()

In [68]:
predicted=classifier.predict(X_test_vectors)

In [69]:
print("Accuracy",metrics.accuracy_score(y_test, predicted))
print("Precision",metrics.precision_score(y_test, predicted))
print("Recall",metrics.recall_score(y_test, predicted))

Accuracy 0.936
Precision 0.9852941176470589
Recall 0.34536082474226804
