In [1]:
import numpy as np 
import pandas as pd 

# for nlp
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# for stemming
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

# for Lemmatizing
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizing = WordNetLemmatizer()

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for machine learning
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# save and load models
import pickle

# import warnings
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
df = pd.read_csv("assets/data/data_analyzed_df.csv")
del df['Unnamed: 0']
df.head(5)

Unnamed: 0,id,label,tweet,tidy_tweet,hashtag,word_count,char_count,avg_word,stopwords,hashtags
0,1,0.0,@user when a father is dysfunctional and is s...,dysfunctional selfish drags kids dysfunction #run,run,21,102,4.555556,10,1
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause offer wheelchair van...,lyft disapointed getthanked,22,122,5.315789,5,3
2,3,0.0,bihday your majesty,majesty,,5,21,5.666667,1,0
3,4,0.0,#model i love u take with u all the time in ...,#model,model,17,86,4.928571,5,1
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation,motivation,8,39,8.0,1,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49159 entries, 0 to 49158
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          49159 non-null  int64  
 1   label       31962 non-null  float64
 2   tweet       49159 non-null  object 
 3   tidy_tweet  48810 non-null  object 
 4   hashtag     35894 non-null  object 
 5   word_count  49159 non-null  int64  
 6   char_count  49159 non-null  int64  
 7   avg_word    49159 non-null  float64
 8   stopwords   49159 non-null  int64  
 9   hashtags    49159 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 3.8+ MB


In [4]:
train_df = df[0:31962]
test_df = df[31962:]

In [5]:
train_df = train_df[train_df['tidy_tweet'].notna()] 
train_df['token'] = train_df['tidy_tweet'].apply(lambda x: word_tokenize(x))
train_df.head(5)

Unnamed: 0,id,label,tweet,tidy_tweet,hashtag,word_count,char_count,avg_word,stopwords,hashtags,token
0,1,0.0,@user when a father is dysfunctional and is s...,dysfunctional selfish drags kids dysfunction #run,run,21,102,4.555556,10,1,"[dysfunctional, selfish, drags, kids, dysfunct..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause offer wheelchair van...,lyft disapointed getthanked,22,122,5.315789,5,3,"[thanks, #, lyft, credit, cause, offer, wheelc..."
2,3,0.0,bihday your majesty,majesty,,5,21,5.666667,1,0,[majesty]
3,4,0.0,#model i love u take with u all the time in ...,#model,model,17,86,4.928571,5,1,"[#, model]"
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation,motivation,8,39,8.0,1,1,"[factsguide, society, #, motivation]"


In [6]:
train_df['tweet_stemmed'] = train_df['token'].apply(lambda x: ' '.join([stemming.stem(i) for i in x]))
train_df.head()

Unnamed: 0,id,label,tweet,tidy_tweet,hashtag,word_count,char_count,avg_word,stopwords,hashtags,token,tweet_stemmed
0,1,0.0,@user when a father is dysfunctional and is s...,dysfunctional selfish drags kids dysfunction #run,run,21,102,4.555556,10,1,"[dysfunctional, selfish, drags, kids, dysfunct...",dysfunct selfish drag kid dysfunct # run
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause offer wheelchair van...,lyft disapointed getthanked,22,122,5.315789,5,3,"[thanks, #, lyft, credit, cause, offer, wheelc...",thank # lyft credit caus offer wheelchair van ...
2,3,0.0,bihday your majesty,majesty,,5,21,5.666667,1,0,[majesty],majesti
3,4,0.0,#model i love u take with u all the time in ...,#model,model,17,86,4.928571,5,1,"[#, model]",# model
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation,motivation,8,39,8.0,1,1,"[factsguide, society, #, motivation]",factsguid societi # motiv


In [7]:
train_df['tweet_lemmatized'] = train_df['token'].apply(lambda x: ' '.join([lemmatizing.lemmatize(i) for i in x]))
train_df.head()

Unnamed: 0,id,label,tweet,tidy_tweet,hashtag,word_count,char_count,avg_word,stopwords,hashtags,token,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,dysfunctional selfish drags kids dysfunction #run,run,21,102,4.555556,10,1,"[dysfunctional, selfish, drags, kids, dysfunct...",dysfunct selfish drag kid dysfunct # run,dysfunctional selfish drag kid dysfunction # run
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause offer wheelchair van...,lyft disapointed getthanked,22,122,5.315789,5,3,"[thanks, #, lyft, credit, cause, offer, wheelc...",thank # lyft credit caus offer wheelchair van ...,thanks # lyft credit cause offer wheelchair va...
2,3,0.0,bihday your majesty,majesty,,5,21,5.666667,1,0,[majesty],majesti,majesty
3,4,0.0,#model i love u take with u all the time in ...,#model,model,17,86,4.928571,5,1,"[#, model]",# model,# model
4,5,0.0,factsguide: society now #motivation,factsguide society #motivation,motivation,8,39,8.0,1,1,"[factsguide, society, #, motivation]",factsguid societi # motiv,factsguide society # motivation


In [8]:
# Bag Of Words
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow_vectorizer

CountVectorizer(max_df=0.9, max_features=1000, min_df=2, stop_words='english')

In [9]:
trainbow_stem = bow_vectorizer.fit_transform(train_df['tweet_stemmed'])
trainbow_stem

<31751x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 105529 stored elements in Compressed Sparse Row format>

In [10]:
trainbow_lemm = bow_vectorizer.fit_transform(train_df['tweet_lemmatized'])
trainbow_lemm

<31751x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 95453 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf_vectorizer

TfidfVectorizer(max_df=0.9, max_features=1000, min_df=2, stop_words='english')

In [12]:
traintfidf_stem = tfidf_vectorizer.fit_transform(train_df['tweet_stemmed'])
traintfidf_stem.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
traintfidf_stem.toarray().shape

(31751, 1000)

In [14]:
traintfidf_lemm = tfidf_vectorizer.fit_transform(train_df['tweet_lemmatized'])
traintfidf_lemm.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
# feature selection of few columns
test_df = test_df[['id','tweet','tidy_tweet']]
test_df.head(2)

Unnamed: 0,id,tweet,tidy_tweet
31962,31963,#studiolife #aislife #requires #passion #dedic...,#studiolife #aislife #requires #passion #dedic...
31963,31964,@user #white #supremacists want everyone to s...,#white #supremacists everyone #birds #movie


In [16]:
test_df = test_df[test_df['tidy_tweet'].notna()]

In [17]:
test_df['token'] = test_df['tidy_tweet'].apply(lambda x: word_tokenize(x))
test_df.head(2)

Unnamed: 0,id,tweet,tidy_tweet,token
31962,31963,#studiolife #aislife #requires #passion #dedic...,#studiolife #aislife #requires #passion #dedic...,"[#, studiolife, #, aislife, #, requires, #, pa..."
31963,31964,@user #white #supremacists want everyone to s...,#white #supremacists everyone #birds #movie,"[#, white, #, supremacists, everyone, #, birds..."


In [18]:
test_df['tweet_stemmed'] = test_df['token'].apply(lambda x: ' '.join([stemming.stem(i) for i in x]))
test_df.head(2)

Unnamed: 0,id,tweet,tidy_tweet,token,tweet_stemmed
31962,31963,#studiolife #aislife #requires #passion #dedic...,#studiolife #aislife #requires #passion #dedic...,"[#, studiolife, #, aislife, #, requires, #, pa...",# studiolif # aislif # requir # passion # dedi...
31963,31964,@user #white #supremacists want everyone to s...,#white #supremacists everyone #birds #movie,"[#, white, #, supremacists, everyone, #, birds...",# white # supremacist everyon # bird # movi


In [19]:
testbow_stem = bow_vectorizer.fit_transform(test_df['tweet_stemmed'])
testbow_stem.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
test_df['tweet_lemmatized'] = test_df['token'].apply(lambda x: ' '.join([lemmatizing.lemmatize(i) for i in x]))
test_df.head(2)

Unnamed: 0,id,tweet,tidy_tweet,token,tweet_stemmed,tweet_lemmatized
31962,31963,#studiolife #aislife #requires #passion #dedic...,#studiolife #aislife #requires #passion #dedic...,"[#, studiolife, #, aislife, #, requires, #, pa...",# studiolif # aislif # requir # passion # dedi...,# studiolife # aislife # requires # passion # ...
31963,31964,@user #white #supremacists want everyone to s...,#white #supremacists everyone #birds #movie,"[#, white, #, supremacists, everyone, #, birds...",# white # supremacist everyon # bird # movi,# white # supremacist everyone # bird # movie


In [21]:
testbow_lemm = bow_vectorizer.fit_transform(test_df['tweet_lemmatized'])
testbow_lemm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
testtfidf_stem = tfidf_vectorizer.fit_transform(test_df['tweet_stemmed'])
testtfidf_lemm = tfidf_vectorizer.fit_transform(test_df['tweet_lemmatized'])

In [24]:
import joblib
joblib.dump(tfidf_vectorizer,'assets/data/tfidf_vectorizer.joblib')

['assets/data/tfidf_vectorizer.joblib']

In [25]:
X=traintfidf_lemm #x: predictors
y=train_df['label'] #y: label

xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=.3,random_state=42)

In [26]:
lr=LogisticRegression() # for lemmatized data
lr.fit(xtrain,ytrain)

LogisticRegression()

In [27]:
predict_lr=lr.predict(xtest)

In [28]:
print("accuracy score :", accuracy_score(predict_lr,ytest))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(predict_lr,ytest))

print(confusion_matrix(predict_lr,ytest))
print(classification_report(predict_lr,ytest))

accuracy score : 0.9454125551123241
f1 score : 0.41704035874439455
[[8820  480]
 [  40  186]]
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97      9300
         1.0       0.28      0.82      0.42       226

    accuracy                           0.95      9526
   macro avg       0.64      0.89      0.69      9526
weighted avg       0.98      0.95      0.96      9526



Precision is very low for hate label due to less sample.

In [29]:
X1=traintfidf_stem # for stemmed data
y1=train_df['label']
x1train,x1test,y1train,y1test=train_test_split(X1,y1,test_size=.3,random_state=42)

In [30]:
lr1=LogisticRegression()
lr1.fit(x1train,y1train)

LogisticRegression()

In [31]:
predict_lr1=lr1.predict(x1test)

In [32]:
# accuracy score
print("accuracy score :", accuracy_score(predict_lr1,y1test))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(predict_lr1,ytest))

print(confusion_matrix(predict_lr1,y1test))
print(classification_report(predict_lr1,y1test))

accuracy score : 0.9460424102456435
f1 score : 0.4237668161434977
[[8823  477]
 [  37  189]]
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97      9300
         1.0       0.28      0.84      0.42       226

    accuracy                           0.95      9526
   macro avg       0.64      0.89      0.70      9526
weighted avg       0.98      0.95      0.96      9526



Similar performance to lemmatization preprocessing.

In [33]:
svc=SVC()
svc.fit(xtrain,ytrain)
predict_svc=svc.predict(xtest)
# accuracy score
print("accuracy score :", accuracy_score(predict_svc,ytest))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(predict_svc,ytest))

print(confusion_matrix(predict_svc,ytest))
print(classification_report(predict_svc,ytest))

accuracy score : 0.949086710056687
f1 score : 0.46408839779005523
[[8831  456]
 [  29  210]]
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97      9287
         1.0       0.32      0.88      0.46       239

    accuracy                           0.95      9526
   macro avg       0.66      0.91      0.72      9526
weighted avg       0.98      0.95      0.96      9526



In [35]:
nb=GaussianNB()
nb.fit(xtrain.toarray(),ytrain)
predict_nb=nb.predict(xtest.toarray())
# accuracy score
print("accuracy score :", accuracy_score(predict_nb,ytest))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(predict_nb,ytest))

print(confusion_matrix(predict_nb,ytest))
print(classification_report(predict_nb,ytest))

accuracy score : 0.588074742809154
f1 score : 0.2293794186959937
[[5018   82]
 [3842  584]]
              precision    recall  f1-score   support

         0.0       0.57      0.98      0.72      5100
         1.0       0.88      0.13      0.23      4426

    accuracy                           0.59      9526
   macro avg       0.72      0.56      0.47      9526
weighted avg       0.71      0.59      0.49      9526



In [36]:
mlnb = MultinomialNB()
mlnb.fit(xtrain.toarray(),ytrain)
predict_mlnb=mlnb.predict(xtest.toarray())
# accuracy score
print("accuracy score :", accuracy_score(predict_mlnb,ytest))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(predict_mlnb,ytest))

print(confusion_matrix(predict_mlnb,ytest))
print(classification_report(predict_mlnb,ytest))

accuracy score : 0.9441528448456855
f1 score : 0.3770491803278688
[[8833  505]
 [  27  161]]
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97      9338
         1.0       0.24      0.86      0.38       188

    accuracy                           0.94      9526
   macro avg       0.62      0.90      0.67      9526
weighted avg       0.98      0.94      0.96      9526



In [37]:
dt = DecisionTreeClassifier()
dt.fit(xtrain.toarray(),ytrain)
predict_dt = dt.predict(xtest.toarray())
# accuracy score
print("accuracy score :", accuracy_score(predict_dt,ytest))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(predict_dt,ytest))

print(confusion_matrix(predict_dt,ytest))
print(classification_report(predict_dt,ytest))

accuracy score : 0.9461473861011968
f1 score : 0.5448092280390417
[[8706  359]
 [ 154  307]]
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      9065
         1.0       0.46      0.67      0.54       461

    accuracy                           0.95      9526
   macro avg       0.72      0.81      0.76      9526
weighted avg       0.96      0.95      0.95      9526



In [38]:
rf = RandomForestClassifier()
rf.fit(xtrain.toarray(),ytrain) # you can test with grid search methodology
predict_rf = rf.predict(xtest.toarray())
# accuracy score
print("accuracy score :", accuracy_score(predict_rf,ytest))

# calculating the f1 score for the validation set
print("f1 score :", f1_score(predict_rf,ytest))

print(confusion_matrix(predict_rf,ytest))
print(classification_report(predict_rf,ytest))

accuracy score : 0.952865840856603
f1 score : 0.5645004849660523
[[8786  375]
 [  74  291]]
              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98      9161
         1.0       0.44      0.80      0.56       365

    accuracy                           0.95      9526
   macro avg       0.71      0.88      0.77      9526
weighted avg       0.97      0.95      0.96      9526



In [39]:
joblib.dump(rf,'assets/model/random_forest.joblib')

['assets/data/random_forest.joblib']