In [1]:
import pandas as pd

from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [5]:

def clean_data(tweet):
    temp = TextBlob(tweet).words
    tb = ' '.join(temp)
    
    tweet_list = [ele for ele in tb.split() if ele != 'user']
    tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(tokens)
    clean_tweet = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
    return normalized_tweet
    
data['cleandata'] = data['tweet'].apply(clean_data)

In [7]:
data.head()

Unnamed: 0,id,label,tweet,cleandata
0,1,0,@user when a father is dysfunctional and is s...,"[when, a, father, be, dysfunctional, and, be, ..."
1,2,0,@user @user thanks for #lyft credit i can't us...,"[thank, for, lyft, credit, i, ca, n't, use, ca..."
2,3,0,bihday your majesty,"[bihday, your, majesty]"
3,4,0,#model i love u take with u all the time in ...,"[model, i, love, u, take, with, u, all, the, t..."
4,5,0,factsguide: society now #motivation,"[factsguide, society, now, motivation]"


In [8]:
def lst_to_str(s):
    listToStr = ' '.join([str(elem) for elem in s])
    return  listToStr
data['processed_data'] = data.cleandata.apply(lst_to_str)

In [9]:
data.head()

Unnamed: 0,id,label,tweet,cleandata,processed_data
0,1,0,@user when a father is dysfunctional and is s...,"[when, a, father, be, dysfunctional, and, be, ...",when a father be dysfunctional and be so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,"[thank, for, lyft, credit, i, ca, n't, use, ca...",thank for lyft credit i ca n't use cause they ...
2,3,0,bihday your majesty,"[bihday, your, majesty]",bihday your majesty
3,4,0,#model i love u take with u all the time in ...,"[model, i, love, u, take, with, u, all, the, t...",model i love u take with u all the time in urð...
4,5,0,factsguide: society now #motivation,"[factsguide, society, now, motivation]",factsguide society now motivation


In [12]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.processed_data)

In [13]:
Y = data.label

In [14]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3)

In [15]:
from sklearn.naive_bayes import *
from sklearn.metrics import *
modelNB = MultinomialNB()
modelNB.fit(X_train,Y_train)
Y_test_predict = modelNB.predict(X_test)
temp2 = mean_squared_error(Y_test,Y_test_predict)
print('Mean square error on testing data: ',temp2)
print("Accuracy score is: ",accuracy_score(Y_test_predict,Y_test))
print("Precision score is: ", precision_score(Y_test, Y_test_predict))
print("Recall score is: ", recall_score(Y_test, Y_test_predict))
print("f1 score is: ", f1_score(Y_test, Y_test_predict))
print("Confusion matrix: ")
print(confusion_matrix(Y_test_predict,Y_test))

Mean square error on testing data:  0.05621024090103243
Accuracy score is:  0.9437897590989676
Precision score is:  0.6449579831932774
Recall score is:  0.4534711964549483
f1 score is:  0.5325238508239375
Confusion matrix: 
[[8743  370]
 [ 169  307]]
