In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [2]:
testdata = pd.read_csv('test.csv')
traindata = pd.read_csv('train.csv')

In [3]:
testdata.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [4]:
traindata.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
id       17197 non-null int64
tweet    17197 non-null object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


In [6]:
traindata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [7]:
def clean_data(tweet):
    temp = TextBlob(tweet).words
    tb = ' '.join(temp)
    
    tweet_list = [ele for ele in tb.split() if ele != 'user']
    tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(tokens)
    clean_tweet = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
    return normalized_tweet
    
traindata['cleandata'] = traindata['tweet'].apply(clean_data)

In [8]:
traindata.head()

Unnamed: 0,id,label,tweet,cleandata
0,1,0,@user when a father is dysfunctional and is s...,"[when, a, father, be, dysfunctional, and, be, ..."
1,2,0,@user @user thanks for #lyft credit i can't us...,"[thank, for, lyft, credit, i, ca, n't, use, ca..."
2,3,0,bihday your majesty,"[bihday, your, majesty]"
3,4,0,#model i love u take with u all the time in ...,"[model, i, love, u, take, with, u, all, the, t..."
4,5,0,factsguide: society now #motivation,"[factsguide, society, now, motivation]"


In [9]:
def lst_to_str(s):
    listToStr = ' '.join([str(elem) for elem in s])
    return  listToStr
traindata['processed_data'] = traindata.cleandata.apply(lst_to_str)

In [10]:
traindata.head()

Unnamed: 0,id,label,tweet,cleandata,processed_data
0,1,0,@user when a father is dysfunctional and is s...,"[when, a, father, be, dysfunctional, and, be, ...",when a father be dysfunctional and be so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,"[thank, for, lyft, credit, i, ca, n't, use, ca...",thank for lyft credit i ca n't use cause they ...
2,3,0,bihday your majesty,"[bihday, your, majesty]",bihday your majesty
3,4,0,#model i love u take with u all the time in ...,"[model, i, love, u, take, with, u, all, the, t...",model i love u take with u all the time in urð...
4,5,0,factsguide: society now #motivation,"[factsguide, society, now, motivation]",factsguide society now motivation


In [11]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(traindata.processed_data)

In [12]:
Y = traindata.label

In [20]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3)

In [21]:
from sklearn.naive_bayes import *
from sklearn.metrics import *
modelNB = MultinomialNB()
modelNB.fit(X_train,Y_train)
Y_test_predict = modelNB.predict(X_test)
temp2 = mean_squared_error(Y_test,Y_test_predict)
print('Mean square error on testing data: ',temp2)
print("Accuracy score is: ",accuracy_score(Y_test_predict,Y_test))
print("Precision score is: ", precision_score(Y_test, Y_test_predict))
print("Recall score is: ", recall_score(Y_test, Y_test_predict))
print("f1 score is: ", f1_score(Y_test, Y_test_predict))
print("Confusion matrix: ")
print(confusion_matrix(Y_test_predict,Y_test))

Mean square error on testing data:  0.058608822609239754
Accuracy score is:  0.9413911773907603
Precision score is:  0.6028513238289206
Recall score is:  0.4464555052790347
f1 score is:  0.512998266897747
Confusion matrix: 
[[8731  367]
 [ 195  296]]


In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, Y_train)
Y_test_predict = model.predict(X_test)
temp2 = mean_squared_error(Y_test,Y_test_predict)
print('Mean square error on testing data: ',temp2)
print('Accuracy score of survived: ',accuracy_score(Y_test,Y_test_predict))
print("Precision score is: ", precision_score(Y_test, Y_test_predict))
print("Recall score is: ", recall_score(Y_test, Y_test_predict))
print("f1 score is: ", f1_score(Y_test, Y_test_predict))
cm = confusion_matrix(Y_test, Y_test_predict)
print('Confusion matrix: ')
print(cm)



Mean square error on testing data:  0.03931588278235478
Accuracy score of survived:  0.9606841172176452
Precision score is:  0.848780487804878
Recall score is:  0.5248868778280543
f1 score is:  0.6486486486486487
Confusion matrix: 
[[8864   62]
 [ 315  348]]
