In [6]:
# importing required libraries
import pandas as pd
from joblib import dump
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [7]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
df_sentiment = pd.read_csv(filepath_or_buffer='dataset/twitter_sentiments.csv')

In [8]:
# Preview the first five rows to check what was loaded.
df_sentiment.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [12]:
# Hold out 20% of the rows for evaluation. The split is stratified on the label
# column and uses a fixed seed so it can be reproduced.
train, test = train_test_split(
    df_sentiment,
    test_size=0.2,
    stratify=df_sentiment.label,
    random_state=21,
)
(train.shape, test.shape)

((25569, 3), (6393, 3))

In [13]:
# Absolute number of rows per label in the training partition.
train['label'].value_counts()

0    23775
1     1794
Name: label, dtype: int64

In [14]:
# Absolute number of rows per label in the test partition.
test['label'].value_counts()

0    5945
1     448
Name: label, dtype: int64

In [15]:
# TF-IDF features over lower-cased tweets, capped at 1000 vocabulary terms.
# stop_words is given as the string 'english' rather than the ENGLISH_STOP_WORDS
# frozenset: the documented accepted values are 'english', a list, or None, and
# 'english' selects scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=1000,
    stop_words='english',
)

In [16]:
# Fit the vectoriser on the training tweets only.
tfidf_vectorizer.fit(train['tweet'])

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [17]:
# Transform both partitions with the vectoriser fitted above (train first, then test).
train_idf, test_idf = (
    tfidf_vectorizer.transform(partition['tweet']) for partition in (train, test)
)

In [18]:
# Display the transformed training matrix (the cell output shows its repr).
train_idf

<25569x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 105309 stored elements in Compressed Sparse Row format>

In [19]:
# Baseline logistic regression with default settings, trained on the TF-IDF features.
# NOTE(review): the training labels are heavily skewed (23775 zeros vs 1794 ones);
# confirm whether class weighting should be applied here.
model_LR = LogisticRegression().fit(X=train_idf, y=train['label'])

In [20]:
# Predicted labels for the rows the model was trained on.
predict_train = model_LR.predict(X=train_idf)

In [21]:
# Predicted labels for the held-out rows.
predict_test = model_LR.predict(X=test_idf)

In [22]:
# Model evaluation: F1 score on the training partition.
f1_score(train['label'], predict_train)

0.4888178913738019

In [23]:
# F1 score on the held-out test partition.
f1_score(test['label'], predict_test)

0.45751633986928114

In [24]:
# Chain the TF-IDF vectoriser and the logistic-regression classifier into one
# estimator so raw tweet text can be passed directly to fit/predict.
# stop_words is given as 'english' rather than the ENGLISH_STOP_WORDS frozenset:
# the documented accepted values are 'english', a list, or None, and 'english'
# selects scikit-learn's built-in English stop-word list.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(lowercase=True,
                                  max_features=1000,
                                  stop_words='english')),
        ('model', LogisticRegression()),
    ]
)

In [25]:
# Fit the whole pipeline on the raw training tweets and their labels.
pipeline.fit(X=train['tweet'], y=train['label'])

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [29]:
# Predict labels for the raw training tweets with the fitted pipeline.
pipeline.predict(train['tweet'])

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
# Two hand-written strings to spot-check the fitted pipeline on raw text.
l = [
    'Happy Tuesday everybody!',
    'Happy Seamus 😃 sending each',
]

pipeline.predict(l)

array([0, 0])

In [30]:
# Spot-check the fitted pipeline on a single news-style sentence.
text = [
    "Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds",
]
pipeline.predict(text)

array([0])

In [36]:
# Persist the fitted pipeline (vectoriser + classifier) to a file in the current
# working directory. `dump` is imported with the other dependencies in the
# notebook's import cell rather than here, so all requirements are visible up front.
dump(pipeline, filename="text_classification.joblib")


['text_classification.joblib']

In [33]:
# Rows whose label is 1.
df_sentiment.loc[df_sentiment['label'] == 1]

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...
17,18,1,retweet if you agree!
23,24,1,@user @user lumpy says i am a . prove it lumpy.
34,35,1,it's unbelievable that in the 21st century we'...
...,...,...,...
31934,31935,1,lady banned from kentucky mall. @user #jcpenn...
31946,31947,1,@user omfg i'm offended! i'm a mailbox and i'...
31947,31948,1,@user @user you don't have the balls to hashta...
31948,31949,1,"makes you ask yourself, who am i? then am i a..."


In [35]:
# Rows whose label is 0.
df_sentiment.loc[df_sentiment['label'] == 0]

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31956,31957,0,off fishing tomorrow @user carnt wait first ti...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
