In [2]:
# importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the tweet dataset from a CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='data/twitter_data.csv')

In [4]:
# Preview the first five rows to check the columns and the raw tweet text.
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
# (rows, columns) of the loaded dataset.
data.shape

(31962, 3)

In [6]:
# Absolute number of rows per class in the `label` column.
data['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [7]:
# Hold out 20% of the rows for evaluation. Stratifying on `label` keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable across runs.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=21,
    stratify=data['label'],
)

In [8]:
# (rows, columns) of the train and test splits, in that order.
(train.shape, test.shape)

((25569, 3), (6393, 3))

In [9]:
# Class proportions in the training split.
train['label'].value_counts(normalize=True)

0    0.929837
1    0.070163
Name: label, dtype: float64

In [10]:
# Class proportions in the test split (should mirror the training split).
test['label'].value_counts(normalize=True)

0    0.929923
1    0.070077
Name: label, dtype: float64

In [11]:
# Two-stage text classifier: TF-IDF features feeding a logistic regression.
# The vectorizer lowercases the text, drops English stop words, and is capped
# at 1000 features; the classifier uses scikit-learn's default settings.
pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                lowercase=True,
                max_features=1000,
                stop_words=ENGLISH_STOP_WORDS,
            ),
        ),
        ('model', LogisticRegression()),
    ]
)

In [12]:
# Fit the vectorizer and the classifier on the training tweets only, so no
# information from the test split leaks into the vocabulary or the weights.
pipeline.fit(X=train['tweet'], y=train['label'])

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [13]:
# Predicted labels for the training tweets (used to gauge fit on seen data).
predict_train = pipeline.predict(train['tweet'])

In [14]:
# Predicted labels for the held-out test tweets.
predict_test = pipeline.predict(test['tweet'])

In [15]:
# Micro-averaged F1 on the training split.
# NOTE(review): for a single-label problem, micro-averaged F1 is the same
# number as accuracy. With roughly 93% of rows in class 0, this score says
# little about how well the minority class is detected; consider
# average="binary" (scores class 1 only) for a more informative figure.
f1_score(train.label, predict_train, average="micro")

0.9499393797176268

In [16]:
# Micro-averaged F1 on the held-out test split.
# NOTE(review): for a single-label problem, micro-averaged F1 is the same
# number as accuracy. With roughly 93% of rows in class 0, this score says
# little about how well the minority class is detected; consider
# average="binary" (scores class 1 only) for a more informative figure.
f1_score(test.label, predict_test, average="micro")

0.9480681995933051

In [17]:
from joblib import dump

In [18]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later without retraining. The file is written to the current
# working directory.
dump(value=pipeline, filename="model.joblib")

['model.joblib']