In [112]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [23]:
df = pd.read_csv('datasets/social_media_clean_text.csv', engine='python')

In [133]:
## EDA

In [24]:
print(df.shape)
df.sample(10)

(9282, 3)


Unnamed: 0,text,choose_one,class_label
1261,body bags,Not Relevant,0
9088,the police chief assured the crowd that this o...,Relevant,1
5067,yesterday's hailstorm! boston cambridge,Relevant,1
4073,question is anybody else having this problem...,Not Relevant,0
4019,we're hiring! click to apply staff registere...,Not Relevant,0
5779,perfect night for a soak! lava here i come??,Not Relevant,0
2219,uk bin laden family plane crashed after 'avoi...,Relevant,1
2575,can't believe ross is dead???????? emmerdal...,Not Relevant,0
283,'who then were annihilated by the legion itse...,Not Relevant,0
754,craving u,Not Relevant,0


In [28]:
df.describe()

Unnamed: 0,class_label
count,9282.0
mean,0.419953
std,0.49619
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,2.0


In [25]:
df['choose_one'].value_counts()

Not Relevant    5396
Relevant        3874
Can't Decide      12
Name: choose_one, dtype: int64

In [27]:
df['class_label'].value_counts()

0    5396
1    3874
2      12
Name: class_label, dtype: int64

In [29]:
df.isnull().sum()

text           0
choose_one     0
class_label    0
dtype: int64

In [30]:
df.groupby("class_label").count()

Unnamed: 0_level_0,text,choose_one
class_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5396,5396
1,3874,3874
2,12,12


In [134]:
# train test split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class_label'], test_size=0.2, random_state=0)

In [45]:
print(len(X_train))
print(len(X_test))

7425
1857


In [46]:
X_train[:10]

979                                                stats 
8268                                        i'll bet i do
2817    listen to  demolish the case against planned p...
4129    chemical spill at gr water plant contained and...
9195    cia plot!  rolling eyes  rt  chinese relatives...
3913     yes  especially new clients that walk in and ...
8263    my room looks like a tornado passed through it...
7472     coursing  ' now they are causing di maria   lol'
4865     floods  ukfloods   british trekkers rescued a...
4121     'don't bother while you were offline i manage...
Name: text, dtype: object

In [48]:
y_test[:10]

8856    0
1123    0
3784    1
1942    0
7481    0
7682    0
8552    1
9154    0
5018    0
5920    1
Name: class_label, dtype: int64

In [135]:
# bag of words model
count_vectorizer = CountVectorizer()

bow = dict()
bow["train"] = (count_vectorizer.fit_transform(X_train), y_train)
bow["test"]  = (count_vectorizer.transform(X_test), y_test)


In [114]:
print(bow["train"][0].shape)
print(bow["test"][0].shape)

(7425, 15040)
(1857, 15040)


In [136]:
# TF-IDF model
tfidf_vectorizer = TfidfVectorizer()

tfidf = dict()
tfidf["train"] = (tfidf_vectorizer.fit_transform(X_train), y_train)
tfidf["test"]  = (tfidf_vectorizer.transform(X_test), y_test)

In [105]:
logistic_regression = LogisticRegression(C=30.0, solver='newton-cg', multi_class='multinomial', class_weight='balanced')

In [137]:
# Logistic Regression model with Bag of Words vectorizer
logistic_regression.fit(*bow["train"])
y_predict = logistic_regression.predict(bow["test"][0])

precision = precision_score(y_test, y_predict, pos_label=None, average='weighted')             

recall = recall_score(y_test, y_predict, pos_label=None, average='weighted')
    
f1 = f1_score(y_test, y_predict, pos_label=None, average='weighted')
    
accuracy = accuracy_score(y_test, y_predict)

print(f"accuracy = {accuracy:.3f}, precision = {precision:.3f}, recall = {recall:.3f}, f1 = {f1:.3f}")

accuracy = 0.768, precision = 0.767, recall = 0.768, f1 = 0.767


In [138]:
# Logistic Regression model with TF-IDF Vectorizer
logistic_regression.fit(*tfidf["train"])
y_predict = logistic_regression.predict(tfidf["test"][0])

precision = precision_score(y_test, y_predict, pos_label=None, average='weighted')             

recall = recall_score(y_test, y_predict, pos_label=None, average='weighted')
    
f1 = f1_score(y_test, y_predict, pos_label=None, average='weighted')
    
accuracy = accuracy_score(y_test, y_predict)

print(f"accuracy = {accuracy:.3f}, precision = {precision:.3f}, recall = {recall:.3f}, f1 = {f1:.3f}")

accuracy = 0.780, precision = 0.779, recall = 0.780, f1 = 0.779


In [121]:
linear_svc = LinearSVC(C=1.0, class_weight='balanced', multi_class='ovr', random_state=40)

In [139]:
# Linear Support Vector model with Bag of Words Vectorizer
linear_svc.fit(*bow["train"])
y_predict = linear_svc.predict(bow["test"][0])


precision = precision_score(y_test, y_predict, pos_label=None, average='weighted')             

recall = recall_score(y_test, y_predict, pos_label=None, average='weighted')
    
f1 = f1_score(y_test, y_predict, pos_label=None, average='weighted')
    
accuracy = accuracy_score(y_test, y_predict)

print(f"accuracy = {accuracy:.3f}, precision = {precision:.3f}, recall = {recall:.3f}, f1 = {f1:.3f}")

accuracy = 0.771, precision = 0.769, recall = 0.771, f1 = 0.769


In [140]:
# Linear Support Vector model with TF-IDF Vectorizer
linear_svc.fit(*tfidf["train"])
y_predict = linear_svc.predict(tfidf["test"][0])


precision = precision_score(y_test, y_predict, pos_label=None, average='weighted')             

recall = recall_score(y_test, y_predict, pos_label=None, average='weighted')
    
f1 = f1_score(y_test, y_predict, pos_label=None, average='weighted')
    
accuracy = accuracy_score(y_test, y_predict)

print(f"accuracy = {accuracy:.3f}, precision = {precision:.3f}, recall = {recall:.3f}, f1 = {f1:.3f}")

accuracy = 0.792, precision = 0.791, recall = 0.792, f1 = 0.791
