In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../data/raw/training.1600000.processed.noemoticon.csv"

# The file is read with header=None, so the column names are assigned by hand.
RAW_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]

df = pd.read_csv(DATA_PATH, header=None, encoding="latin-1")
df.columns = RAW_COLUMNS

In [3]:
# Keep only the label and the tweet text; the remaining raw columns are not
# referenced by the cells that follow.
df = df.loc[:, ["target", "text"]]

In [4]:
# Recode the raw labels into a binary column: 0 stays 0, 4 becomes 1.
label_map = {0: 0, 4: 1}
mapped = df["target"].map(label_map)

# Series.map yields NaN for any value that is not a key of the dict, which
# would silently turn the column into floats with gaps. Fail loudly instead.
unmapped = df.loc[mapped.isna(), "target"].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected target labels (expected 0 or 4): {unmapped.tolist()}"
    )

# Append the recoded column and drop the raw label in one step.
df = df.assign(sentiment=mapped).drop(columns=["target"])

In [5]:
# Only the value of a cell's last expression is rendered, so the earlier
# checks are printed explicitly; as bare expressions they would be evaluated
# and then thrown away.
print("Shape:", df.shape)
print("Missing values per column:", df.isnull().sum(), sep="\n")
print("Class counts:", df["sentiment"].value_counts(), sep="\n")

# Summary statistics of the tweet lengths (rendered as the cell output).
df["text"].str.len().describe()

count    1.600000e+06
mean     7.409011e+01
std      3.644114e+01
min      6.000000e+00
25%      4.400000e+01
50%      6.900000e+01
75%      1.040000e+02
max      3.740000e+02
Name: text, dtype: float64

In [7]:
# Eyeball five rows; the fixed random_state selects the same rows on every run.
df.sample(n=5, random_state=49)

Unnamed: 0,text,sentiment
1443976,@snarkandboobs LOL at your last tweet. BTW-our...,1
735738,Kasabian tomorrow....I can't be excited until ...,0
1151914,Gonna hit the hay soon!!nighty night all,1
1325513,@ffyza : omg u r in tution class with chips? i...,1
1211563,"oliceo.fr in progress, chaud devant",1


In [10]:
# Draw the same number of rows from each sentiment class so the working
# subset is balanced; the fixed random_state makes the draw repeatable.
per_class = df.groupby("sentiment", group_keys=False)
df_small = per_class.sample(n=100_000, random_state=42)


In [11]:
# Print the class balance explicitly: only the last expression of a cell is
# rendered, so a bare value_counts() above .shape would never be shown.
print(df_small["sentiment"].value_counts())

# Rendered as the cell output.
df_small.shape

(200000, 2)

In [15]:
import sys
import os

# Absolute path of the parent directory -- presumably the project root that
# holds the `src` package imported further down (confirm against the repo layout).
PROJECT_ROOT = os.path.abspath("..")

# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [16]:
from src.preprocessing import set_seed, clean_text_basic, split_data

In [17]:
from src.preprocessing import set_seed, clean_text_basic, split_data

# Seed first, before any of the project helpers below run.
# NOTE(review): which RNGs set_seed() touches and its default seed are defined
# in src.preprocessing, not here -- confirm there.
set_seed()

# Apply the project's cleaning function to every tweet and keep the result
# next to the raw text.
cleaned_text = df_small["text"].apply(clean_text_basic)
df_small["text_clean"] = cleaned_text

# split_data returns six objects, unpacked as features then labels for
# train / validation / test.
# NOTE(review): the column used as the feature and the split ratios are
# decided inside split_data -- verify against its implementation.
splits = split_data(df_small)
X_train, X_val, X_test, y_train, y_val, y_test = splits

In [18]:
# Only the last expression of a cell is rendered, so the train and validation
# proportions are printed explicitly instead of being computed and discarded.
print("train", y_train.value_counts(normalize=True), sep="\n")
print("val", y_val.value_counts(normalize=True), sep="\n")

# Test-split proportions (rendered as the cell output).
y_test.value_counts(normalize=True)


sentiment
0    0.5
1    0.5
Name: proportion, dtype: float64

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [20]:
# First feature configuration: unigrams and bigrams, at most 50,000 features.
# NOTE(review): a later cell rebinds `tfidf` and all three X_*_tfidf matrices
# with a different configuration, so when the notebook is run top to bottom
# these results are overwritten before the classifier is fitted.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)

# The vectorizer is fitted on the training split only; the validation and
# test splits are transformed with that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tfidf.transform(part) for part in (X_val, X_test))


In [22]:
# Second feature configuration, replacing the vectorizer bound to `tfidf`
# above: unigrams only, max_features=20000, min_df=5.
tfidf = TfidfVectorizer(ngram_range=(1, 1), min_df=5, max_features=20000)


In [23]:
# Fit the current vectorizer on the training split only, then reuse the
# fitted vocabulary to transform the validation and test splits.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (tfidf.transform(part) for part in (X_val, X_test))


In [24]:
# Logistic-regression classifier using the liblinear solver.
# random_state is pinned because liblinear draws on the RNG; without it the
# fit depends on whatever global NumPy RNG state earlier cells left behind.
# 42 matches the seed used for the balanced subsample.
clf = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
    random_state=42,
)


In [25]:
# Train the classifier on the TF-IDF matrix and labels of the training split.
clf.fit(X_train_tfidf, y_train)


In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def report_split(title, y_true, y_pred):
    """Print ``title`` followed by the accuracy, then the classification report.

    Parameters
    ----------
    title : str
        Text printed in front of the accuracy value.
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Labels predicted for the same rows.
    """
    print(title, accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))


# Validation split.
val_preds = clf.predict(X_val_tfidf)
report_split("Val accuracy:", y_val, val_preds)

# Test split.
test_preds = clf.predict(X_test_tfidf)
report_split("Test accuracy:", y_test, test_preds)


Val accuracy: 0.7878
              precision    recall  f1-score   support

           0       0.80      0.77      0.78     10000
           1       0.78      0.80      0.79     10000

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000

Test accuracy: 0.7925
              precision    recall  f1-score   support

           0       0.80      0.78      0.79     10000
           1       0.79      0.80      0.79     10000

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000



In [27]:
# Confusion matrix for the test split: rows are the true classes and columns
# the predicted ones (each row of the recorded output sums to the class
# support of 10,000).
confusion_matrix(y_true=y_test, y_pred=test_preds)


array([[7810, 2190],
       [1960, 8040]])