In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from imblearn.over_sampling import SMOTE

In [2]:
# Load the two pre-split CSV files from the working directory.
training_df = pd.read_csv("training.csv")
test_df = pd.read_csv("test.csv")

In [3]:
# Preview the first five rows of the training frame.
training_df.head(n=5)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [4]:
# Summarise the training frame: row count, column dtypes and non-null counts.
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [5]:
# Algorithms compared in this notebook: Linear SVC and Multinomial Naive Bayes

In [6]:
# Split each frame into its raw text column (model input) and its label
# column (target), once for the training data and once for the test data.
texts_train, y_train = training_df['text'], training_df['label']
texts_test, y_test = test_df['text'], test_df['label']

In [7]:
# Convert raw text into TF-IDF feature matrices.
# The vocabulary and term weights are learned from the training texts only.
vectorizer = TfidfVectorizer()

# Fit on the training texts and vectorize them in one step.
X_train_vectorized = vectorizer.fit_transform(texts_train)

# Vectorize the test texts with the already-fitted vectorizer (no re-fitting),
# so the test data does not influence the learned vocabulary or weights.
X_test = vectorizer.transform(texts_test)

In [8]:
# Balance data
# Oversample the training set with SMOTE so the classifiers are not trained on
# the imbalanced label distribution. A fixed random_state makes the generated
# samples, and therefore every downstream metric, repeatable between runs.
oversample = SMOTE(random_state=42)

# Resample from the original labels in training_df rather than from y_train:
# this cell reassigns y_train, so feeding y_train back in would make a second
# run of the cell fail (resampled labels no longer match the row count of
# X_train_vectorized). Reading the labels from training_df keeps the cell
# idempotent.
X_train, y_train = oversample.fit_resample(X_train_vectorized, training_df['label'])

In [9]:
# Linear SVC
# Seed the classifier so its internal data shuffling is repeatable and the
# report below can be reproduced from a fresh kernel.
svc = LinearSVC(random_state=42)

# Train on the resampled training matrix, then score on the held-out test set.
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       581
           1       0.94      0.89      0.92       695
           2       0.71      0.89      0.79       159
           3       0.88      0.90      0.89       275
           4       0.84      0.82      0.83       224
           5       0.60      0.71      0.65        66

    accuracy                           0.89      2000
   macro avg       0.82      0.85      0.83      2000
weighted avg       0.89      0.89      0.89      2000



In [10]:
# NaiveBayes
# MultinomialNB is imported in the notebook's top import cell together with
# the other dependencies, so this cell contains only the modelling steps.
nb = MultinomialNB()

# Train on the same resampled training matrix used for the Linear SVC so the
# two classification reports are directly comparable.
nb.fit(X_train, y_train)

# NOTE: y_pred is reused from the Linear SVC cell; from here on it holds the
# Naive Bayes predictions.
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.85      0.89       581
           1       0.89      0.84      0.86       695
           2       0.57      0.81      0.67       159
           3       0.82      0.83      0.82       275
           4       0.85      0.75      0.80       224
           5       0.50      0.77      0.60        66

    accuracy                           0.83      2000
   macro avg       0.76      0.81      0.77      2000
weighted avg       0.85      0.83      0.83      2000

