In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('text_df.csv', index_col=0)
X = df["text"]
y = df.drop(columns=["text"]).idxmax(axis=1)

In [3]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [4]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

In [5]:
svm_model_tfidf = LinearSVC()
svm_model_tfidf.fit(X_train_tfidf, y_train_tfidf)

In [7]:
y_pred_tfidf = svm_model_tfidf.predict(X_test_tfidf)
print(classification_report(y_test_tfidf, y_pred_tfidf))

              precision    recall  f1-score   support

       anger       0.86      0.86      0.86     19409
        fear       0.83      0.81      0.82     15318
         joy       0.89      0.91      0.90     44703
        love       0.75      0.74      0.74     13864
     sadness       0.90      0.91      0.91     39855
    surprise       0.69      0.62      0.65      6763

    accuracy                           0.86    139912
   macro avg       0.82      0.81      0.81    139912
weighted avg       0.86      0.86      0.86    139912



In [8]:
word_embeddings_model = Word2Vec(sentences=X, vector_size=100, window=5, min_count=1, workers=4)
word_embeddings = np.array([np.mean([word_embeddings_model.wv[word] for word in sentence.split() if word in word_embeddings_model.wv] or [np.zeros(100)], axis=0) for sentence in X])

In [9]:
X_train_word, X_test_word, y_train_word, y_test_word = train_test_split(word_embeddings, y, test_size=0.3, random_state=42)

In [10]:
pipeline = make_pipeline(StandardScaler(with_mean=False), #scaling helps faster convergence
                         SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3)) #hinge makes it a linear SVM
                                                                               #used this as it wiill run faster

pipeline.fit(X_train_word, y_train_word)

In [11]:
y_pred_word = pipeline.predict(X_test_word)
print(classification_report(y_test_word, y_pred_word))

              precision    recall  f1-score   support

       anger       0.10      0.03      0.04     19409
        fear       0.09      0.08      0.08     15318
         joy       0.33      0.91      0.49     44703
        love       0.35      0.00      0.00     13864
     sadness       0.31      0.00      0.01     39855
    surprise       0.26      0.01      0.01      6763

    accuracy                           0.31    139912
   macro avg       0.24      0.17      0.11    139912
weighted avg       0.27      0.31      0.17    139912

