In [15]:
import pandas as pd

# Read the processed train and test splits into DataFrames (train first, test second).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/processed/train.csv", "data/processed/test.csv")
)


In [2]:
import re
import emoji
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK corpora needed below. quiet=True keeps the downloader's log
# (which echoes the local nltk_data directory) out of the saved notebook output.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource, quiet=True)

# English stop words held in a set for fast membership tests while filtering tokens.
stopwords_set = set(stopwords.words("english"))

# Single WordNet lemmatizer instance reused by the cleaning functions.
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juhoviljanen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juhoviljanen/nltk_data...


In [3]:
def to_lower(text: str) -> str:
    """Return ``text`` lower-cased; any non-string input yields an empty string."""
    return text.lower() if isinstance(text, str) else ""


In [5]:
# A URL here is any run of non-whitespace that starts with "http" or "www.".
url_pattern = re.compile(r"http\S+|www\.\S+")


def remove_urls(text: str) -> str:
    """Delete every substring matched by ``url_pattern`` from ``text``."""
    without_urls = re.sub(url_pattern, "", text)
    return without_urls


In [6]:
# An "@" followed by one or more word characters (letters, digits, underscore).
mention_pattern = re.compile(r"@\w+")


def remove_mentions(text: str) -> str:
    """Delete every ``@handle``-style token matched by ``mention_pattern``."""
    without_mentions = re.sub(mention_pattern, "", text)
    return without_mentions


In [7]:
# Matches HTML comments, or tags whose name starts with a letter
# (optionally preceded by "/" for closing tags or "!" for declarations).
# Requiring a letter right after "<" keeps plain-text uses of angle brackets
# such as "<3" or "a < b" intact, and the negated class [^<>] spans newlines,
# so tags broken across lines are still removed.
html_pattern = re.compile(
    r"<!--.*?-->|<[!/]?[a-z][^<>]*>",
    re.IGNORECASE | re.DOTALL,
)


def remove_html(text: str) -> str:
    """Strip HTML tags and comments from ``text``.

    Only spans that look like markup are removed: ``<tag ...>``, ``</tag>``,
    ``<!declaration ...>`` and ``<!-- comment -->``. Angle brackets that are
    not followed by a tag name (e.g. the "<3" emoticon or comparisons written
    with spaces) are left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with every match of ``html_pattern`` deleted.
    """
    return html_pattern.sub("", text)


In [8]:
def convert_emojis(text: str) -> str:
    """Replace each emoji in ``text`` with its textual name.

    A single space is used as both the opening and closing delimiter, so a
    grinning-face emoji becomes " grinning_face " (space-padded) in the result.
    """
    space_delimiters = (" ", " ")
    return emoji.demojize(text, delimiters=space_delimiters)


In [9]:
# Any single character that is neither a word character nor whitespace.
# Underscore counts as a word character, so it is not treated as punctuation.
punct_pattern = re.compile(r"[^\w\s]")


def remove_punctuation(text: str) -> str:
    """Swap every punctuation character for one space (tokens stay separated)."""
    spaced = re.sub(punct_pattern, " ", text)
    return spaced


In [10]:
# One or more consecutive digit characters.
digit_pattern = re.compile(r"\d+")


def remove_digits(text: str) -> str:
    """Delete every run of digits from ``text`` (nothing is inserted in its place)."""
    without_digits = re.sub(digit_pattern, "", text)
    return without_digits


In [11]:
# A character followed by at least two more copies of itself (three or more in a row).
elong_pattern = re.compile(r"(.)\1{2,}")


def normalize_elongated(text: str) -> str:
    """Shorten any run of three or more identical characters to exactly two."""
    squeezed = re.sub(elong_pattern, r"\1\1", text)
    return squeezed


In [12]:
def tokenize_lemmatize(text: str) -> str:
    """Split on whitespace, drop stop words, lemmatize the rest, and re-join.

    Tokens found in ``stopwords_set`` are discarded; every remaining token is
    passed through ``lemmatizer.lemmatize`` and the results are joined with
    single spaces.
    """
    kept = []
    for word in text.split():
        if word in stopwords_set:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)


In [13]:
def clean_text(text: str) -> str:
    """Run the full cleaning pipeline on one raw text value.

    Non-string input returns an empty string. Otherwise the steps run in this
    fixed order: lower-case, strip URLs, strip @mentions, strip HTML, convert
    emojis to names, replace punctuation with spaces, drop digits, shorten
    elongated character runs, then remove stop words and lemmatize.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: each step consumes the output of the previous one.
    pipeline = (
        to_lower,
        remove_urls,
        remove_mentions,
        remove_html,
        convert_emojis,
        remove_punctuation,
        remove_digits,
        normalize_elongated,
        tokenize_lemmatize,
    )
    for step in pipeline:
        text = step(text)

    return text


In [16]:
# Add a "clean_text" column to both splits by cleaning each raw "text" value.
for frame in (train_df, test_df):
    frame["clean_text"] = frame["text"].apply(clean_text)

# Preview raw and cleaned text side by side.
train_df[["text", "clean_text"]].head()


Unnamed: 0,text,clean_text
0,I bet there is an army of married couples who ...,bet army married couple exact thing
1,This could only end badly.,could end badly
2,My sister squeezed a lime in her milk when she...,sister squeezed lime milk thing happened told ...
3,Thank you so much❤️,thank much red_heart
4,Stinks because ive been in this program for a ...,stink ive program year pay back drawing board


In [17]:
import os
# Make sure the output folder exists, then write both cleaned splits without the index column.
os.makedirs("data/processed", exist_ok=True)

for frame, csv_path in (
    (train_df, "data/processed/train_clean.csv"),
    (test_df, "data/processed/test_clean.csv"),
):
    frame.to_csv(csv_path, index=False)


In [18]:
from sklearn.model_selection import train_test_split

# Features are the cleaned texts; the target is the emotion label.
X, y = train_df["clean_text"], train_df["emotion"]

# Hold out 20% for validation. Stratifying on y keeps each emotion's share
# the same in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

len(X_train), len(X_val)


(38312, 9578)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF capped at 30,000 features (lower the cap if RAM is tight).
# sublinear_tf=True replaces the raw term frequency tf with 1 + log(tf).
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=30000, sublinear_tf=True)

# Learn the vocabulary from the training split only, then reuse it for validation.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

X_train_tfidf.shape, X_val_tfidf.shape


((38312, 30000), (9578, 30000))

In [20]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report

# Linear SVM; class_weight="balanced" is meant to help with the rarer emotions.
svm_clf = LinearSVC(C=1.0, class_weight="balanced", random_state=42)
svm_clf.fit(X_train_tfidf, y_train)

# Score the held-out validation split.
y_val_pred = svm_clf.predict(X_val_tfidf)
macro_f1 = f1_score(y_val, y_val_pred, average="macro")

print("Validation Macro F1:", macro_f1)
print("\nClassification report:")
print(classification_report(y_val, y_val_pred))


Validation Macro F1: 0.3987649986360138

Classification report:
              precision    recall  f1-score   support

       anger       0.50      0.47      0.49      2139
     disgust       0.12      0.15      0.13       237
        fear       0.34      0.45      0.38       402
         joy       0.74      0.69      0.72      4759
     sadness       0.30      0.35      0.33       785
    surprise       0.33      0.37      0.35      1256

    accuracy                           0.55      9578
   macro avg       0.39      0.41      0.40      9578
weighted avg       0.57      0.55      0.56      9578



In [21]:
# Final model: refit the vectorizer and train a fresh LinearSVC on ALL training rows.
# NOTE: this refits the same `tfidf` object in place, so from here on it no longer
# matches the X_train_tfidf / X_val_tfidf matrices built from the earlier fit.
X_all, y_all = train_df["clean_text"], train_df["emotion"]
X_all_tfidf = tfidf.fit_transform(X_all)

svm_clf_full = LinearSVC(C=1.0, class_weight="balanced", random_state=42)
svm_clf_full.fit(X_all_tfidf, y_all)

# Vectorize the cleaned test texts with the refitted vocabulary and predict.
X_test_tfidf = tfidf.transform(test_df["clean_text"])
test_preds = svm_clf_full.predict(X_test_tfidf)

len(test_preds), test_df.shape


(16281, (16281, 5))

In [23]:
import os

import joblib

# joblib.dump does not create missing parent folders, so make sure "models/"
# exists first; otherwise this cell fails on a fresh checkout.
os.makedirs("models", exist_ok=True)

# Persist the TF-IDF vectorizer in its current fitted state.
# NOTE(review): only the vectorizer is saved; no cell visible here persists
# svm_clf_full. Confirm whether the classifier should be saved as well.
joblib.dump(tfidf, "models/tfidf_vectorizer.pkl")
print("Saved TF-IDF vectorizer")


Saved TF-IDF vectorizer


In [25]:
# NOTE(review): this repeats the svm_clf_full.fit(X_all_tfidf, y_all) call from the
# earlier "refit on all training data" cell with the same inputs. The cell's value
# is the fitted estimator, which is what gets displayed as output.
svm_clf_full.fit(X_all_tfidf, y_all)


0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [26]:
# Vectorize the cleaned test texts with the current `tfidf` fit.
# NOTE(review): the same assignment already appears in the earlier refit cell.
X_test_tfidf = tfidf.transform(test_df["clean_text"])


In [27]:
# Predict an emotion label for every test row with the full-data model.
# NOTE(review): the same prediction is already computed in the earlier refit cell.
test_preds = svm_clf_full.predict(X_test_tfidf)
# Show how many predictions were produced.
len(test_preds)



16281

In [28]:
# Build the submission frame: the test ids plus the predicted emotion, as a new
# DataFrame so test_df itself is left unchanged.
submission = test_df[["id"]].assign(emotion=test_preds)


In [29]:
# Write the submission file to the working directory without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)
print("Saved submission.csv!")


Saved submission.csv!
