In [1]:
import os, sys

# Put the parent of the current working directory on the import path so the
# project-local `src` package (imported below) can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_importable = PROJECT_ROOT in sys.path
if not already_importable:
    sys.path.insert(0, PROJECT_ROOT)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from src.evaluation_metrics import topk_accuracy, precision_at_k

## Data Import

In [2]:
# Read ../data/cleaned.csv into a DataFrame and show its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/cleaned.csv')
data.head(n=5)

Unnamed: 0,label,text,emoji_char,label_id,text_len
0,backhand_index_pointing_right,AirdropBox event for ecological users is here...,👉,0,32
1,backhand_index_pointing_right,"Remember, success in online business is a mara...",👉,0,27
2,backhand_index_pointing_right,Thanks for the update the sh*t country the sh*...,👉,0,10
3,backhand_index_pointing_right,Hungry for active mutuals? Follow fastest Retw...,👉,0,8
4,backhand_index_pointing_right,It's confirmed. Whitelist for Shardeum Airdrop...,👉,0,40


In [3]:
# Quick sanity checks: frame dimensions, number of distinct labels,
# and the full text of the row labelled 0.
print(data.shape)
print(len(data["label"].unique()))
print(data.loc[0, "text"])

(782125, 5)
40
AirdropBox event for  ecological users is here. A total of 550,000 addresses are eligible for , and 5 types of AirDropbox with different scarcity can be issued.

Invitation code: 52DC39
Airdrop Portal:


## Train Test Split

In [4]:
# Features are the text column; targets are the label_id column.
X, y = data["text"], data["label_id"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Baseline Model

### Logistic Regression

In [5]:
# TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms,
# with scikit-learn's built-in English stop-word list removed.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed with them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Baseline classifier. `fit` returns the estimator itself, so ending the cell
# with `clf` displays the same fitted-estimator summary as before.
clf = LogisticRegression(max_iter=200, n_jobs=-1).fit(X_train_tfidf, y_train)
clf

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,200


In [31]:
# Hard class predictions from the baseline model on both splits.
y_pred_train = clf.predict(X_train_tfidf)
y_pred_test = clf.predict(X_test_tfidf)

# Report accuracy per split; the difference between the two numbers is the
# generalisation gap.
train_acc = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_acc)
test_acc = accuracy_score(y_test, y_pred_test)
print("Test Accuracy:", test_acc)

Training Accuracy: 0.22867828032603485
Test Accuracy: 0.19649672366949017


#### Multinomial with Balanced Class Weights

In [8]:
# Class-balanced logistic regression on the TF-IDF features.
#
# `multi_class="multinomial"` is deliberately not passed: the parameter is
# deprecated since scikit-learn 1.5 (it emits a FutureWarning and is scheduled
# for removal). With the lbfgs solver and more than two classes, the
# multinomial (softmax) formulation is already what gets fit, so the model is
# unchanged.
#
# class_weight="balanced" weights samples inversely to their class frequency
# in y_train.
logreg = LogisticRegression(
    max_iter=200,
    solver="lbfgs",
    class_weight="balanced",
)
logreg.fit(X_train_tfidf, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,200


In [9]:
# Per-class probability scores from the class-balanced logistic regression.
y_scores_train = logreg.predict_proba(X_train_tfidf)
y_scores_test = logreg.predict_proba(X_test_tfidf)

# Top-k accuracy on both splits for each cut-off.
# NOTE(review): presumably topk_accuracy treats column j of the score matrix
# as label_id j (i.e. label ids match logreg.classes_ order) — confirm against
# src.evaluation_metrics.
ks = [1, 3]
for k in ks:
    acc_train = topk_accuracy(y_train, y_scores_train, k)
    print(f"Logistic Regression Top-{k} Acc Train:", acc_train)
    acc_test = topk_accuracy(y_test, y_scores_test, k)
    print(f"Logistic Regression Top-{k} Acc Test:", acc_test)

# Precision@k reporting is currently disabled:
# print(f"Precision@{k} Train:", precision_at_k(y_train, y_scores_train, k))
# print(f"Precision@{k} Test:", precision_at_k(y_test, y_scores_test, k))


Logistic Regression Top-1 Acc Train: 0.22840498641521495
Logistic Regression Top-1 Acc Test: 0.19663097330989293
Logistic Regression Top-3 Acc Train: 0.4069634009908902
Logistic Regression Top-3 Acc Test: 0.3511778807735336


### MLP

In [37]:
# One hidden layer of 256 ReLU units on the TF-IDF features, trained in
# mini-batches of 256 for at most 20 iterations with a fixed seed.
mlp = MLPClassifier(
    random_state=42,
    max_iter=20,
    batch_size=256,
    activation='relu',
    hidden_layer_sizes=(256,),
)
mlp.fit(X_train_tfidf, y_train)



In [49]:
# Per-class probability scores from the plain MLP.
y_scores_train_mlp = mlp.predict_proba(X_train_tfidf)
y_scores_test_mlp = mlp.predict_proba(X_test_tfidf)

# Top-k accuracy on both splits for each cut-off.
ks = [1, 3]
for k in ks:
    acc_train = topk_accuracy(y_train, y_scores_train_mlp, k)
    print(f"MLP Top-{k} Acc Train:", acc_train)
    acc_test = topk_accuracy(y_test, y_scores_test_mlp, k)
    print(f"MLP Top-{k} Acc Test:", acc_test)

# Precision@k reporting is currently disabled:
# print(f"Precision@{k} Train:", precision_at_k(y_train, y_scores_train_mlp, k))
# print(f"Precision@{k} Test:", precision_at_k(y_test, y_scores_test_mlp, k))

MLP Top-1 Acc Train: 0.46645197378935593
MLP Top-1 Acc Test: 0.1809301582227905
MLP Top-3 Acc Train: 0.6566389643599169
MLP Top-3 Acc Test: 0.31713600767140804


In [6]:
# Same architecture and seed as the plain MLP, but with early stopping:
# 10% of the training data is set aside as a validation set and training
# halts once the validation score stops improving (or after 20 iterations).
mlp_early_stop = MLPClassifier(
    early_stopping=True,
    validation_fraction=0.1,
    random_state=42,
    max_iter=20,
    batch_size=256,
    activation='relu',
    hidden_layer_sizes=(256,),
)
mlp_early_stop.fit(X_train_tfidf, y_train)

0,1,2
,hidden_layer_sizes,"(256,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,256
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,20
,shuffle,True


In [None]:
# Per-class probability scores from the early-stopped MLP.
y_scores_train_mlp_es = mlp_early_stop.predict_proba(X_train_tfidf)
y_scores_test_mlp_es = mlp_early_stop.predict_proba(X_test_tfidf)

# Top-k accuracy on both splits for each cut-off.
ks = [1, 3]
for k in ks:
    acc_train = topk_accuracy(y_train, y_scores_train_mlp_es, k)
    print(f"MLP with early stop Top-{k} Acc Train:", acc_train)
    acc_test = topk_accuracy(y_test, y_scores_test_mlp_es, k)
    print(f"MLP with early stop Top-{k} Acc Test:", acc_test)


MLP with early stop Top-1 Acc Train: 0.24905545788716638
MLP with early stop Top-1 Acc Test: 0.19868946779606841
MLP with early stop Top-3 Acc Train: 0.4328528048585584
MLP with early stop Top-3 Acc Test: 0.35186830749560494
