In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from gensim.models import Word2Vec

In [4]:
# Fetch the NLTK resources needed below: the punkt tokenizer data (both
# package names are requested) and the stop-word corpus.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Cache for the stop-word set so the corpus is read once, not once per tweet.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalise one tweet and return its list of content tokens.

    Steps: lower-case, strip URLs and @mentions, drop every character that is
    not an ASCII letter or whitespace, tokenize with ``word_tokenize`` and
    remove English stop words.

    Parameters
    ----------
    text : object
        The raw tweet. Non-string values are converted with ``str``; missing
        values (``None`` or a float NaN) yield an empty token list instead of
        the bogus tokens ``'none'`` / ``'nan'``.

    Returns
    -------
    list of str
        The remaining lower-case tokens, in their original order.
    """
    # Missing cells come back from pandas as NaN (a float that is != itself).
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    # Loop-invariant: fetched from the cache rather than rebuilt per call.
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# Read the tweet dataset, then attach a 'tokens' column holding the cleaned
# token list produced by clean_text for each entry of the 'tweet' column.
df = pd.read_csv('data/mLabel_tweets.csv')
df = df.assign(tokens=df['tweet'].apply(clean_text))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Turn each label string into a list of label names.
# str.split() with no argument splits on any run of whitespace and ignores
# leading/trailing blanks, so stray or doubled spaces cannot produce an
# empty-string label (which split(' ') would, creating a spurious '' class).
df['label_list'] = df['labels'].apply(lambda x: str(x).split())

# Encode the label lists as a binary indicator matrix, one column per class.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['label_list'])
print(f"Labels: {mlb.classes_}")

Labels: ['conspiracy' 'country' 'ineffective' 'ingredients' 'mandatory' 'none'
 'pharma' 'political' 'religious' 'rushed' 'side-effect' 'unnecessary']


In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs. Tokens and label matrix are split together so
# their rows stay aligned.
(X_train_tokens,
 X_test_tokens,
 y_train,
 y_test) = train_test_split(df['tokens'], y, test_size=0.2, random_state=42)

In [7]:
print("\nTraining Word2Vec...")
# Train embeddings on the training split only, so no test-set text leaks in.
# seed + workers=1 make training repeatable: per the gensim documentation,
# multi-threaded training introduces ordering jitter, so a fixed seed alone is
# not enough. (Bit-for-bit reproducibility across interpreter launches also
# needs PYTHONHASHSEED to be set.)
w2v_model = Word2Vec(
    sentences=list(X_train_tokens),
    vector_size=100,
    window=5,
    min_count=2,
    workers=1,
    seed=42,
)

def load_glove_embeddings(path, vector_size):
    """Parse a GloVe text file into a ``{token: float32 vector}`` dict.

    Each line is expected to be ``<token> <v1> ... <v_vector_size>`` separated
    by single spaces. The line is split from the right so that a token which
    itself contains spaces is kept intact. Lines that do not carry exactly
    ``vector_size`` values after the token (for example blank lines) are
    skipped instead of raising.

    Parameters
    ----------
    path : str
        Location of the GloVe text file (read as UTF-8).
    vector_size : int
        Number of float components expected per line.

    Returns
    -------
    dict
        Mapping from token to a 1-D ``float32`` numpy array.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If a line has the right field count but a non-numeric component.
    """
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Strip only the newline: the token itself may be whitespace-like.
            parts = line.rstrip('\n').rsplit(' ', vector_size)
            if len(parts) != vector_size + 1:
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings


print("Loading GloVe...")
try:
    embeddings_index = load_glove_embeddings('glove.6B.100d.txt', 100)
except FileNotFoundError:
    # An empty index makes the later cells skip the GloVe-based models.
    embeddings_index = {}
    print("WARNING: 'glove.6B.100d.txt' not found. GloVe models will be skipped.")


Training Word2Vec...
Loading GloVe...


In [8]:
def get_vector(tokens, model_dict, vector_size):
    """Average the embedding vectors of the tokens known to ``model_dict``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model_dict : mapping
        Anything supporting ``word in model_dict`` and ``model_dict[word]``.
    vector_size : int
        Length of the zero vector returned when no token is found.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the found vectors, or ``np.zeros(vector_size)``
        if none of the tokens is present.
    """
    found = []
    for token in tokens:
        if token in model_dict:
            found.append(model_dict[token])

    if found:
        return np.mean(found, axis=0)
    return np.zeros(vector_size)

print("Vectorizing data...")
# Build one 100-d averaged Word2Vec vector per document, for the train split
# first and then the test split.
X_train_w2v, X_test_w2v = (
    np.array([get_vector(t, w2v_model.wv, 100) for t in token_split])
    for token_split in (X_train_tokens, X_test_tokens)
)

Vectorizing data...


In [9]:
# Default to None so later cells can detect that GloVe features are unavailable;
# the matrices are only built when the embedding index is non-empty.
X_train_glove = X_test_glove = None
if embeddings_index:
    X_train_glove = np.array(
        [get_vector(t, embeddings_index, 100) for t in X_train_tokens]
    )
    X_test_glove = np.array(
        [get_vector(t, embeddings_index, 100) for t in X_test_tokens]
    )

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, multilabel_confusion_matrix, roc_curve, auc

# Per-model predictions and scores, filled in by the training loop.
results = dict()

# Each entry: (display name, unfitted base estimator, train features, test features).
models_config = [
    (
        "LR + Word2Vec",
        LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42),
        X_train_w2v,
        X_test_w2v,
    ),
    (
        "MLP + Word2Vec",
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
        X_train_w2v,
        X_test_w2v,
    ),
    (
        "LR + GloVe",
        LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42),
        X_train_glove,
        X_test_glove,
    ),
    (
        "MLP + GloVe",
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
        X_train_glove,
        X_test_glove,
    ),
]

In [11]:
# Train and evaluate every configured (classifier, embedding) combination.
for name, base_clf, X_tr, X_te in models_config:
    # Feature matrices are None when the GloVe file could not be loaded.
    if X_tr is None:
        continue

    print(f"Processing {name}...")

    # Wrap the base estimator so one copy is fitted per label column.
    clf = MultiOutputClassifier(base_clf)
    clf.fit(X_tr, y_train)

    y_pred = clf.predict(X_te)

    # Positive-class probability per label, shape (n_samples, n_labels).
    try:
        probas_list = clf.predict_proba(X_te)
    except AttributeError as exc:
        # Only "estimator has no predict_proba" is tolerated; any other error
        # (and KeyboardInterrupt) now propagates instead of being swallowed.
        print(f"WARNING: {name} does not support predict_proba ({exc}); using zero scores.")
        y_score = np.zeros(y_pred.shape, dtype=float)
    else:
        # Locate the column of class 1 via classes_ rather than assuming it is
        # index 1: a label with a single training class yields one column,
        # which is the positive class when that single class is 1.
        y_score = np.column_stack([
            p[:, list(est.classes_).index(1)] if 1 in est.classes_ else np.zeros(p.shape[0])
            for est, p in zip(clf.estimators_, probas_list)
        ])

    # Subset accuracy (exact match of the full label set), per-label error
    # rate, and micro-averaged F1 over all label decisions.
    acc = accuracy_score(y_test, y_pred)
    ham = hamming_loss(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='micro')

    print(f"Accuracy:     {acc:.4f}")
    print(f"Hamming Loss: {ham:.4f}")
    print(f"F1 Score:     {f1:.4f}")

    results[name] = {'y_pred': y_pred, 'y_score': y_score}

Processing LR + Word2Vec...
Accuracy:     0.1879
Hamming Loss: 0.0895
F1 Score:     0.2967
Processing MLP + Word2Vec...
Accuracy:     0.1753
Hamming Loss: 0.0898
F1 Score:     0.2895
Processing LR + GloVe...
Accuracy:     0.3184
Hamming Loss: 0.0806
F1 Score:     0.4698
Processing MLP + GloVe...




Accuracy:     0.3471
Hamming Loss: 0.0882
F1 Score:     0.5207
