In [2]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import numpy as np

In [3]:
# Descàrrega de recursos per a la lematització
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tonie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tonie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tonie\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
def eliminar_emojis(texto):
    if isinstance(texto, str):
        patron_emojis = re.compile(pattern="[" 
            "\U0001F600-\U0001F64F" "\U0001F300-\U0001F5FF"
            "\U0001F680-\U0001F6FF" "\U0001F700-\U0001F77F"
            "\U0001F780-\U0001F7FF" "\U0001F800-\U0001F8FF"
            "\U0001F900-\U0001F9FF" "\U0001FA00-\U0001FA6F"
            "\U0001FA70-\U0001FAFF" "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251" "]+", flags=re.UNICODE)
        return patron_emojis.sub(r'', texto)
    return texto

# Lematització
lemmatizer = WordNetLemmatizer()
def obtener_pos_tag(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lematizar_texto(texto):
    palabras = nltk.word_tokenize(texto)
    palabras_lematizadas = [lemmatizer.lemmatize(p, obtener_pos_tag(p)) for p in palabras]
    return ' '.join(palabras_lematizadas)

# Dataset

In [5]:
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
df['label'] = df['label'].replace({'ham': 0, 'spam': 1})
df['message'] = df['message'].apply(eliminar_emojis).apply(lematizar_texto)

# Stopwords
stopwords = ['a', 'an', 'the', 'in', 'on', 'at', 'to', 'of', 'and', 'or',
             'is', 'it', 'for', 'with', 'that', 'this', 'as', 'was', 'be',
             'are', 'were', 'been', 'from', 'by', 'about', 'into', 'out',
             'up', 'down', 'over', 'under', 'then', 'than', 'so', 'but', 'not']

X_text = df["message"]
y = df["label"]


  df['label'] = df['label'].replace({'ham': 0, 'spam': 1})


In [25]:

valors_k = [10]
resultats = []

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold



# 5-fold CV manual per evitar data leakage
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for k in valors_k:
    aucs = []
    for train_index, test_index in skf.split(X_text, y):
        X_train_raw, X_test_raw = X_text.iloc[train_index], X_text.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        vectorizer = CountVectorizer(max_features=10000, stop_words=stopwords)
        X_train_counts = vectorizer.fit_transform(X_train_raw)
        X_test_counts = vectorizer.transform(X_test_raw)

        lda = LatentDirichletAllocation(n_components=k, random_state=42)
        X_train_topics = lda.fit_transform(X_train_counts)
        X_test_topics = lda.transform(X_test_counts)
        model = XGBClassifier(
            max_depth=6,
            n_estimators=500,
            learning_rate=0.01,
            random_state=42
        )
        model.fit(X_train_topics, y_train)
        probs = model.predict_proba(X_test_topics)[:, 1]
        auc = roc_auc_score(y_test, probs)
        aucs.append(auc)
        print("iteració")

    resultats.append({"k": k, "auc_roc_mitjana": np.mean(aucs)})

df_resultats = pd.DataFrame(resultats)

iteració
iteració
iteració
iteració
iteració


In [28]:
df_resultats

Unnamed: 0,k,auc_roc_mitjana
0,10,0.965705
1,10,0.965705
