In [1]:
import pandas as pd

# Load the dataset. Only the label (v1) and message text (v2) columns are used,
# so usecols keeps pandas from parsing any other columns in the CSV.
df = (
    pd.read_csv("spam.csv", encoding='latin-1', usecols=['v1', 'v2'])
    .rename(columns={'v1': 'label', 'v2': 'text'})
    [['label', 'text']]
)

# Convert the string label into a binary target (ham -> 0, spam -> 1).
# Series.map turns every value missing from the dict into NaN, so check first
# and fail loudly instead of silently producing NaN targets.
label_codes = {'ham': 0, 'spam': 1}
unmapped = ~df['label'].isin(list(label_codes))
if unmapped.any():
    raise ValueError(
        f"Unexpected label values in spam.csv: {df.loc[unmapped, 'label'].unique().tolist()}"
    )
df['label'] = df['label'].map(label_codes)

# Features (raw message text) and target (0/1 label)
X = df['text']
y = df['label']

df.head()


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Разделение на train/test

In [8]:

!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m1.9 MB/s[0m  [33m0:00:04[0m eta [36m0:00:01[0mm
[?25hDownloading joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.2 scikit-learn-1.7.2 threadpoolctl-3.6.0


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. Stratifying on y keeps the
# ham/spam proportions the same in both parts; the fixed seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


Бейзлайн (константная модель)

In [10]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

# Constant reference model: it ignores the features and always answers with
# the class that occurs most often in the training labels.
baseline = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_pred_baseline = baseline.predict(X_test)

print("F1-score бейзлайна:", f1_score(y_true=y_test, y_pred=y_pred_baseline))


F1-score бейзлайна: 0.0


Преобразование текста и обучение простой модели

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# TF-IDF features feeding a logistic regression. Keeping the vectorizer inside
# the pipeline means its vocabulary is learned from the training texts only.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=42)),
])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

print("F1-score логистической регрессии:", f1_score(y_true=y_test, y_pred=y_pred))


F1-score логистической регрессии: 0.8847583643122676


Более сложная модель с подбором гиперпараметров

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Same TF-IDF front end as the linear model, with a random forest on top.
pipe_rf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Hyperparameter grid; the 'clf__' prefix routes each value to the 'clf' step.
params = {
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [10, 20, None],
}

# 3-fold cross-validated search on the training split, scored by F1.
grid = GridSearchCV(estimator=pipe_rf, param_grid=params, scoring='f1', cv=3)
grid.fit(X_train, y_train)

# Evaluate the selected configuration on the held-out test split.
y_pred_rf = grid.predict(X_test)
print("Лучшие параметры:", grid.best_params_)
print("F1-score RandomForest:", f1_score(y_true=y_test, y_pred=y_pred_rf))


Лучшие параметры: {'clf__max_depth': None, 'clf__n_estimators': 50}
F1-score RandomForest: 0.8796992481203008


Интерпретация модели (SHAP для RandomForest)

In [14]:
!pip install shap

Collecting shap
  Downloading shap-0.50.0-cp314-cp314-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting tqdm>=4.27.0 (from shap)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba==0.63.0b1 (from shap)
  Downloading numba-0.63.0b1-cp314-cp314-macosx_10_15_universal2.whl.metadata (2.9 kB)
Collecting llvmlite==0.46.0b1 (from shap)
  Downloading llvmlite-0.46.0b1-cp314-cp314-macosx_11_0_universal2.whl.metadata (4.9 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.2-py3-none-any.whl.metadata (7.1 kB)
Downloading shap-0.50.0-cp314-cp314-macosx_11_0_arm64.whl (555 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m555.6/555.6 kB[0m [31m5.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading llvmlite-0.46.0b1-cp314-cp314-macosx_11_0_universal2.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37