In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
from xgboost import XGBClassifier

In [4]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\golik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\golik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\golik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
data = pd.read_csv("spam_ham_data.csv", encoding="latin-1")

In [8]:
data = data.drop(columns=["Unnamed: 2","Unnamed: 3", "Unnamed: 4"])

In [9]:
data.shape

(5572, 2)

In [10]:
X, y = data.v2, data.v1

In [11]:
target_mapping = {"ham": 0, "spam": 1}

In [12]:
y = y.map(target_mapping)

In [13]:
# sentence = data.iloc[1]["v2"]
# tokens = word_tokenize(sentence, language="english")
# tokens_without_punctuation = [word for word in tokens if word not in string.punctuation]
# english_stop_words = stopwords.words("english")
# tokens_without_stopwords_and_punctuation = [word for word in tokens_without_punctuation if word not in english_stop_words]

# snowball = SnowballStemmer(language="english")
# stemmed_tokens = [snowball.stem(word) for word in tokens_without_stopwords_and_punctuation]

# print(sentence)
# print(tokens)
# print(tokens_without_punctuation)
# print(tokens_without_stopwords_and_punctuation)
# print(stemmed_tokens)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [15]:
print(X_train.shape, X_test.shape)

(4179,) (1393,)


In [16]:
snowball = SnowballStemmer(language="english")
english_stop_words = stopwords.words("english")

def tokenize_text(text: str, remove_stop_words: bool = True):
    # Разделяем (токенизируем) отдельные элементы (буквы или символы) в тексте
    tokens = word_tokenize(text, language="english")
    # Убираем символы пунктуации
    tokens = [i for i in tokens if  i not in string.punctuation]
    # Убираем стоп слова (дополнительные слова в языках, которые не несут смысловой нагрузки)
    if remove_stop_words:
        tokens = [i for i in tokens if i not in english_stop_words]
    # Приводим к нижнему регитру и удаляем окончания
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [17]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda text: tokenize_text(text, remove_stop_words=True),
    token_pattern=None
)

In [18]:
train_features = vectorizer.fit_transform(X_train)

In [19]:
train_features[:, 0]

<4179x1 sparse matrix of type '<class 'numpy.float64'>'
	with 98 stored elements in Compressed Sparse Row format>

# Обучение RandomForest модели на tf-idf

In [20]:
model = RandomForestClassifier(
    class_weight={0: 1, 1: 6.5},
    n_estimators=300,
    max_depth=4,
    min_samples_split=5,
    random_state=42
)

In [21]:
model.fit(train_features, y_train)

In [22]:
test_features = vectorizer.transform(X_test)
y_pred = model.predict(test_features)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1206
           1       0.96      0.84      0.90       187

    accuracy                           0.97      1393
   macro avg       0.97      0.92      0.94      1393
weighted avg       0.97      0.97      0.97      1393



# Инициализация эмбендинга

In [23]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [24]:
embeddings = embedding_model.encode(X_train.tolist())

# Обучение RandomForest модели на эмбендинге

In [25]:
model2 = RandomForestClassifier(
    class_weight={0: 1, 1: 6},
    n_estimators=300,
    max_depth=5,
    min_samples_split=5,
    random_state=42
)

In [26]:
model2.fit(embeddings, y_train)

In [27]:
test_features = embedding_model.encode(X_test.tolist())
y_pred = model2.predict(test_features)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1206
           1       0.94      0.89      0.91       187

    accuracy                           0.98      1393
   macro avg       0.96      0.94      0.95      1393
weighted avg       0.98      0.98      0.98      1393



# Обучение XGBoost модели на эмбендинге

In [28]:
model3 = XGBClassifier(
    scale_pos_weight=6.5, 
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42e
)

In [29]:
model3.fit(embeddings, y_train)

In [30]:
test_features = embedding_model.encode(X_test.tolist())
y_pred = model3.predict(test_features)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1206
           1       0.97      0.93      0.95       187

    accuracy                           0.99      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [32]:
type(embeddings)

numpy.ndarray