# Загрузка библиотек

In [72]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import joblib

# === 1. Загрузка данных ===

In [73]:
# Read the raw CSV, keep only the label/text columns under readable names,
# then encode the label as an integer: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data = (
    pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
data['label'] = data['label'].map(label_codes)

# === 2. Очистка текста ===

In [74]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every clean_data call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_data(text):
    """Normalize a raw message into lowercase, single-space-separated text.

    Steps, in order: lowercase, replace ``http...`` links with a space,
    replace digit runs with a space, delete ASCII punctuation, collapse
    whitespace runs and trim the ends.

    Args:
        text: The message to clean. ``None`` and float NaN are treated as an
            empty message; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+", " ", text)          # drop links
    text = re.sub(r"\d+", " ", text)              # drop numbers
    text = text.translate(_PUNCTUATION_TABLE)     # drop punctuation
    return re.sub(r"\s+", " ", text).strip()      # squeeze whitespace

# Clean every message in place, then carve out a stratified 80/20 split with a
# fixed seed so the split is reproducible.
data['message'] = data['message'].map(clean_data)

X, y = data['message'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Оптимизация KNN

In [75]:
# TF-IDF features feeding a k-nearest-neighbours classifier.
knn_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('knn', KNeighborsClassifier()),
])

# Hyper-parameter grid; keys use the "<step>__<param>" pipeline convention.
param_grid = dict(
    tfidf__max_features=[5000, 10000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    knn__n_neighbors=[3, 5, 7],
    knn__metric=['cosine', 'euclidean'],
    knn__weights=['uniform', 'distance'],
)

# === Обучение ===

In [76]:
# Exhaustive search over param_grid with 5-fold CV, scored by F1, using all
# available CPU cores.
knn_grid = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
knn_grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'knn__metric': ['cosine', 'euclidean'], 'knn__n_neighbors': [3, 5, ...], 'knn__weights': ['uniform', 'distance'], 'tfidf__max_features': [5000, 10000], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_neighbors,3
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'cosine'
,metric_params,
,n_jobs,


# === 5. Обучение k-NN ===

In [77]:
# Report the winning hyper-parameters and score the refit model on the test split.
print("–õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã KNN:", knn_grid.best_params_)
y_pred = knn_grid.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("–¢–æ—á–Ω–æ—Å—Ç—å:", test_accuracy)

test_f1 = f1_score(y_test, y_pred)
print("F1-score:", test_f1)

report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

–õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã KNN: {'knn__metric': 'cosine', 'knn__n_neighbors': 3, 'knn__weights': 'distance', 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
–¢–æ—á–Ω–æ—Å—Ç—å: 0.9766816143497757
F1-score: 0.9057971014492754

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# === Сохранение модели ===

In [78]:
# Persist the whole refit pipeline (vectorizer + classifier) to disk.
best_pipeline = knn_grid.best_estimator_
joblib.dump(best_pipeline, 'knn_spam_model.pkl')
print("\n–ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –∫–∞–∫ 'knn_spam_model.pkl'")


–ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –∫–∞–∫ 'knn_spam_model.pkl'


# Сохранение векторизатора отдельно (для возможности использования отдельно)

In [79]:
# Pull the fitted TF-IDF step out of the best pipeline and persist it on its own.
fitted_steps = knn_grid.best_estimator_.named_steps
vectorizer = fitted_steps['tfidf']
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ç–æ—Ä —Å–æ—Ö—Ä–∞–Ω–µ–Ω –∫–∞–∫ 'tfidf_vectorizer.pkl'")

–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ç–æ—Ä —Å–æ—Ö—Ä–∞–Ω–µ–Ω –∫–∞–∫ 'tfidf_vectorizer.pkl'


# =============================================================================
# ОТДЕЛЬНАЯ ПРОВЕРКА МОДЕЛИ НА НОВЫХ ДАННЫХ
# =============================================================================

In [81]:
# Hand-written sample messages; the trailing comment on each line is the
# label a human would assign.
test_messages = [
    "Congratulations! You won $1000 prize! Call now to claim.",  # spam
    "Hey, are we still meeting for lunch tomorrow?",  # not spam
    "URGENT: Your bank account needs verification. Click here.",  # spam
    "Hi mom, I'll be home late today. See you for dinner",  # not spam
    "FREE iPhone waiting for you! Claim your prize now!",  # spam
    "Meeting rescheduled to 3 PM. Please confirm attendance."  # not spam
]

# Reload the pipeline that was written to disk earlier in the notebook.
loaded_model = joblib.load('knn_spam_model.pkl')

In [82]:
# Classify the sample messages and print the verdict with the spam probability.
print("\n–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –¥–ª—è —Ç–µ—Å—Ç–æ–≤—ã—Ö —Å–æ–æ–±—â–µ–Ω–∏–π:")
print("-" * 50)

# Clean everything up front so the model is queried once per batch instead of
# twice per message.
cleaned_messages = [clean_data(message) for message in test_messages]
predictions = loaded_model.predict(cleaned_messages)
probabilities = loaded_model.predict_proba(cleaned_messages)

# predict_proba columns follow classes_; look the spam label (1) up rather
# than assuming it sits in column 1.
spam_column = list(loaded_model.classes_).index(1)

for i, (message, prediction, probability) in enumerate(
    zip(test_messages, predictions, probabilities), 1
):
    spam_prob = probability[spam_column] * 100  # spam probability in percent
    label = "–°–ü–ê–ú" if prediction == 1 else "–ù–ï –°–ü–ê–ú"

    # Only show an ellipsis when the preview was actually cut off.
    preview = message if len(message) <= 50 else f"{message[:50]}..."

    print(f"{i}. –°–æ–æ–±—â–µ–Ω–∏–µ: {preview}")
    print(f"   –†–µ–∑—É–ª—å—Ç–∞—Ç: {label} (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: {spam_prob:.1f}%)")
    print()


–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –¥–ª—è —Ç–µ—Å—Ç–æ–≤—ã—Ö —Å–æ–æ–±—â–µ–Ω–∏–π:
--------------------------------------------------
1. –°–æ–æ–±—â–µ–Ω–∏–µ: Congratulations! You won $1000 prize! Call now to ...
   –†–µ–∑—É–ª—å—Ç–∞—Ç: –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 100.0%)

2. –°–æ–æ–±—â–µ–Ω–∏–µ: Hey, are we still meeting for lunch tomorrow?...
   –†–µ–∑—É–ª—å—Ç–∞—Ç: –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 0.0%)

3. –°–æ–æ–±—â–µ–Ω–∏–µ: URGENT: Your bank account needs verification. Clic...
   –†–µ–∑—É–ª—å—Ç–∞—Ç: –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 28.7%)

4. –°–æ–æ–±—â–µ–Ω–∏–µ: Hi mom, I'll be home late today. See you for dinne...
   –†–µ–∑—É–ª—å—Ç–∞—Ç: –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 0.0%)

5. –°–æ–æ–±—â–µ–Ω–∏–µ: FREE iPhone waiting for you! Claim your prize now!...
   –†–µ–∑—É–ª—å—Ç–∞—Ç: –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 67.1%)

6. –°–æ–æ–±—â–µ–Ω–∏–µ: Meeting rescheduled to 3 PM. Please confirm attend...
   –†–µ–∑—É–ª—å—Ç–∞—Ç: –ù–ï 

In [83]:
# Evaluate the reloaded model on the held-out test split.
print("–ü—Ä–æ–≤–µ—Ä–∫–∞ –Ω–∞ –∏—Å—Ö–æ–¥–Ω–æ–π —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ:")
print("-" * 40)

# X_test was already passed through clean_data before the train/test split,
# exactly like the training data, so it is used as-is. Cleaning it a second
# time would preprocess it differently from what the model was fitted on
# (clean_data is not strictly idempotent: deleting punctuation can create a
# new "http..." token that a second pass would then strip).
X_test_cleaned = X_test
y_pred_loaded = loaded_model.predict(X_test_cleaned)

print(f"–¢–æ—á–Ω–æ—Å—Ç—å –∑–∞–≥—Ä—É–∂–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏: {accuracy_score(y_test, y_pred_loaded):.4f}")
print(f"F1-score –∑–∞–≥—Ä—É–∂–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏: {f1_score(y_test, y_pred_loaded):.4f}")

–ü—Ä–æ–≤–µ—Ä–∫–∞ –Ω–∞ –∏—Å—Ö–æ–¥–Ω–æ–π —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ:
----------------------------------------
–¢–æ—á–Ω–æ—Å—Ç—å –∑–∞–≥—Ä—É–∂–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏: 0.9767
F1-score –∑–∞–≥—Ä—É–∂–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏: 0.9058


In [84]:
# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤–Ω–æ–π –ø—Ä–æ–≤–µ—Ä–∫–∏
def check_spam_interactive(model=None):
    """Run a console loop that classifies messages typed by the user.

    Each non-blank line is cleaned with ``clean_data`` and classified; the
    verdict and the spam probability (in percent) are printed. The loop ends
    when the user types ``quit`` (case-insensitive, surrounding whitespace
    ignored) or when standard input is closed.

    Args:
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``. Defaults to the module-level ``loaded_model``.
    """
    if model is None:
        model = loaded_model

    # predict_proba columns follow classes_; find the spam label (1) instead
    # of assuming it is the second column.
    spam_column = list(model.classes_).index(1)

    print("\n" + "="*50)
    print("–ò–ù–¢–ï–†–ê–ö–¢–ò–í–ù–ê–Ø –ü–†–û–í–ï–†–ö–ê –°–ü–ê–ú-–§–ò–õ–¨–¢–†–ê")
    print("="*50)
    print("–í–≤–µ–¥–∏—Ç–µ —Å–æ–æ–±—â–µ–Ω–∏–µ –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏ (–∏–ª–∏ 'quit' –¥–ª—è –≤—ã—Ö–æ–¥–∞):")

    while True:
        try:
            user_input = input("\n–í–∞—à–µ —Å–æ–æ–±—â–µ–Ω–∏–µ: ")
        except EOFError:
            # stdin was closed (e.g. non-interactive run): leave quietly.
            break

        stripped = user_input.strip()
        if stripped.lower() == 'quit':
            break
        if not stripped:
            continue  # blank line: just prompt again

        # Clean the text the same way the training data was cleaned, then predict.
        batch = [clean_data(user_input)]
        prediction = model.predict(batch)[0]
        spam_prob = model.predict_proba(batch)[0][spam_column] * 100

        if prediction == 1:
            print(f"üö´ –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å: {spam_prob:.1f}%)")
        else:
            print(f"‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: {spam_prob:.1f}%)")


In [None]:
# Start the interactive console check.
check_spam_interactive()

# Confusion matrix of the reloaded model on the test split, for a closer look
# at the error types.
cm = confusion_matrix(y_test, y_pred_loaded)
print("\n–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫:")
print(cm)
print("\n(0 - –Ω–µ —Å–ø–∞–º, 1 - —Å–ø–∞–º)")


–ò–ù–¢–ï–†–ê–ö–¢–ò–í–ù–ê–Ø –ü–†–û–í–ï–†–ö–ê –°–ü–ê–ú-–§–ò–õ–¨–¢–†–ê
–í–≤–µ–¥–∏—Ç–µ —Å–æ–æ–±—â–µ–Ω–∏–µ –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏ (–∏–ª–∏ 'quit' –¥–ª—è –≤—ã—Ö–æ–¥–∞):
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 0.0%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 0.0%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 0.0%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 33.3%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 33.3%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 0.0%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 33.3%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 33.3%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 33.3%)
‚úÖ –ù–ï –°–ü–ê–ú (–≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Å–ø–∞–º–∞: 33.3%)

–ò–ù–§–û–†–ú–ê–¶–ò–Ø –û –ú–û–î–ï–õ–ò
–†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: 10000
–ú–µ—Ç—Ä–∏–∫–∞: cosine
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–æ—Å–µ–¥–µ–π: 3
–í–µ—Å: distance

–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫:
[[964   2]
 [ 24 125]]

(0 - –Ω–µ —Å–ø–∞–º, 1 - 