## Classificatio

<div dir=rtl style="text-align: left">

هدف: ساخت یک مدل برای تشخیص پیام‌های اسپم و غیر اسپم با استفاده از الگوریتم‌های Logistic Regression و Random Forest.

</div>

In [286]:
# =====================
# import libraries
# =====================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [294]:
# =====================
# load dataset
# =====================

# Load the dataset
df = pd.read_csv('SMSSpamCollection.csv', sep='\t', header=None, names=["label", "message"])


In [288]:
# =====================
# print info
# =====================

print('First rows of the dataset:')
print(df.head())

print('Shape of the dataset:')
print(df.shape)

print('Columns and data types:')
print(df.dtypes)

First rows of the dataset:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Shape of the dataset:
(5572, 2)
Columns and data types:
label      object
message    object
dtype: object


In [289]:
# =====================
# Preprocessing
# =====================

# Convert labels into numeric values (spam = 1, ham = 0)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Check for missing values
print("Missing values:")
print(df.isna().sum())
print(df['label'].isna().sum())



Missing values:
label      0
message    0
dtype: int64
0


In [290]:
# =====================
# Text vectorization
# =====================

# Use TfidfVectorizer to convert text messages to numerical features
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['message'])

# Define target variable
y = df['label']

In [291]:
# =====================
# Train-Test Split
# =====================

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Training set size: 4457 samples
Test set size: 1115 samples


In [292]:
# =====================
# Logistic Regression Model
# =====================

# Initialize Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Train the model
logreg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_logreg = logreg_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_logreg):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:")
print(classification_report(y_test, y_pred_logreg))


Logistic Regression Evaluation:
Accuracy: 0.9767
Confusion Matrix:
[[955  11]
 [ 15 134]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.92      0.90      0.91       149

    accuracy                           0.98      1115
   macro avg       0.95      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

