Importing Libraries

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

Data Preprocessing

In [10]:
data=pd.read_csv('spam.csv',encoding='latin-1')

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5572 non-null   object
 1   EmailText  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [21]:
data.isna().any()

Label        False
EmailText    False
dtype: bool

In [22]:
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Split the Dataset into Train and Test

In [None]:
# Split the data into features (X) and target (y)
X = data['EmailText']
y = data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model Development

In [14]:
# Initialize TF-IDF vectorizer and transform the text data
vectorizer = TfidfVectorizer(max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_vec, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test_vec)


Accuracy: 0.98
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       0.99      0.83      0.90       150

    accuracy                           0.98      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115



Evaluation

In [None]:
# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate and print the classification report
class_report = classification_report(y_test, y_pred)
print(class_report)

Validation

In [15]:
# Validate the model with multiple test cases
sample_email_texts = [
    "To use your credit, click the WAP link in the next txt message",
    "Hey how are you Dear !",
    "URGENT! You have won a 1 week FREE membership in our Ã¥Â£100,000 Prize Jackpot!",
    "Important meeting tomorrow at 9 AM.",
]

sample_email_vecs = vectorizer.transform(sample_email_texts)
predicted_labels = rf_classifier.predict(sample_email_vecs)

for i, email_text in enumerate(sample_email_texts):
    if predicted_labels[i] == 'spam':
        print(f"Sample email {i+1}: '{email_text}' - is classified as spam.")
    else:
        print(f"Sample email {i+1}: '{email_text}' - is not classified as spam.")


Sample email 1: 'To use your credit, click the WAP link in the next txt message' - is classified as spam.
Sample email 2: 'Hey how are you Dear !' - is not classified as spam.
Sample email 3: 'URGENT! You have won a 1 week FREE membership in our Ã¥Â£100,000 Prize Jackpot!' - is classified as spam.
Sample email 4: 'Important meeting tomorrow at 9 AM.' - is not classified as spam.
