In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re

# Step 1: Load the Data
spam_data = pd.read_csv("spam-data.csv")
features = spam_data[['Number of Words', 'Number of Links', 'Number of Capitalized Words', 'Number of Spam Words']]
labels = spam_data['Class']  # Ensure 'Class' is the column name for labels

# Step 2: Build and Train Logistic Regression Model
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

# Step 3: Parse the `emails.txt` File and Extract Features
with open("emails.txt", 'r') as file:
    emails = file.read().split('----------------')

email_features = []
for email in emails:
    if email.strip():  # Check if the string is not just whitespace
        num_links = len(re.findall(r'http[s]?://\S+', email))
        num_words = len(re.findall(r'\w+', email))
        num_capitalized_words = len(re.findall(r'\b[A-Z]{2,}\b', email))
        num_spam_words = len(re.findall(r'\b(free|credit|offer|loan|winner|win|urgent)\b', email, flags=re.I))
        email_features.append([num_words, num_links, num_capitalized_words, num_spam_words])

# Step 4: Check Emails for Spam
email_features_df = pd.DataFrame(email_features, columns=['Number of Words', 'Number of Links', 'Number of Capitalized Words', 'Number of Spam Words'])
email_spam_predictions = model.predict(email_features_df)
for i, prediction in enumerate(email_spam_predictions):
    print(f"Email {i+1} is {'spam' if prediction else 'not spam'}")

# Step 5: Analyze the `spam-data.csv` File for Feature Importance
feature_importance = np.abs(model.coef_[0])
print("Feature Importance:\n", list(zip(features.columns, feature_importance)))
threshold = 0.1  # Arbitrary threshold for low importance
less_important_features = [features.columns[i] for i in range(len(feature_importance)) if feature_importance[i] < threshold]
print("Less Important Features:", less_important_features)


Accuracy: 0.9310344827586207
Email 1 is spam
Email 2 is not spam
Email 3 is spam
Feature Importance:
 [('Number of Words', 0.06945489225574682), ('Number of Links', 1.0684307821133459), ('Number of Capitalized Words', 0.4759124370163708), ('Number of Spam Words', 1.294608569856821)]
Less Important Features: ['Number of Words']
