<a href="https://colab.research.google.com/github/tusharj4/LMS/blob/main/spamemail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the raw text of `spambase.names`, the file that describes each feature column
file_path_names = '/content/spambase.names'

with open(file_path_names, mode='r') as names_file:
    spambase_names_content = names_file.read()

# Leave the text as the cell's last expression so the notebook echoes it
spambase_names_content


'| SPAM E-MAIL DATABASE ATTRIBUTES (in .names format)\n|\n| 48 continuous real [0,100] attributes of type word_freq_WORD \n| = percentage of words in the e-mail that match WORD,\n| i.e. 100 * (number of times the WORD appears in the e-mail) / \n| total number of words in e-mail.  A "word" in this case is any \n| string of alphanumeric characters bounded by non-alphanumeric \n| characters or end-of-string.\n|\n| 6 continuous real [0,100] attributes of type char_freq_CHAR\n| = percentage of characters in the e-mail that match CHAR,\n| i.e. 100 * (number of CHAR occurences) / total characters in e-mail\n|\n| 1 continuous real [1,...] attribute of type capital_run_length_average\n| = average length of uninterrupted sequences of capital letters\n|\n| 1 continuous integer [1,...] attribute of type capital_run_length_longest\n| = length of longest uninterrupted sequence of capital letters\n|\n| 1 continuous integer [1,...] attribute of type capital_run_length_total\n| = sum of length of unint

In [2]:
import pandas as pd

# Location of the raw Spambase CSV
file_path_data = '/content/spambase.data'

# Column labels in file order, following the layout described in spambase.names:
# 48 word-frequency features, 6 character-frequency features,
# 3 capital-run-length statistics, and the class label last.
_tracked_words = (
    "make address all 3d our over remove internet order mail "
    "receive will people report addresses free business email you credit "
    "your font 000 money hp hpl george 650 lab labs "
    "telnet 857 data 415 85 technology 1999 parts pm direct "
    "cs meeting original project re edu table conference"
).split()
_tracked_chars = [";", "(", "[", "!", "$", "#"]
_capital_run_stats = ["average", "longest", "total"]

feature_names = (
    [f"word_freq_{word}" for word in _tracked_words]
    + [f"char_freq_{char}" for char in _tracked_chars]
    + [f"capital_run_length_{stat}" for stat in _capital_run_stats]
    + ["spam"]
)

# Read the CSV as headerless data and attach the column labels from `feature_names`
df = pd.read_csv(
    file_path_data,
    names=feature_names,
    header=None,
)

# Preview the first few rows as the cell's result
df.head()


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Check for missing values in the dataset (per-column NaN counts, kept for inspection)
missing_values = df.isnull().sum()

# A row without a class label cannot be used for supervised learning. Drop such rows
# rather than mean-imputing the label, which would produce non-binary target values.
df_labeled = df[df.iloc[:, -1].notna()]

# Split the data into features and target variable. The target is read straight from
# the frame so it never passes through the imputer or the scaler.
X = df_labeled.iloc[:, :-1].to_numpy(dtype=float)
y = df_labeled.iloc[:, -1].to_numpy()

# Split the dataset into training (80%) and testing (20%) sets BEFORE fitting any
# preprocessing step, so test rows cannot influence the imputation means or the
# scaling statistics (train/test leakage). Same test_size/random_state as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the imputer and the scaler on the training rows only, then apply the fitted
# transforms to both partitions.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Full feature matrix passed through the train-fitted steps; retained so that any
# code referring to `X_scaled` keeps working.
X_scaled = scaler.transform(imputer.transform(X))

# Display the shape of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((3680, 57), (921, 57), (3680,), (921,))

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Candidate classifiers, keyed by the label used in the results table
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42)
}

# Metric label -> scoring function; each is called as fn(y_true, y_pred)
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# Fit every model on the training split and score it on the held-out test split
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # One row of the comparison table: {metric label: value}
    results[model_name] = {
        metric_label: metric_fn(y_test, y_pred)
        for metric_label, metric_fn in metric_fns.items()
    }

# Models as rows, metrics as columns
results_df = pd.DataFrame(results).T
print(results_df)


                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.919653   0.931694  0.874359  0.902116
Decision Tree        0.917481   0.917553  0.884615  0.900783
Random Forest        0.955483   0.975477  0.917949  0.945839
SVM                  0.934853   0.950820  0.892308  0.920635


In [5]:
# prompt: Select the Best Model

# Select the model with the highest F1 score.
# `results` maps each model name to a dict of metrics that includes "F1 Score".
# If no model scores above 0, `best_model_name` stays None.
#
# NOTE: the per-model score is deliberately not stored in a variable named
# `f1_score`. That name is the metric function imported from sklearn.metrics;
# rebinding it to a float would make any later `f1_score(...)` call fail.
best_model_name, best_model_score = None, 0

for model_name, model_results in results.items():
    candidate_f1 = model_results["F1 Score"]
    if candidate_f1 > best_model_score:
        best_model_name, best_model_score = model_name, candidate_f1

# Print the best model and its F1 score
print(f"Best Model: {best_model_name}, F1 Score: {best_model_score}")


Best Model: Random Forest, F1 Score: 0.9458388375165125
