# Spam Detector

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Retrieve the Data

The data is located at [https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv](https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv)

Dataset Source: [UCI Machine Learning Library](https://archive-beta.ics.uci.edu/dataset/94/spambase)


In [104]:
# Import the data
data = pd.read_csv("https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv")
data.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


I think that Random Forests Classifier will end up performing better than a Logistic Regression because there's an increased breadth of approaches towards classifying the data.

## Split the Data into Training and Testing Sets

In [105]:
# Create the labels set `y` and features DataFrame `X`

# i.e.,
#   Features -> Independent Variables
#   Label -> Dependent Variables / Target Column
#   So, we are determining the dependent variables from the independent variables.

X = data.copy()
y = data["spam"]

In [106]:
# Check the balance of the labels variable (`y`) by using the `value_counts` function.
#
# i.e., where 0 is not spam and is spam otherwise. it looks like theres 1 is 65% of 0.
# 
y.value_counts()

0    2788
1    1813
Name: spam, dtype: int64

In [107]:
# Split the data into X_train, X_test, y_train, y_test
#
# i.e., with train_test_split and positional arguments, split into respective data sets.
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Scale the Features

In [108]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler instance
scaler = StandardScaler()

In [109]:
# Fit the Standard Scaler with the training data
scaler.fit(X_train)

In [110]:
# Scale the training data
scaler.transform(X_train)

array([[-3.35060914e-01, -1.65967111e-01, -5.58428618e-01, ...,
        -2.43705409e-01, -4.42553735e-01, -8.08630342e-01],
       [-7.47168576e-02, -3.84032060e-02, -4.01917084e-01, ...,
         1.57823987e-01,  2.66093168e+00, -8.08630342e-01],
       [-3.35060914e-01, -1.65967111e-01, -5.58428618e-01, ...,
         1.40965093e+00,  2.27711824e-01, -8.08630342e-01],
       ...,
       [ 4.78514262e-01, -1.65967111e-01,  4.39332414e-01, ...,
        -1.72847280e-01, -1.42547209e-03,  1.23665901e+00],
       [-3.35060914e-01, -1.65967111e-01, -5.58428618e-01, ...,
         4.55428128e-01, -2.00946383e-01,  1.23665901e+00],
       [-3.35060914e-01, -1.65967111e-01, -5.58428618e-01, ...,
        -1.30332403e-01, -3.22529438e-01,  1.23665901e+00]])

In [111]:
# Train a Logistic Regression model and print the model score
from sklearn.linear_model import LogisticRegression
logistic_regression_model = LogisticRegression(max_iter=3000)
logistic_regression_model.fit(X_train, y_train)


In [112]:
# Make and save testing predictions with the saved logistic regression model using the test data
testing_predictions = logistic_regression_model.predict(X_test)

# Review the predictions
testing_predictions

array([1, 0, 0, ..., 0, 0, 1])

In [113]:
# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.

print(f"Accuracy Score: {accuracy_score(y_test, testing_predictions)}")

Accuracy Score: 0.998262380538662


In [60]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)


In [65]:
# Make and save testing predictions with the saved logistic regression model using the test data
random_forest_predictions = random_forest_model.predict(X_test)
score = accuracy_score(y_test, random_forest_predictions)

# Review the predictions
print(f"Accuracy Score: {score}")


Accuracy Score: 1.0


In [None]:
#
# Testing three times:
#    i. LogisticRegression: 
#        i. 0.983 (98%)
#       ii. 0.991 (99%)
#      iii. 0.993 (99%)
#   ii. RandomForest:
#        i.  1.0
#        ii. 1.0
#       iii. 1.0
#  iii. Questions:
#       i. Which model performed better?
#         i. The RandomForestClassifier performed better than LogisticRegression. 
#      ii. How does that compare to my predictions?
#         i. When compared to my predictions, they were true.