In [3]:
#importing important libraries and modules and defining dataframe
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
# Load the raw SMS data set into a DataFrame; latin-1 decoding is requested
# for compatibility with the encoding of this particular file.
df = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')
print(df)

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN  


In [4]:
# Check for missing data: tally the null entries in each column of the raw frame
null_values = df.isna().sum()
print(null_values)
# As shown, v1 and v2 contain no nulls; only the three "Unnamed" columns do
# (they are almost entirely empty), and the cells below use just the first two columns.

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Define the training data: column 1 (v2) holds the message text and
# column 0 (v1) holds the "ham"/"spam" label.
X = df.iloc[:, 1].values  # the second column
y = df.iloc[:, 0].values  # the first column

# Display the shapes of the raw X and y to verify there is one label per message
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Machine learning models only work with numbers, so both X and y are encoded.
# Labels: ham -> 0, spam -> 1. The mapping is indexed directly (rather than via
# .get) so that an unexpected label fails immediately with a KeyError naming it
# instead of producing None.
label_mapping = {"ham": 0, "spam": 1}
y_enc = np.array([label_mapping[label] for label in y])

# Messages: CountVectorizer breaks each text into individual words, builds a
# vocabulary of all the unique words, and counts how often each vocabulary word
# occurs in each message (a bag-of-words representation).
vectorizer = CountVectorizer()
x_bow = vectorizer.fit_transform(X)
# Keep the result as the sparse matrix returned by fit_transform. A bag-of-words
# matrix is overwhelmingly zeros, and densifying it with .toarray() would
# allocate every one of its rows x vocabulary-size cells (5572 x 8672 here).
# train_test_split and LogisticRegression both accept sparse input directly.
x_enc = x_bow

print("Shape of X:", x_enc.shape)
print("Shape of y:", y_enc.shape)

Shape of X: (5572,)
Shape of y: (5572,)
Shape of X: (5572, 8672)
Shape of y: (5572,)


In [13]:
# NOTE: no feature scaling is applied because the features are all in the same range;
# otherwise scikit-learn's StandardScaler would be used to scale the data.
# The data is split three ways so that, once fitting is done, the model can be checked
# for bias (underfitting) or variance (overfitting).
from sklearn.model_selection import train_test_split

# 60% of the dataset becomes the training set; the remaining 40% is parked in x_ and y_.
x_train, x_, y_train, y_ = train_test_split(x_enc, y_enc, random_state=1, test_size=0.4)

# The parked 40% is halved: one half for cross validation, the other half for the test set.
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, random_state=1, test_size=0.5)

# Report the shape of every split (a blank line separates the three groups)
print(
    f"the shape of the training set (input) is: {x_train.shape}\n"
    f"the shape of the training set (target) is: {y_train.shape}\n\n"
    f"the shape of the cross validation set (input) is: {x_cv.shape}\n"
    f"the shape of the cross validation set (target) is: {y_cv.shape}\n\n"
    f"the shape of the test set (input) is: {x_test.shape}\n"
    f"the shape of the test set (target) is: {y_test.shape}"
)

the shape of the training set (input) is: (3343, 8672)
the shape of the training set (target) is: (3343,)

the shape of the cross validation set (input) is: (1114, 8672)
the shape of the cross validation set (target) is: (1114,)

the shape of the test set (input) is: (1115, 8672)
the shape of the test set (target) is: (1115,)


In [14]:
# Note: the training set is already large, so there is no need to increase its size.
from sklearn.linear_model import LogisticRegression

# Create the logistic regression classifier with scikit-learn's default settings
lr_model = LogisticRegression()
# Fit it to the training split; as the cell's last expression, the fitted estimator is displayed
lr_model.fit(X=x_train, y=y_train)

LogisticRegression()

In [16]:
# Model evaluation: the fitted model is scored on the training, cross-validation and test
# sets, and each accuracy is turned into an error rate (1 - accuracy), i.e. the fraction
# of predictions that miss the target.
#   - a cross-validation error much greater than the training error means the model has
#     high variance and has overfit the training set
#   - a very high training error means the model has high bias and has underfit
err_train, err_cv, err_test = (
    1 - lr_model.score(features, targets)
    for features, targets in ((x_train, y_train), (x_cv, y_cv), (x_test, y_test))
)

print("train error:", err_train)
print("cross validation error", err_cv)
print("test error", err_test)

train error: 0.0032904576727490475
cross validation error 0.02064631956912033
test error 0.02780269058295959


In [None]:
#Conclusion: the model does very well on the training, cross-validation and test sets (errors of roughly
#0.3%, 2.1% and 2.8% respectively). It therefore shows neither high bias nor high variance and can be
#used for prediction.