# Import the Libraries

In [1]:
import os

import numpy as np
import pandas as pd
# Training/Testing Model
from sklearn.model_selection import train_test_split
# Text to Feature Vectors (numerical values)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Accuracy score of Regression Model
from sklearn.metrics import accuracy_score

# Pre-processing of Data

In [5]:
# Load the mail dataset from CSV into a pandas DataFrame.
# Later cells read its 'Category' and 'Message' columns.
# The file location can be overridden with the MAIL_DATA_CSV environment variable so the
# notebook also runs on other machines; the original local path remains the default.
mail_csv_path = os.environ.get(
    'MAIL_DATA_CSV',
    '/Users/Vishal/Desktop/College Stuff/Cutie Hack 2021/mail_data.csv',
)
mailcsv_data = pd.read_csv(mail_csv_path)

In [6]:
# Sanity check: show the freshly loaded frame (pandas truncates long frames in the printout)
print(mailcsv_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [7]:
# Replace every missing cell with an empty string. The result is a new frame;
# mailcsv_data itself is not modified.
not_missing = pd.notnull(mailcsv_data)
mailcsv_datanull = mailcsv_data.where(not_missing, other='')

In [11]:
# Peek at the first five rows of the cleaned frame
mailcsv_datanull.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Encode the labels numerically, in place: spam -> 0, ham (i.e. non-spam) -> 1.
# Any other value found in 'Category' is left as it is.
for label, code in (('spam', 0), ('ham', 1)):
    mailcsv_datanull.loc[mailcsv_datanull['Category'] == label, 'Category'] = code

In [14]:
# Split the frame into the model's two inputs:
#   var_x -> the message text (features), var_y -> the 0/1 category (labels)
var_x, var_y = mailcsv_datanull['Message'], mailcsv_datanull['Category']

In [17]:
# Show the messages followed by their encoded labels, one Series after the other
print(var_x, var_y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


# Test and Train Split for the Model

In [19]:
# Hold out 20% of the messages and labels for testing; the remaining 80% is used for training.
# random_state is fixed so the same split is produced on every run.
# train_test_split returns, in order: train X, test X, train Y, test Y.
split_parts = train_test_split(var_x, var_y, test_size=0.2, random_state=3)
Train_X, Test_X, Train_Y, Test_Y = split_parts

# Convert text values into numerical values for the model

In [20]:
# Turn the raw message text into TF-IDF feature vectors to use as input for the regression model.
# TF-IDF weights a term higher the more often it occurs within a message, and lower the
# more messages in the corpus contain it.
#   min_df=1             -> keep every term that appears in at least one message (no cut-off)
#   stop_words='english' -> drop common English words (the, is, am, was, did, a)
#   lowercase=True       -> must be a real boolean; the string 'True' only worked by being
#                           truthy and is rejected by parameter validation in recent
#                           scikit-learn releases
text_to_featurevec = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then reuse the
# fitted vectorizer on the test messages so the test data does not influence the features.
Train_X_features = text_to_featurevec.fit_transform(Train_X)
Test_X_features = text_to_featurevec.transform(Test_X)

# The labels are 0/1 values held in an object-dtype Series; cast them to a true integer dtype
Train_Y = Train_Y.astype('int')
Test_Y = Test_Y.astype('int')

In [21]:
# Show the raw training messages followed by their sparse TF-IDF representation
print(Train_X, Train_X_features, sep='\n')

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object
  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1,

# Train Logistic Regression Model

In [25]:
# Fit a logistic regression with default settings on the TF-IDF training features.
# fit() returns the estimator itself, so the bare name on the last line displays it.
ML_model = LogisticRegression().fit(Train_X_features, Train_Y)
ML_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# View Accuracy of the Model

In [30]:
# Predict labels for the training messages and score them against the true labels.
# This is accuracy on data the model was fitted on, so it is an optimistic figure.
predict_Y_Train = ML_model.predict(Train_X_features)
modelaccuracy_Y_Train = accuracy_score(Train_Y, predict_Y_Train)

# Also score the held-out 20% split, which the model has not seen during fitting.
# This is the number that estimates how well the model generalises.
predict_Y_Test = ML_model.predict(Test_X_features)
modelaccuracy_Y_Test = accuracy_score(Test_Y, predict_Y_Test)
print('Accuracy of model on test data : ', modelaccuracy_Y_Test)

In [31]:
# NOTE: modelaccuracy_Y_Train is computed from predictions on the training features,
# so the value printed here is training-set accuracy, not held-out accuracy.
print('Accuracy of model : ', modelaccuracy_Y_Train)

Accuracy of model :  0.9670181736594121


# Testing our Model

In [34]:
# A single hand-picked message to run through the trained pipeline
mail_test = ["Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. £1.50 SP:Tyrone"]

# Vectorise it with the already-fitted TF-IDF transformer, then classify it
mail_text_converted = text_to_featurevec.transform(mail_test)
model_pred = ML_model.predict(mail_text_converted)

# model_pred holds one label for the one message: 0 = spam, 1 = ham (not spam)
print(model_pred)

verdict = 'This email is a spam!' if model_pred[0] == 0 else 'This email is not a spam!'
print(verdict)

[0]
This email is a spam!
