### Import the Dependencies

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

### Data Collection & Pre-Processing

In [3]:
# Load the data from a CSV file into a pandas DataFrame.
# The location can be overridden with the MAIL_DATA_PATH environment variable so the
# notebook also runs on other machines; when the variable is not set, the original
# local path is used, so existing behavior is unchanged.
mail_data_path = os.environ.get(
    'MAIL_DATA_PATH',
    r'C:\Users\60115\Documents\Project\Python\Project3\Machine-Learning-Mail-Prediction\mail_data.csv',
)
raw_mail_data = pd.read_csv(mail_data_path)

In [5]:
# Preview the first rows (up to 20) of the raw data to see the columns and sample messages
raw_mail_data.head(20)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
# Summarize the raw data: row count, column names, non-null counts and dtypes
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Replace any null values with an empty string so every cell holds text.
# fillna returns a new DataFrame, so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.fillna('')

In [9]:
# Preview the first 5 rows after the null replacement
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Check the number of rows and columns; .shape is displayed as (rows, columns)

mail_data.shape

(5572, 2)

#### Label Encoding

In [14]:
# Encode the labels numerically: spam -> 0, ham -> 1
spam_rows = mail_data['Category'] == 'spam'
ham_rows = mail_data['Category'] == 'ham'

mail_data.loc[spam_rows, 'Category'] = 0
mail_data.loc[ham_rows, 'Category'] = 1

spam - 0 

ham - 1


In [15]:
# Split the frame into the model input (message text) and the target (encoded label)
x, y = mail_data['Message'], mail_data['Category']

In [16]:
# Display the message texts (the model input)
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [17]:
# Display the encoded labels (0 = spam, 1 = ham)
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

#### Splitting the Data into Training Data & Test Data

In [18]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [20]:
# Show the sizes of the full dataset, the training split and the test split (in that order)
for messages in (x, x_train, x_test):
    print(messages.shape)


(5572,)
(4457,)
(1115,)


#### Feature Extraction

In [21]:
# The labels are stored with dtype object; cast them to integers before training
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# TF-IDF turns each message into a numeric feature vector usable by Logistic Regression.
#   lowercase=True        lower-cases the text before tokenizing
#   stop_words='english'  drops common English words that carry little meaning
#                         (e.g. auxiliary verbs such as "is", "am", "are")
#   min_df=1              keeps every term that appears in at least one message,
#                         i.e. no term is discarded for being rare
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Learn the vocabulary and IDF weights from the training messages, then vectorize them
x_train_features = feature_extraction.fit_transform(x_train)
# Only transform the test messages (no re-fitting): they are vectorized with the
# vocabulary learned from the training data, so the test set stays unseen
x_test_features = feature_extraction.transform(x_test)

In [23]:
# Print the TF-IDF training features; each printed line is a (row, column) position
# followed by the weight stored there
print(x_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

### Training the Model

#### Logistic Regression Model

In [24]:
# Create a Logistic Regression classifier; no arguments are passed, so library defaults apply
model = LogisticRegression()

In [25]:
# Fit the Logistic Regression model on the TF-IDF training features and their labels
model.fit(X=x_train_features, y=y_train)

#### Evaluating the Trained Model

In [26]:
# Score the model on the same data it was trained on
prediction_on_training_data = model.predict(X=x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [27]:
# Report the fraction of training messages that were classified correctly
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9670181736594121


In [29]:
# Score the model on the held-out test data, which it did not see during training
prediction_on_test_data = model.predict(X=x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)

In [30]:
# Report the fraction of test messages that were classified correctly
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9659192825112107


The accuracy scores on both the training data and the test data are important, because a model can overfit the training data, which is a problem.

Overfitting means that the model performs very well on the training dataset and therefore gets a high accuracy score there. However, when its predictions are evaluated on the test dataset, the model performs badly and gets a low accuracy score.

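For instance, if we got 90% accuracy on the training dataset but only 60% on the test dataset, the model would be overfitting: it has fitted the training data too closely instead of learning patterns that generalize to new messages. That is why it is important to check both accuracies. Here the two scores are almost equal (about 96.7% on training data and 96.6% on test data), so there is no sign of overfitting.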

#### Building the Predictive System

In [32]:
# A single message to classify, wrapped in a list because the vectorizer works on a collection of texts
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."]

# Vectorize the message with the already-fitted TF-IDF vocabulary
input_data_features = feature_extraction.transform(input_mail)

# Predict the label of the message (1 = ham, anything else is reported as spam)
prediction = model.predict(input_data_features)

print("Ham mail" if prediction[0] == 1 else 'Spam mail')


Ham mail
