This is a basic project to classify mails into spam or ham using logistic regression

In [9]:
import os
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

First we import all the required libraries

In [10]:
data = pd.read_csv("spam.csv",encoding='latin-1')

Then we import the data 

In [11]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
data['Unnamed: 2'].count()

50

In [13]:
data['Unnamed: 3'].count()

12

In [14]:
data['Unnamed: 4'].count()

6

So we see 3 NaN fields which we drop for our first model as the amount of data in them is very less.In required they can be incorporated later for further analysis

In [15]:
data.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [16]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Now lets change the column names for better representation.We change v1 to 'Labels' and v2 to 'Message'

In [17]:
data.rename(columns={'v1':'Label','v2':'Message'},inplace=True)

Now lets look at our data again


In [18]:
data

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


Now lets clean the data and drop the nan values

In [19]:
data.dropna(inplace=True)

In [20]:
data.shape

(5572, 2)

Now for the main thing we will count the total no of words we have in all of the sms combined.This will be the main step in the classifier

In [21]:
words=[]

for i in range(len(data['Message'])):
    blob=data['Message'][i]
    words+=blob.split(" ")

In [22]:
len(words)

86961

So we find out the total number of words to be equal to 86961.
Now we want to remove those words which have special charecters in them for ease of analysis


In [23]:
for i in range(len(words)):
    if not words[i].isalpha():
        words[i]=""

In [24]:
word_dict= Counter(words)
word_dict
len(word_dict)

7847

So we have around 8k individual words . We now remove the words which had special charecters in them or were different

In [25]:
word_dict

Counter({'Go': 14,
         'until': 26,
         'jurong': 1,
         '': 19860,
         'Available': 3,
         'only': 147,
         'in': 798,
         'bugis': 4,
         'n': 137,
         'great': 75,
         'world': 17,
         'la': 2,
         'e': 71,
         'Cine': 1,
         'there': 118,
         'got': 204,
         'amore': 1,
         'Ok': 97,
         'Joking': 1,
         'wif': 26,
         'u': 692,
         'Free': 39,
         'entry': 25,
         'a': 1327,
         'wkly': 10,
         'comp': 10,
         'to': 2134,
         'win': 38,
         'FA': 4,
         'Cup': 3,
         'final': 13,
         'tkts': 4,
         'May': 16,
         'Text': 42,
         'receive': 34,
         'txt': 75,
         'apply': 15,
         'U': 299,
         'dun': 46,
         'say': 72,
         'so': 294,
         'early': 24,
         'c': 44,
         'already': 45,
         'then': 146,
         'Nah': 7,
         'I': 1466,
         'think': 113,
      

In [26]:
del word_dict[""]

Now taking the words which occur very rarely may increase the amount of noise in the data.So we take onlywe the top 3000

In [27]:
word_dict=word_dict.most_common(3000)

Now lets form the matrix where we have all the individual words as columns and the message index as rows and the values are filled by the frequency of each word corresponding to the row

In [28]:
features=[]
labels=[]

for i in range(len(data['Label'])):

    blob=data['Message'][i].split(" ")
    data1=[]
    for j in word_dict:
        data1.append(blob.count(j[0]))
    features.append(data1)
    
    
   
    
    

We now convert features into array

In [29]:
features=np.array(features)

In [30]:
features.shape

(5572, 3000)

Now lets import our output variable

In [31]:
labels=data.iloc[:,0]

As the training models work much better on numeric data we convert labels in numeric data.We change Spam to 1 and Ham to 0.

In [32]:
for i in range(len(labels)):
    if labels[i]=='ham':
        labels[i]=0
    else:
        labels[i]=1

In [33]:
labels.shape

(5572,)

In [34]:
labels=labels.values
labels=labels.astype(int)

Now we have our required output

In [35]:
labels

array([0, 0, 1, ..., 0, 0, 0])

Now we perform train test split on our input and output

In [36]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(features,labels,test_size=0.2)

In [37]:
xtrain.shape

(4457, 3000)

Now lets check it using logistic regression if it works best in these situations

In [45]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
pred = model.predict(xtest)

In [47]:
accuracy_score(ytest,pred)

0.9659192825112107