In [178]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
import pandas as pd



from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , classification_report  ,confusion_matrix

In [150]:
stopwords  = list(STOP_WORDS)
punct = string.punctuation


In [151]:
nlp = spacy.load('en')

In [152]:
df = pd.read_csv('spam.csv')

In [153]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [154]:
df.shape

(5572, 2)

In [155]:
#check dataset

df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [156]:
spam = df[df['Category']=='spam']

In [157]:
spam.shape

(747, 2)

In [158]:
ham = df[df['Category']=='ham'].iloc[:spam.shape[0]]

In [159]:
ham.shape

(747, 2)

In [160]:
ham

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
...,...,...
883,ham,I love to give massages. I use lots of baby oi...
884,ham,Dude we should go sup again
885,ham,Yoyyooo u know how to change permissions for a...
886,ham,Gibbs unsold.mike hussey


In [161]:
#now create new dataframe


df2 = spam.append(ham)

In [162]:
df2

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
883,ham,I love to give massages. I use lots of baby oi...
884,ham,Dude we should go sup again
885,ham,Yoyyooo u know how to change permissions for a...
886,ham,Gibbs unsold.mike hussey


## Now train test split the data

In [163]:
X = df['Message']
y = df['Category']

In [164]:
X_train  , X_test , y_train , y_test = train_test_split(X , y  , test_size=0.3 , random_state=42 , shuffle=True)

In [165]:
X_train.head() , X_train.shape

(708     Quite late lar... Ard 12 anyway i wun b drivin...
 4338                        on a Tuesday night r u 4 real
 5029    Go chase after her and run her over while she'...
 4921     G says you never answer your texts, confirm/deny
 2592         Still work going on:)it is very small house.
 Name: Message, dtype: object, (3900,))

In [166]:
X_test.head() , X_test.shape

(3245    Squeeeeeze!! This is christmas hug.. If u lik ...
 944     And also I've sorta blown him off a couple tim...
 1044    Mmm thats better now i got a roast down me! i...
 2484        Mm have some kanji dont eat anything heavy ok
 812     So there's a ring that comes with the guys cos...
 Name: Message, dtype: object, (1672,))

# Now change the data in to tokens

In [171]:
def change_into_tokens(text):
    doc = nlp(text)
    tokens = [] #to add the tokens
    cleaned_text =[] #add cleaned_text after adding tokens
    
    for token in doc:
        if token.lemma_ != '-PRON-': #the lemma is pronoun will be PRON , so to remove it
            temp = token.lemma_.lower().strip() #strip() removes whitespaces
        else:
            temp = token.lower_ #.lower_ converts the pronoun 'I' -> 'i'
        tokens.append(temp) #append the resutlt to tokens
        
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_text.append(token)
            
    return cleaned_text

In [172]:
#check our function

change_into_tokens('Hey how are you boy?')

['hey', 'boy']

In [173]:
tfidf = TfidfVectorizer(tokenizer=change_into_tokens)

In [174]:
clf = LogisticRegression()

In [175]:
#now create a pipeline

clf = Pipeline([('tfidf' , tfidf) , ('clf' , clf)])

In [176]:
clf.fit(X_train  ,y_train)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...',
                                 tokenizer=<function change_into_tokens at 0x000001FA627CB168>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_

In [177]:
#predict the model

y_pred = clf.predict(X_test)

In [179]:
#test accuracy

print(accuracy_score(y_pred , y_test))

0.9659090909090909


In [180]:
#check precison and recall

print(classification_report(y_pred , y_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1499
        spam       0.76      0.98      0.86       173

    accuracy                           0.97      1672
   macro avg       0.88      0.97      0.92      1672
weighted avg       0.97      0.97      0.97      1672



In [181]:
#now compute confusion matrix

confusion_matrix(y_pred , y_test)

array([[1445,   54],
       [   3,  170]], dtype=int64)

- 3 wrongly classified as true whereas they are false , 54 wrong classified as false, whereas they are true
- tn ,fn
- fp , tp