#### SMS spam detector!

In [123]:
import pickle
import re
import string
import warnings

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [None]:
# Read the SMS dataset from disk and preview the first five rows
path = 'spam.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [125]:
# Count how many messages carry each value of the Label column
df['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [126]:
# Encode the string labels in the Label column as integers, then drop rows
# that are exact duplicates of an earlier row (the first occurrence is kept).
# LabelEncoder is imported in the notebook's top import cell.
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

df = df.drop_duplicates(keep='first')

In [127]:
# Re-check the label counts after encoding and duplicate removal
df['Label'].value_counts()

Label
0    4516
1     653
Name: count, dtype: int64

##### Cleaning the EmailText column

In [128]:
# Preview the raw message text before any cleaning is applied
df['EmailText']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568               Will Ã_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: EmailText, Length: 5169, dtype: object

In [129]:
# Function to extract important features
def get_importnat_feature(sent):
    """Lower-case a message and split it into word tokens.

    Args:
        sent: Raw message text (a string).

    Returns:
        list[str]: Every run of word characters (letters, digits,
        underscore) found in the lower-cased text, in order of appearance.
        Punctuation and whitespace act as separators and are discarded.
    """
    return re.findall(r'\b\w+\b', sent.lower())

In [130]:
# Tokenize every message: each row of imp_features becomes a list of
# lower-cased word tokens produced by get_importnat_feature
df['imp_features'] = df['EmailText'].apply(get_importnat_feature)

In [131]:
# Inspect the per-message token lists
df['imp_features']

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, don, t, think, he, goes, to, usf, he,...
                              ...                        
5567    [this, is, the, 2nd, time, we, have, tried, 2,...
5568      [will, ã, _, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, so, any, othe...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                     [rofl, its, true, to, its, name]
Name: imp_features, Length: 5169, dtype: object

In [132]:
# Function to remove stopwords and punctuation
# Stop words to discard. Built once when the cell runs instead of on every
# call; all entries are lower-case, so matching is case-sensitive.
_STOPWORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
])

# Individual punctuation characters. `word in string.punctuation` would be a
# substring test on a str, which misses tokens such as '__'.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def removing_stopWords(sent):
    """Drop stop words and punctuation-only tokens from a token list.

    Args:
        sent: Iterable of string tokens (for example the output of
            get_importnat_feature).

    Returns:
        list[str]: The tokens of `sent` in their original order, without
        stop words and without tokens made up solely of punctuation
        characters. The empty string counts as punctuation-only and is
        removed as well.
    """
    return [
        word for word in sent
        if word not in _STOPWORDS
        # all() is True for '' too, so empty tokens are filtered out.
        and not all(ch in _PUNCTUATION_CHARS for ch in word)
    ]

In [133]:
# Filter stop words out of each token list. removing_stopWords iterates over
# its argument, so this must run while the column still holds lists, i.e.
# before the stemming cell joins the tokens into a single string.
df['imp_features'] = df['imp_features'].apply(removing_stopWords)

In [134]:
# Function to perform stemming
def potter_stem(sent):
    """Strip one trailing 'ing', 'ed' or 's' from each token and join them.

    This is a single regex suffix strip, not the full Porter stemming
    algorithm: at most one suffix is removed per token and no minimum stem
    length is enforced (for example 'goes' becomes 'goe').

    Args:
        sent: Iterable of string tokens.

    Returns:
        str: The stemmed tokens separated by single spaces. Tokens that
        reduce to the empty string (a token that is exactly 'ing', 'ed' or
        's') are dropped so the result never has leading, trailing or
        doubled spaces.
    """
    stems = (re.sub(r'(ing|ed|s)$', '', word) for word in sent)
    # Skip empty stems; joining them would leave stray spaces in the text.
    return " ".join(stem for stem in stems if stem)

In [135]:
# Stem each token list and join it into one space-separated string. From
# here on the column holds strings rather than lists, so this cell (and the
# stop-word cell before it) should not be re-run without re-tokenizing.
df['imp_features'] = df['imp_features'].apply(potter_stem)

In [136]:
# Inspect the cleaned, stemmed text that will be vectorized
df['imp_features']

0       go jurong point crazy available bugi n great w...
1                                    ok lar jok wif u oni
2       free entry 2 wkly comp win fa cup final tkt 21...
3                     u dun say early hor u c already say
4                    nah think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u u won ã â 750 pound p...
5568                             ã b go esplanade fr home
5569                                 pity mood suggestion
5570    guy bitch act like d interest buy someth else ...
5571                                       rofl true name
Name: imp_features, Length: 5169, dtype: object

In [137]:
# Show the frame with the derived imp_features column next to the originals
df

Unnamed: 0,Label,EmailText,imp_features
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugi n great w...
1,0,Ok lar... Joking wif u oni...,ok lar jok wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u u won ã â 750 pound p...
5568,0,Will Ã_ b going to esplanade fr home?,ã b go esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood suggestion
5570,0,The guy did some bitching but I acted like i'd...,guy bitch act like d interest buy someth else ...


In [138]:
# Features are the cleaned message text; the target is the encoded label
X, y = df['imp_features'], df['Label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [139]:
# Fit the TF-IDF vectorizer on the training text only, then apply the
# already-fitted vectorizer to the test text so that nothing is learned
# from the held-out rows
tfidf = TfidfVectorizer()
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

In [140]:
# Train a support vector classifier, constructed with no arguments (library
# defaults), on the TF-IDF training matrix
model = svm.SVC()
model.fit(X_train_features, y_train)

In [142]:
# Save the model
# The fitted classifier and the fitted TF-IDF vectorizer are written to
# separate pickle files. Context managers make sure each file is flushed and
# closed, instead of leaving the handle open until garbage collection.
with open('finalized_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [143]:
# Predict labels for the held-out messages and report the classifier's
# score on the same split
y_pred = model.predict(X_test_features)
test_accuracy = model.score(X_test_features, y_test)
print("Accuracy:", test_accuracy)

Accuracy: 0.9816247582205029
