# Building a Spam / Ham Classifier

### Loading the CSV file

In [2]:
import numpy as np
import pandas as pd

# Read the labelled messages into a DataFrame. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 -- TODO confirm against the data).
df = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Lowercase "text" column

In [3]:
def lowercase_msg(msg):
    """Return ``msg`` converted to lowercase.

    Args:
        msg: The message text; must provide a ``.lower()`` method (e.g. ``str``).

    Returns:
        The result of ``msg.lower()``.
    """
    return msg.lower()

# Normalise every message to lowercase, element by element, and store the
# result back in the same column.
lowered_text = df['text'].map(lowercase_msg)
df['text'] = lowered_text

print(df)

     label                                               text
0      ham  go until jurong point, crazy.. available only ...
1      ham                      ok lar... joking wif u oni...
2     spam  free entry in 2 a wkly comp to win fa cup fina...
3      ham  u dun say so early hor... u c already then say...
4      ham  nah i don't think he goes to usf, he lives aro...
...    ...                                                ...
5164  spam  this is the 2nd time we have tried 2 contact u...
5165   ham             will ã_ b going to esplanade fr home?
5166   ham  pity, * was in mood for that. so...any other s...
5167   ham  the guy did some bitching but i acted like i'd...
5168   ham                         rofl. its true to its name

[5169 rows x 2 columns]


### Importing libraries

In [4]:
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

### Split into train and test datasets

In [5]:
# Hold out 10% of the messages for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.1,
    random_state=1,
)

### Training the model

In [6]:
# Fit the TF-IDF vectorizer on the training messages only, and encode those
# same messages as the training feature matrix in one step.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)

In [7]:
# Build the classifier from the directly imported `SVC` class rather than via
# `svm.SVC`: this assignment rebinds the name `svm` from the sklearn module to
# the estimator, so the old `svm.SVC(...)` form raised AttributeError whenever
# the cell was run a second time. The variable keeps the name `svm` because
# the prediction cell further down calls `svm.predict(...)`.
svm = SVC(C=1000)
svm.fit(X_train_vec, y_train)

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Confusion Matrix

In [8]:
# Encode the held-out messages with the vectorizer fitted on the training set
# (transform only -- no refitting), then predict their labels.
# The encoded matrix gets its own name instead of overwriting `X_test`, so the
# raw text survives and this cell can be re-run safely; it also mirrors the
# `X_train_vec` naming used for the training features.
X_test_vec = vectorizer.transform(X_test)
y_pred = svm.predict(X_test_vec)

In [9]:
# Per scikit-learn's documented convention, rows are the true labels and
# columns the predicted labels, with labels in sorted order (ham, then spam).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[462,   0],
       [  8,  47]])

### Accuracy Score

In [10]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: " + str(accuracy))

Accuracy Score: 0.9845261121856866
