#### Data source: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback is
# kept only so the notebook still runs on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer



# TF - IDF

- TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
- IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

Example: consider a document containing 100 words in which the word cat appears 3 times. The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we have 10 million documents and the word cat appears in one thousand of these. Using a base-10 logarithm for readability, the inverse document frequency (i.e., idf) is log10(10,000,000 / 1,000) = 4. Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12. (With the natural logarithm from the formula above, idf = ln(10,000) ≈ 9.21 and the weight is ≈ 0.28; the choice of base only rescales the weights.)

In [2]:
# Demo vectorizer: unigrams through trigrams, with English stop words removed.
tf_idf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)

In [3]:
# Fit the demo vectorizer on a three-document toy corpus and keep its TF-IDF matrix.
toy_corpus = [
    "Hi How are you How are you doing",
    "Hi what's up",
    "Wow that's awesome",
]
x_traincv = tf_idf.fit_transform(toy_corpus)

In [4]:
# Densify the toy TF-IDF matrix so the weights can be read directly
# (one row per document, one column per extracted term).
x_traincv.toarray()

array([[0.        , 0.62276601, 0.4736296 , 0.62276601, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.57735027]])

In [5]:
# List the vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it and
# convert to a plain list so the display matches the older API.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out().tolist()
else:
    feature_names = tf_idf.get_feature_names()
feature_names

['awesome', 'doing', 'hi', 'hi doing', 'wow', 'wow awesome']

## Data loading & Understanding

In [6]:
# Load the tab-separated SMS file. It has no header row, so column names are
# supplied here: the label first, then the message text.
# NOTE(review): with pandas' default quoting, messages containing unbalanced
# double quotes may be merged into a neighbouring row; verify the row count
# against the dataset's documentation (quoting=csv.QUOTE_NONE is the usual fix).
dataset = pd.read_csv(
    'data/smsspam',
    sep='\t',
    names=['Status', 'Message'],
)

In [7]:
# Preview the first rows to confirm the label and message columns parsed as expected.
dataset.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Total number of rows (messages) loaded.
dataset.shape[0]

5572

In [9]:
# Number of messages labelled 'spam'.
int((dataset["Status"] == 'spam').sum())

747

In [10]:
# Number of messages labelled 'ham'.
int((dataset["Status"] == 'ham').sum())

4825

In [11]:
# Encode the label in place: every 'ham' row gets the integer 1.
is_ham = dataset["Status"] == 'ham'
dataset.loc[is_ham, "Status"] = 1

In [12]:
# Encode the label in place: every 'spam' row gets the integer 0.
is_spam = dataset["Status"] == 'spam'
dataset.loc[is_spam, "Status"] = 0

In [13]:
# Re-check the first rows: Status should now hold 1 (ham) / 0 (spam) instead of strings.
dataset.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Pull the raw message text (features) and the encoded labels out as NumPy arrays.
X, y = dataset.Message.values, dataset.Status.values

In [15]:
# Show the first two raw messages to confirm X holds untruncated text.
X[:2]

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...'], dtype=object)

## Splitting dataset into training and testing

In [16]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [17]:
# Peek at the first five training messages (still raw text at this point).
X_train[:5]

array(['U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..',
       'How long has it been since you screamed, princess?',
       'Urgent! call 09066612661 from landline. Your complementary 4* Tenerife Holiday or £10,000 cash await collection SAE T&Cs PO Box 3 WA14 2PX 150ppm 18+ Sender: Hol Offer',
       'Okay. No no, just shining on. That was meant to be signing, but that sounds better.',
       'Wen ur lovable bcums angry wid u, dnt take it seriously.. Coz being angry is d most childish n true way of showing deep affection, care n luv!.. kettoda manda... Have nice day da.'],
      dtype=object)

## TF - IDF

In [18]:
# Fresh vectorizer for the real data (rebinds the name used in the toy example):
# unigrams through trigrams, English stop words removed.
tf_idf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)

In [19]:
# Learn the vocabulary and IDF weights from the training messages only, then vectorize them.
X_traincv = tf_idf.fit_transform(X_train)

In [20]:
# transform() only (no refit): test messages are projected onto the vocabulary
# learned from the training set, so no information from the test split leaks in.
X_testcv = tf_idf.transform(X_test)

In [21]:
# The labels were extracted from an object-dtype column; cast them to integers
# before handing them to the classifier.
y_train = y_train.astype(int)

In [22]:
# Same integer cast for the held-out labels so they compare cleanly with predictions.
y_test = y_test.astype(int)

# Naive Bayes

In [23]:
# Multinomial Naive Bayes classifier, constructed with scikit-learn's default hyperparameters.
naive_bayes = MultinomialNB()

In [24]:
# Train on the TF-IDF training matrix and the integer-encoded labels.
naive_bayes.fit(X_traincv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Predicted labels for the held-out messages. Not referenced again in the cells
# shown here; presumably kept for later inspection.
y_test_pred = naive_bayes.predict(X_testcv)

# Accuracy

In [26]:
# Training accuracy: mean accuracy of the fitted model on the data it was trained on.
naive_bayes.score(X_traincv, y_train)

0.9753197217859547

In [27]:
# Testing accuracy: the same mean-accuracy metric on the held-out split,
# i.e. messages the model did not see during fitting.
naive_bayes.score(X_testcv, y_test)

0.9408071748878923