#### Data source: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The fallback
# keeps the notebook runnable on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



# TF - IDF

- TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
- IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

Consider a document containing 100 words in which the word *cat* appears 3 times. The term frequency (tf) for *cat* is then 3 / 100 = 0.03. Now assume we have 10 million documents and the word *cat* appears in one thousand of them. Using a base-10 logarithm to keep the numbers round, the inverse document frequency (idf) is log10(10,000,000 / 1,000) = 4, so the tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12. (With the natural logarithm used in the IDF definition above, the idf would be ln(10,000) ≈ 9.21 and the weight ≈ 0.28.)

In [3]:
# Vectorizer that builds features from unigrams, bigrams and trigrams.
tf_idf = TfidfVectorizer(ngram_range=(1, 3))

In [4]:
# Learn the vocabulary and IDF weights from three toy sentences and return
# their TF-IDF document-term matrix.
x_traincv = tf_idf.fit_transform(
    [
        "Hi How are you How are you doing",
        "Hi what's up",
        "Wow that's awesome",
    ]
)

In [5]:
# Show the TF-IDF matrix as a dense array: one row per toy sentence, one
# column per n-gram feature.
x_traincv.toarray()

array([[0.35040083, 0.35040083, 0.17520041, 0.17520041, 0.        ,
        0.17520041, 0.13324443, 0.17520041, 0.17520041, 0.        ,
        0.        , 0.35040083, 0.35040083, 0.35040083, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.35040083, 0.17520041, 0.17520041,
        0.17520041],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.32200242, 0.        , 0.        , 0.42339448,
        0.42339448, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.42339448, 0.42339448, 0.42339448, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.40824829,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.40824829,
        0.40824829, 0.        , 0.        , 0.        , 0.40824829,
      

In [7]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method so
# the cell still runs on older installations.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()

# Display plain Python strings in a list, matching what the old API returned.
[str(name) for name in feature_names]

['are',
 'are you',
 'are you doing',
 'are you how',
 'awesome',
 'doing',
 'hi',
 'hi how',
 'hi how are',
 'hi what',
 'hi what up',
 'how',
 'how are',
 'how are you',
 'that',
 'that awesome',
 'up',
 'what',
 'what up',
 'wow',
 'wow that',
 'wow that awesome',
 'you',
 'you doing',
 'you how',
 'you how are']

# Naive Bayes using TF-IDF

In [6]:
# Load the tab-separated SMS file. It has no header row, so the two column
# names are supplied explicitly.
df = pd.read_csv(
    'smsspam',
    sep='\t',
    header=None,
    names=['Status', 'Message'],
)

In [7]:
# Peek at the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Total number of messages (rows).
df.shape[0]

5572

In [9]:
# Number of messages labelled 'spam'.
int((df["Status"] == 'spam').sum())

747

In [10]:
# Number of messages labelled 'ham'.
int((df["Status"] == 'ham').sum())

4825

In [11]:
# Encode the 'ham' label as 1 (modifies `df` in place).
df.loc[df["Status"].eq('ham'), "Status"] = 1

In [12]:
# Encode the 'spam' label as 0 (modifies `df` in place).
df.loc[df["Status"].eq('spam'), "Status"] = 0

In [13]:
# Confirm the labels are now numeric.
df.head(n=5)

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Separate the message text (features) from the encoded label (target).
df_x, df_y = df["Message"], df["Status"]

In [15]:
# Hold out 20% of the messages for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,
)

In [16]:
# First five training messages (the index shows their original row numbers).
x_train.head(n=5)

1457    U sleeping now.. Or you going to take? Haha.. ...
472     How long has it been since you screamed, princ...
2481    Urgent! call 09066612661 from landline. Your c...
243     Okay. No no, just shining on. That was meant t...
1413    Wen ur lovable bcums angry wid u, dnt take it ...
Name: Message, dtype: object

In [17]:
# Vectorizer for the SMS corpus: English stop words are dropped and any term
# found in at least one document is kept (min_df=1).
cv1 = TfidfVectorizer(stop_words='english', min_df=1)

In [18]:
# Fit the vocabulary and IDF weights on the training messages only, and
# transform them in the same step. Note this rebinds `x_traincv`.
x_traincv = cv1.fit_transform(raw_documents=x_train)

In [19]:
# Dense copy of the training matrix, used below to inspect individual rows.
a = x_traincv.toarray()

In [20]:
# TF-IDF weights of the first training message (one entry per vocabulary term).
a[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [21]:
# `inverse_transform` expects a 2-D (n_samples, n_features) input. `a[0]` is a
# 1-D row, which input validation in recent scikit-learn releases rejects.
# `a[:1]` passes the same first document as a single-row 2-D array, so the
# result is still a one-element list holding that document's terms.
cv1.inverse_transform(a[:1])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'online',
        'replying', 'sleeping', 'spys', 'wat'],
       dtype='<U27')]

In [22]:
# Original text of the first training message, for comparison with the terms
# recovered above.
x_train.iat[0]

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

In [23]:
# Transform the test messages with the vocabulary and IDF weights already
# fitted on the training set (no re-fitting).
x_testcv = cv1.transform(raw_documents=x_test)

In [24]:
# Dense view of the transformed test set, for inspection only; the result is
# not stored.
x_testcv.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [25]:
# Multinomial Naive Bayes with its default settings spelled out.
classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [26]:
# Cast the training labels to an integer dtype before fitting.
y_train = y_train.astype(int)

In [27]:
# First five training labels after the cast.
y_train.head(5)

1457    1
472     1
2481    0
243     1
1413    1
Name: Status, dtype: int64

In [28]:
# Train the classifier on the TF-IDF features of the training messages.
classifier.fit(X=x_traincv, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
# Single-element list holding the first held-out message.
testmessage = x_test.iloc[[0]].tolist()

In [30]:
# Show the sample test message.
testmessage

['somewhere out there beneath the pale moon light someone think in of u some where out there where dreams come true... goodnite &amp; sweet dreams']

In [31]:
# Predict the label of the first test message from its TF-IDF row.
predictions = classifier.predict(X=x_testcv[0])

In [32]:
# Predicted label for the sample message: 1 is the code assigned to 'ham'
# earlier in the notebook, 0 the code assigned to 'spam'.
predictions

array([1])

In [33]:
# Cast the test labels to an integer dtype so they match the training labels.
y_test = y_test.astype(int)

In [34]:
# Report mean accuracy on the training split and on the held-out split,
# one per line.
print(
    "Training accuracy {}".format(classifier.score(x_traincv, y_train)),
    "Testing accuracy {}".format(classifier.score(x_testcv, y_test)),
    sep="\n",
)

Training accuracy 0.982499439084586
Testing accuracy 0.957847533632287
