# Importing libraries

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Reading Data

### Converting txt data i.e. Tabs seperated into Data Frame

In [61]:
# Opening Data File and saving entries by spliting them wheneve a tab is found 
with open('train.txt','r') as f:
    txt=[x.strip().split('\t') for x in f]

In [62]:
# Converting into  Pandas DataFrame
train = pd.DataFrame(txt, columns = ['label', 'text'])

In [63]:
train.tail()

Unnamed: 0,label,text
4996,ham,"Just looked it up and addie goes back Monday, ..."
4997,ham,Happy new year. Hope you are having a good sem...
4998,ham,Esplanade lor. Where else...
4999,ham,Can you talk with me..
5000,,


In [64]:
# Dropping the last Row as No value is present
train.drop(5000,0,inplace = True)

In [65]:
train.label = train.label.astype('category').cat.codes

In [66]:
train.label.value_counts()
# 0 represents ham and 1 represent spam 

ham     4327
spam     673
Name: label, dtype: int64

In [67]:
# Doing the same with Test File converiting it into Pandas Data Frame
with open('test.txt','r') as f:
    txt=[x.strip().split('\t') for x in f]

In [68]:
test = pd.DataFrame(txt, columns = ['label', 'text'])

In [71]:
test.head()

Unnamed: 0,label,text,label_cod
0,ham,"Hmph. Go head, big baller.",0
1,ham,Well its not like you actually called someone ...,0
2,ham,"Nope. Since ayo travelled, he has forgotten hi...",0
3,ham,You still around? Looking to pick up later,0
4,spam,CDs 4u: Congratulations ur awarded £500 of CD ...,1


In [72]:
test.label = test.label.astype('category').cat.codes

### Transforming data in form of x and y

In [73]:
x_train = train['text']
y_train = train['label']

x_test = test['text']
y_test = test['label']

# Text transformation

In [51]:
# Transforming Text Using Count Vecotorizer that transform the text data into the count of a word occuring
# as computers are good with numbers
from sklearn.feature_extraction.text import CountVectorizer

In [52]:
model = CountVectorizer()

In [74]:
# Fitting the data into the model
model.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [75]:
x_train_mod = model.transform(x_train)
x_test_mod = model.transform(x_test)

In [83]:
print(model.get_feature_names()[0:20])

['00', '000', '000pes', '008704050406', '0089', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400', '02073162414', '02085076972', '021', '03', '04', '0430', '05', '050703', '0578']


### Machine Learning Model

#### Using Logistic Regression Model

In [77]:
# as this is a classification problem theat could be done using Logistic Regression
from sklearn.linear_model import LogisticRegression

In [78]:
ml_model = LogisticRegression()

In [84]:
ml_model.fit(x_train_mod,train['label_cod'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
# importing Package to measure the accuracy
from sklearn.metrics import accuracy_score

In [87]:
predictions = ml_model.predict(x_test_mod)

In [88]:
accuracy_score(test['label_cod'], predictions)

0.98666666666666669

#### Using Multinomial Naive Bayes

In [89]:
# the best technique for solving text data and classification we can use Multinomial Naive bayes
# as it is good for this purpose
from sklearn.naive_bayes import MultinomialNB

In [90]:
ml_model_1 = MultinomialNB()

In [91]:
ml_model_1.fit(x_train_mod, train['label_cod'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [92]:
predictions_1 = ml_model.predict(x_test_mod)

In [93]:
accuracy_score(test['label_cod'], predictions)

0.98666666666666669