In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in file_names):
        print(file_path)

# Any results you write to the current directory are saved as output.

**Importing the data**

In [2]:
# Load the SMS spam CSV into a DataFrame, decoding the file as ISO-8859-1
# instead of the pandas default encoding.
data = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    encoding='ISO-8859-1',
)

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [4]:
# List the column names as read from the CSV.
data.columns

We can see that there are three unnecessary columns in the data set. We can drop the unwanted columns.

In [5]:
# Remove the three 'Unnamed' columns.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
data = data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

In [7]:
# Confirm the unnamed columns are gone by previewing the first five rows.
data.head(n=5)

Let's rename the columns for convenience.

In [8]:
# Give the two remaining columns descriptive names: 'v1' -> 'labels', 'v2' -> 'messages'.
data = data.rename(columns={'v1': 'labels', 'v2': 'messages'})

In [9]:
# Preview the first five rows with the new column names.
data.head(n=5)

In [10]:
# Summary statistics for the columns currently in the frame.
data.describe()

In [11]:
# Print the column dtypes, non-null counts and memory usage.
data.info()

In [12]:
# Summary statistics computed separately for each label, transposed so that
# every label becomes its own column.
(
    data
    .groupby('labels')
    .describe()
    .transpose()
)

In [13]:
# Count the missing values in each column.
data.isna().sum()

In [14]:
# Total number of rows (messages) in the frame.
data.shape[0]

In [15]:
# Add a 'length' column holding the number of characters in each message.
# The vectorized .str.len() accessor replaces apply(len): it avoids a Python-level
# call per row and propagates a missing message as NaN instead of raising TypeError.
data['length'] = data['messages'].str.len()

In [16]:
# Preview the first five rows, now including the 'length' column.
data.head(n=5)

In [17]:
# Distinct values present in the label column, in order of first appearance.
pd.unique(data['labels'])

In [18]:
# Class balance: number of messages carrying each label.
data['labels'].value_counts()

**Plotting the histogram of message lengths**

In [19]:
import matplotlib.pyplot as plt 
import seaborn as sns
#plt.style.use('fivethirtyeight')

In [20]:
# Histogram of message length (in characters) over the whole data set.
length_ax = data['length'].plot(kind='hist', bins=50)
length_ax.set(
    title='Message length distribution',
    xlabel='Message length (characters)',
    ylabel='Number of messages',
)
# plt.show() renders the figure and ends the cell without a stray repr.
# The original plt.ioff() changes global interactive mode instead, and on
# matplotlib versions where it returns a context-manager object that object's
# repr is displayed as the cell output.
plt.show()

In [21]:
# Overlay the ham and spam message-length histograms on a log-scaled x-axis.
# Bin edges grow geometrically (ratio 1.15) so the bars have a similar visual
# width once the axis is logarithmic.
bins = 1.15 ** np.arange(0, 50)

fig, ax = plt.subplots()
for label_name in ('ham', 'spam'):
    ax.hist(
        data.loc[data['labels'] == label_name, 'length'],
        bins=bins,
        alpha=0.8,
        label=label_name,
    )
ax.set_xscale('log')
ax.set(
    title='Message length by label',
    xlabel='Message length (characters, log scale)',
    ylabel='Number of messages',
)
# Each histogram carries its own label, so legend() needs no arguments.
# The original plt.legend('ham', 'spam') passed the two strings as
# (handles, labels) rather than as a list of labels, producing a wrong legend.
ax.legend()
plt.show()

Spam text messages tend to be longer than ham text messages.

In [22]:
# One message-length histogram panel per label.
panels = data.hist(column='length', by='labels', bins=50, figsize=(10, 4))
for panel in np.ravel(panels):
    panel.set_xlabel('Message length (characters)')
    panel.set_ylabel('Number of messages')
# plt.show() renders the figure and ends the cell cleanly; the original
# plt.ioff() toggles global interactive mode as a side effect instead.
plt.show()

**Let's print out the longest message**

In [23]:
# Summary statistics of message length; the 'max' row is the length of the longest message.
data['length'].describe()

In [24]:
# Show the full text of the longest message.
# idxmax() finds the row with the maximum length directly, so the cell no longer
# depends on the hard-coded value 910 (which raises IndexError as soon as the
# data changes). Like the original .iloc[0], it picks the first such row on ties.
data.loc[data['length'].idxmax(), 'messages']

**SMS classification based on message length**

In [25]:
from sklearn.model_selection import train_test_split

**Creating the feature matrix and target**

In [26]:
# Feature matrix: message length reshaped into a single column, i.e. shape
# (n_samples, 1), which is the 2-D layout scikit-learn estimators expect.
X = data['length'].to_numpy().reshape(-1, 1)

# Target vector: the label of each message.
y = data['labels']

**Splitting the data**

In [27]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [28]:
# Shape of the training feature matrix: (training rows, 1 feature column).
X_train.shape

In [29]:
#y_test

**Using Logistic Regression**

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
# Logistic regression classifier with the solver set explicitly to 'lbfgs'.
lr_model=LogisticRegression(solver='lbfgs')

In [32]:
# Fit the logistic regression on the message-length training split.
lr_model.fit(X_train,y_train)

In [33]:
from sklearn import metrics

In [34]:
# Predict a label for every row of the held-out test split.
predictions=lr_model.predict(X_test)

In [35]:
# Display the array of predicted labels.
predictions

In [36]:
#y_test

In [37]:
# Raw confusion matrix: rows are the true labels, columns the predicted labels.
print(metrics.confusion_matrix(y_test,predictions))

In [38]:
# Confusion matrix as a labelled DataFrame (rows: true label, columns: predicted label).
# The same class list is passed to confusion_matrix(labels=...) and used for the
# row/column names, so the names are guaranteed to match the matrix ordering
# instead of relying on the estimator's implicit sorted-label order.
class_names = ['ham', 'spam']
df = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=class_names),
    index=class_names,
    columns=class_names,
)
df

In [39]:
# Per-class precision, recall and F1 for the logistic regression predictions.
print(metrics.classification_report(y_test,predictions))

In [40]:
# Fraction of test rows whose predicted label equals the true label.
print(metrics.accuracy_score(y_test,predictions))

**Using Naive Bayes**

In [41]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model on the message-length feature and
# evaluate it on the held-out split (this overwrites `predictions`).
# NOTE(review): X has a single feature column here, so the smoothed per-class
# feature probability is 1 for every class and the prediction is driven by the
# class priors alone; confirm this baseline is intended.
nb_model = MultinomialNB().fit(X_train, y_train)
predictions = nb_model.predict(X_test)
print(metrics.confusion_matrix(y_test, predictions))

In [42]:
# Per-class precision, recall and F1 for the Naive Bayes predictions.
print(metrics.classification_report(y_test,predictions))

**Let's try a Support Vector Machine**

In [43]:
from sklearn.svm import SVC

# Fit a support vector classifier (gamma='auto') on the message-length feature
# and evaluate it on the held-out split (this overwrites `predictions`).
svc_model = SVC(gamma='auto').fit(X_train, y_train)
predictions = svc_model.predict(X_test)
print(metrics.confusion_matrix(y_test, predictions))

In [44]:
# Per-class precision, recall and F1 for the SVC predictions.
print(metrics.classification_report(y_test,predictions))

**Extracting the features from text**

In [45]:
# Preview the first five rows again before building text features.
data.head(n=5)

#### Check for missing values 

In [46]:
# Count the missing values in each column before vectorizing the text.
data.isna().sum()

In [47]:
# Class balance: number of messages carrying each label.
data['labels'].value_counts()

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
# Features are now the raw message text (this replaces the length-based X).
X=data['messages']

In [50]:
# Target: the label of each message.
y=data['labels']

In [51]:
# Hold out 33% of the messages for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [52]:
from sklearn.feature_extraction.text import CountVectorizer 

In [53]:
# Bag-of-words vectorizer with default settings.
count_vect=CountVectorizer()

In [54]:
# Two-step equivalent, kept for reference:
#   count_vect.fit(X_train)                         # build the vocabulary from the training messages
#   X_train_counts=count_vect.transform(X_train)    # turn each message into a vector of token counts

X_train_counts=count_vect.fit_transform(X_train)  # fit and transform in a single step

In [55]:
# Display the repr of the sparse document-term count matrix.
X_train_counts

In [56]:
# Number of training messages (X_train is a 1-D Series of text here).
X_train.shape

In [57]:
# (number of training messages, vocabulary size learned by the vectorizer).
X_train_counts.shape

In [58]:
from sklearn.feature_extraction.text import TfidfTransformer 

In [59]:
# Transformer that re-weights a count matrix into TF-IDF scores (default settings).
tfidf_transformer=TfidfTransformer()

In [60]:
# Learn IDF weights from the training counts and convert them to TF-IDF features.
X_train_tfidf=tfidf_transformer.fit_transform(X_train_counts)

In [61]:
# Shape of the TF-IDF matrix computed from X_train_counts.
X_train_tfidf.shape

**Combining the Count Vectorization and Tfidf Transformation**

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [65]:
# TfidfVectorizer performs token counting and TF-IDF weighting in a single estimator.
vectorizer=TfidfVectorizer()

In [66]:
# Build TF-IDF features straight from the raw training text.
# This overwrites the X_train_tfidf produced earlier via CountVectorizer + TfidfTransformer.
X_train_tfidf=vectorizer.fit_transform(X_train)

**Training a classifier**

In [67]:
from sklearn.svm import LinearSVC

In [68]:
# Linear support vector classifier with default settings.
clf=LinearSVC()

In [69]:
# Train the linear SVM on the TF-IDF features of the training messages.
clf.fit(X_train_tfidf,y_train)

**Creating a single pipeline: TF-IDF vectorizer and classifier**

In [70]:
from sklearn.pipeline import Pipeline

In [71]:
# Chain vectorization and classification so that raw text can be passed
# directly to fit() and predict().
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [72]:
# Fit the whole pipeline (vectorizer, then classifier) on the raw training messages.
text_clf.fit(X_train,y_train)

In [73]:
# Predict labels for the held-out raw messages (this overwrites `predictions`).
predictions=text_clf.predict(X_test)

**Confusion Matrix & Classification Report**

In [74]:
from sklearn.metrics import confusion_matrix,classification_report

In [75]:
# Confusion matrix for the text pipeline: rows are true labels, columns predicted labels.
print(confusion_matrix(y_test,predictions))

In [76]:
# Per-class precision, recall and F1 for the text pipeline predictions.
print(classification_report(y_test,predictions))

**Accuracy**

In [77]:
from sklearn import metrics 

In [78]:
# Fraction of held-out messages the text pipeline labelled correctly.
metrics.accuracy_score(y_test,predictions)

**Predicting on new messages**

In [81]:
# Classify one unseen message; the pipeline vectorizes raw text itself, so a list of strings is passed.
text_clf.predict(["Hello, nice to meet you"])

In [82]:
# Classify a second unseen message with the fitted pipeline.
text_clf.predict(["It is better to love wisely"])