In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('mail_data.csv')

In [6]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
df.shape

(5572, 2)

# label encoding

In [8]:
df.loc[df['Category'] == 'spam','Category',] = 0
df.loc[df['Category'] == 'ham','Category',] = 1

In [9]:
df

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


# Splitting the dataset

In [27]:
x = df['Message']
y = df['Category']

In [28]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [29]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [30]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [31]:
print(x.shape)
print(x_train.shape)
print(x_test.shape)

(5572,)
(3900,)
(1672,)


# Feature Extraction

**What is a TF-IDF vectorizer?**
- TF-IDF will transform the text into meaningful representation of integers or numbers which is used to fit machine learning algorithm for predictions. TF-IDF Vectorizer is a measure of originality of a word by comparing the number of times a word appears in document with the number of documents the word appears in

**What is the difference between TF-IDF vectorizer and TF-IDF transformer?**
- Tfidftransformer and Tfidfvectorizer aim to do the same thing, which is to convert a collection of raw documents to a matrix of TF-IDF features. The only difference is that with Tfidftransformer, you will systematically compute the word counts, generate idf values and then compute a tfidf score or set of scores.

In [38]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english')

In [39]:
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [40]:
# convert Y_train and Y_test values as integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [41]:
print(x_train)

708     Quite late lar... Ard 12 anyway i wun b drivin...
4338                        on a Tuesday night r u 4 real
5029    Go chase after her and run her over while she'...
4921     G says you never answer your texts, confirm/deny
2592         Still work going on:)it is very small house.
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: Message, Length: 3900, dtype: object


In [42]:
print(x_train_features)

  (0, 2221)	0.4595576461719437
  (0, 6907)	0.42057032296489166
  (0, 260)	0.3977642008852706
  (0, 943)	0.35285068394560465
  (0, 3629)	0.33392519134134463
  (0, 3634)	0.31674844532481095
  (0, 5025)	0.3425956765507876
  (1, 5087)	0.564752315585174
  (1, 4329)	0.4556948018194525
  (1, 6399)	0.6880385669683886
  (2, 5909)	0.4598907748288949
  (2, 1882)	0.5577039025555312
  (2, 5320)	0.407962240134966
  (2, 1557)	0.5577039025555312
  (3, 2032)	0.5517950786382567
  (3, 1762)	0.4550182724182232
  (3, 6157)	0.39464721758363247
  (3, 889)	0.4179976347325351
  (3, 5392)	0.3975073759914889
  (4, 3168)	0.517613431846439
  (4, 5675)	0.6128580677760653
  (4, 2864)	0.40046755880208973
  (4, 6864)	0.44283976592107693
  (5, 6369)	0.3834923635990344
  (5, 5884)	0.4325022650654002
  :	:
  (3895, 6835)	0.4724310616139888
  (3895, 6851)	0.3960620484148489
  (3895, 2277)	0.4500945571447008
  (3895, 2080)	0.26990881063254835
  (3895, 2891)	0.2286724629283105
  (3895, 3085)	0.26272736081331977
  (3896, 575

# Logistic Regression

In [43]:
model = LogisticRegression()

In [44]:
model.fit(x_train_features, y_train)

In [45]:
y_pred = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, y_pred)

In [46]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9659090909090909


In [None]:
# 