In [54]:
import pandas as pd
import numpy as np
%config IPCompleter.greedy=True

### Requirement
We have dataset **spam.csv** which contains all email contents. This dataset is used to predict an email is ham or spam. Here is some steps need to be done
1. Read dataset and do basic analysis
2. Standardize data using CountVectorizer and TF-IDF.

In [55]:
dataset = pd.read_csv("data/spam.csv", encoding = "latin-1")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.7+ KB


In [56]:
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [57]:
dataset = dataset.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

In [58]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [59]:
from sklearn import preprocessing
target = dataset.v1
le = preprocessing.LabelEncoder()
encode_target = pd.DataFrame(le.fit_transform(target), columns=['result'])
encode_target.head()

Unnamed: 0,result
0,0
1,0
2,1
3,0
4,0


In [60]:
le.classes_

array(['ham', 'spam'], dtype=object)

In [61]:
text = dataset[['v2']]

In [62]:
text['v2'] = text['v2'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
text['chart_cnt'] = text['v2'].str.len()
text['word_cnt'] = text['v2'].str.split().str.len()
text['avg_word_length'] = text['chart_cnt'] / text['word_cnt']

In [64]:
text.head()

Unnamed: 0,v2,chart_cnt,word_cnt,avg_word_length
0,"go until jurong point, crazy.. available only ...",111,20,5.55
1,ok lar... joking wif u oni...,29,6,4.833333
2,free entry in 2 a wkly comp to win fa cup fina...,155,28,5.535714
3,u dun say so early hor... u c already then say...,49,11,4.454545
4,"nah i don't think he goes to usf, he lives aro...",61,13,4.692308


### We standard text data using CountVectorizer

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
cv.fit(text['v2'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [66]:
cv.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [67]:
text_transfom = cv.transform(text['v2'])

In [68]:
text_transfom.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [69]:
df_text = pd.DataFrame(text_transfom.toarray(), columns=cv.get_feature_names())

In [70]:
df_text.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,ó_,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
df_text.shape

(5572, 8404)

In [72]:
new_dataset = pd.concat([df_text, pd.DataFrame(encode_target)], axis= 1)

In [73]:
new_dataset.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell,result
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### We standardize using TF-IDF method

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features = 1000, stop_words='english')
tv.fit(text['v2'])
tv_transform = tv.transform(text['v2'])

In [75]:
tv_transform.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [76]:
df_text = pd.DataFrame(tv_transform.toarray(), columns=tv.get_feature_names())

In [77]:
new_dataset = pd.concat([df_text, pd.DataFrame(encode_target)], axis= 1)

In [78]:
new_dataset.head()

Unnamed: 0,00,000,02,03,04,06,0800,08000839402,08000930705,0870,...,yes,yesterday,yo,yr,yup,ì_,ìï,û_,ûò,result
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


### TF-IDF với N-grams

In [79]:
tv_ngram = TfidfVectorizer(max_features = 1000, stop_words='english', ngram_range=(1,2))
tv_ngram.fit(text['v2'])
tv_ngram_tranform = tv_ngram.transform(text['v2'])

In [80]:
tv_ngram.get_feature_names()

['00',
 '000',
 '03',
 '04',
 '0800',
 '08000839402',
 '08000930705',
 '10',
 '100',
 '1000',
 '1000 cash',
 '10p',
 '10p min',
 '11',
 '12',
 '12hrs',
 '150',
 '150p',
 '150p msg',
 '150ppm',
 '16',
 '18',
 '1st',
 '1st week',
 '20',
 '200',
 '2000',
 '2000 prize',
 '2003',
 '2003 account',
 '250',
 '2lands',
 '2lands row',
 '2nd',
 '2nd attempt',
 '2nite',
 '30',
 '350',
 '350 award',
 '50',
 '500',
 '5000',
 '750',
 '800',
 '800 redeemed',
 '8007',
 '86688',
 '86688 150p',
 '87066',
 '900',
 'abiola',
 'able',
 'abt',
 'ac',
 'account',
 'account statement',
 'actually',
 'address',
 'admirer',
 'aft',
 'afternoon',
 'age',
 'ago',
 'ah',
 'aight',
 'alright',
 'amp',
 'angry',
 'ans',
 'answer',
 'anytime',
 'apply',
 'ard',
 'area',
 'asap',
 'ask',
 'askd',
 'asked',
 'asking',
 'ass',
 'attempt',
 'attempt contact',
 'auction',
 'available',
 'await',
 'await collection',
 'award',
 'awarded',
 'away',
 'awesome',
 'b4',
 'babe',
 'baby',
 'bad',
 'balance',
 'bank',
 'bath',
 '

In [81]:
tv_ngram_tranform.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [82]:
df_text = pd.DataFrame(tv_ngram_tranform.toarray(), columns=tv_ngram.get_feature_names())

In [83]:
df_text.head()

Unnamed: 0,00,000,03,04,0800,08000839402,08000930705,10,100,1000,...,years,yep,yes,yesterday,yo,yr,yup,ì_,ìï,û_
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
