**Задача:**

По данному дата сету построить модель, которая будет классифицировать принадлежность сообщения к спаму или не спаму.

In [6]:
import pandas as pd
import numpy as np

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold



In [9]:
from textblob import TextBlob

In [10]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
STOPWORDS = set(stopwords.words('english'))

## Считаем данные

In [12]:
df = pd.read_csv('07 - SMS.csv', encoding='latin-1')

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,1,Urgent! call 09061749602 from Landline. Your c...,spam
1,2,+449071512431 URGENT! This is the 2nd attempt ...,spam
2,3,FREE for 1st week! No1 Nokia tone 4 ur mob eve...,spam
3,4,Urgent! call 09066612661 from landline. Your c...,spam
4,5,WINNER!! As a valued network customer you have...,spam


In [14]:
df = df.drop('Unnamed: 0', axis=1)

In [15]:
df.head()

Unnamed: 0,text,label
0,Urgent! call 09061749602 from Landline. Your c...,spam
1,+449071512431 URGENT! This is the 2nd attempt ...,spam
2,FREE for 1st week! No1 Nokia tone 4 ur mob eve...,spam
3,Urgent! call 09066612661 from landline. Your c...,spam
4,WINNER!! As a valued network customer you have...,spam


In [16]:
len(df)

1324

In [17]:
len(df[df.label == 'spam'])

322

In [18]:
df['label'].value_counts()

not spam    1002
spam         322
Name: label, dtype: int64

In [19]:
df.label.unique()

array(['spam', 'not spam'], dtype=object)

## Проведем некоторый анализ

In [20]:
spam = df[df.label == 'spam']
ham = df[df.label == 'not spam']

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')

In [22]:
# corpus_spam = []
# for message in spam['text']:
#     corpus_spam.append(message)
corpus_spam = spam['text'].tolist()
tfidf_matrix_spam =  tf.fit_transform(corpus_spam)
feature_names_spam = np.array(tf.get_feature_names())

In [23]:
len(feature_names_spam)

7551

In [24]:
# corpus_ham = []
# for message in ham['text']:
#     corpus_ham.append(message)
corpus_ham = ham['text'].tolist()
tfidf_matrix_ham =  tf.fit_transform(corpus_ham)
feature_names_ham = np.array(tf.get_feature_names())

In [25]:
len(feature_names_ham)

12604

In [26]:
feature_names_ham

array(['000pes', '000pes 48', '000pes 48 tb', ..., 'zoom cine',
       'zoom cine actually', 'èn'], dtype='<U36')

In [27]:
def tfidf_print(tfidf_matrix, feature_names):
    message_id = 0
    for message in tfidf_matrix.todense():
        print ("Message " + str(message_id))
        word_id = 0
        for score in message.tolist()[0]:
            if score > 0:
                word = feature_names[word_id]
                print(word + " " + str(score))
            word_id +=1
        message_id +=1

In [28]:
tfidf_print(tfidf_matrix_ham, feature_names_ham)

Message 0
bedbut 0.15585878907526726
bedbut mite 0.15585878907526726
bedbut mite thepub 0.15585878907526726
bellearlier 0.15585878907526726
bellearlier hunny 0.15585878907526726
bellearlier hunny just 0.15585878907526726
didntgive 0.15585878907526726
didntgive ya 0.15585878907526726
didntgive ya bellearlier 0.15585878907526726
hey 0.08634955643562726
hey sorry 0.15585878907526726
hey sorry didntgive 0.15585878907526726
hunny 0.12880630325188636
hunny just 0.15585878907526726
hunny just bedbut 0.15585878907526726
jenxxx 0.15585878907526726
just 0.104078021679787
just bedbut 0.15585878907526726
just bedbut mite 0.15585878907526726
l8tr 0.14710307122215585
l8tr uwana 0.15585878907526726
l8tr uwana mt 0.15585878907526726
loads 0.14089079054148582
loads luv 0.15585878907526726
loads luv jenxxx 0.15585878907526726
luv 0.1138383047181049
luv jenxxx 0.15585878907526726
mite 0.1321350726883744
mite thepub 0.15585878907526726
mite thepub l8tr 0.15585878907526726
mt 0.14089079054148582
mt loads 0

tired 0.1454633518586305
tired got 0.1541214720206398
tired got pist 0.1541214720206398
txt 0.11771904544089343
txt ring 0.1541214720206398
txt ring meet 0.1541214720206398
Message 41
2moro 0.2201099845221679
2nite 0.20643115630549078
2nite 2moro 0.24349409581103426
im 0.13998982819309355
im realy 0.24349409581103426
im realy soz 0.24349409581103426
imat 0.24349409581103426
imat mums 0.24349409581103426
imat mums 2nite 0.24349409581103426
mums 0.24349409581103426
mums 2nite 0.24349409581103426
mums 2nite 2moro 0.24349409581103426
realy 0.24349409581103426
realy soz 0.24349409581103426
realy soz imat 0.24349409581103426
soz 0.24349409581103426
soz imat 0.24349409581103426
soz imat mums 0.24349409581103426
Message 42
come 0.10152146658873641
come tb 0.1823225620174173
come tb love 0.1823225620174173
darlin 0.12979417031470572
darlin im 0.15917630976039643
darlin im helens 0.1823225620174173
fone 0.14432833853699437
fone im 0.1823225620174173
fone im gonna 0.1823225620174173
gonna 0.16481

luv 0.1006860925000025
luv jaz 0.13785177575009114
nice 0.09697783241547836
nice wkend 0.13785177575009114
nice wkend im 0.13785177575009114
nitw 0.13785177575009114
nitw wen 0.13785177575009114
nitw wen texd 0.13785177575009114
sure 0.11137440877516752
sure did 0.13785177575009114
sure did lookin 0.13785177575009114
texd 0.13785177575009114
texd hopeu 0.13785177575009114
texd hopeu ad 0.13785177575009114
wen 0.1168689588635478
wen texd 0.13785177575009114
wen texd hopeu 0.13785177575009114
wkend 0.13785177575009114
wkend im 0.13785177575009114
wkend im sure 0.13785177575009114
Message 79
4eva 0.21656934928937702
baby 0.3449934361938538
baby 4eva 0.22945976756722544
baby ur 0.22945976756722544
baby ur baby 0.22945976756722544
kiss 0.22945976756722544
kiss miss 0.22945976756722544
kiss miss baby 0.22945976756722544
miss 0.2074234519710004
miss baby 0.22945976756722544
miss baby ur 0.22945976756722544
need 0.14791571009346227
need kiss 0.22945976756722544
need kiss miss 0.229459767567225

2nite 0.10862124322843912
2nite tell 0.12812325367513197
2nite tell every1 0.12812325367513197
ava 0.12812325367513197
ava goodtime 0.12812325367513197
ava goodtime oli 0.12812325367513197
comin 0.10351445638457216
comin 2nite 0.12812325367513197
comin 2nite tell 0.12812325367513197
every1 0.12812325367513197
every1 im 0.12812325367513197
every1 im sorry 0.12812325367513197
everythin 0.12812325367513197
everythin mon 0.12812325367513197
everythin mon l8rs 0.12812325367513197
explain 0.12812325367513197
explain everythin 0.12812325367513197
explain everythin mon 0.12812325367513197
goodtime 0.12812325367513197
goodtime oli 0.12812325367513197
goodtime oli rang 0.12812325367513197
hi 0.07539395641888383
hi hun 0.12812325367513197
hi hun im 0.12812325367513197
hope 0.08638282777393358
hope ava 0.12812325367513197
hope ava goodtime 0.12812325367513197
hun 0.11185772341524246
hun im 0.12812325367513197
hun im comin 0.12812325367513197
ifink 0.12812325367513197
ifink mite 0.12812325367513197

squid 0.21550329024336182
squid 10 0.21550329024336182
squid 10 30 0.21550329024336182
wanna 0.1620049157209163
wanna dosomething 0.21550329024336182
wanna dosomething late 0.21550329024336182
Message 160
adress 0.22289536674425844
beendropping 0.22289536674425844
beendropping red 0.22289536674425844
beendropping red wine 0.22289536674425844
hi 0.13116247897433395
hi missed 0.22289536674425844
hi missed mumhas 0.22289536674425844
missed 0.2103737184396765
missed mumhas 0.22289536674425844
missed mumhas beendropping 0.22289536674425844
mumhas 0.22289536674425844
mumhas beendropping 0.22289536674425844
mumhas beendropping red 0.22289536674425844
red 0.2103737184396765
red wine 0.22289536674425844
red wine theplace 0.22289536674425844
theplace 0.22289536674425844
theplace adress 0.22289536674425844
wine 0.22289536674425844
wine theplace 0.22289536674425844
wine theplace adress 0.22289536674425844
Message 161
2go 0.13861111266517734
2go thanx 0.1468613808867652
2go thanx xx 0.1468613808867

cine got amore 0.17461176043176843
crazy 0.1524444114801334
crazy available 0.17461176043176843
crazy available bugis 0.17461176043176843
got 0.08418824766203405
got amore 0.17461176043176843
got amore wat 0.17461176043176843
great 0.1524444114801334
great world 0.17461176043176843
great world la 0.17461176043176843
jurong 0.17461176043176843
jurong point 0.17461176043176843
jurong point crazy 0.17461176043176843
la 0.14803360011833955
la buffet 0.17461176043176843
la buffet cine 0.17461176043176843
point 0.16480255225527393
point crazy 0.17461176043176843
point crazy available 0.17461176043176843
wat 0.08477122729519093
world 0.17461176043176843
world la 0.17461176043176843
world la buffet 0.17461176043176843
Message 204
dunno 0.22138872278720007
dunno lets 0.3569310763080414
dunno lets learn 0.3569310763080414
learn 0.33687967070106356
learn pilates 0.3569310763080414
lets 0.33687967070106356
lets learn 0.3569310763080414
lets learn pilates 0.3569310763080414
pilates 0.29497837579601

sugardad ah gee 0.14442688547608556
time 0.07923118325100334
time reply 0.14442688547608556
time reply fast 0.14442688547608556
ur 0.0698743986392252
ur sugardad 0.14442688547608556
ur sugardad ah 0.14442688547608556
workin 0.1260915731212384
workin huh 0.14442688547608556
workin huh got 0.14442688547608556
Message 246
ah 0.09616241143591522
ah waitin 0.14534552378635432
ah waitin treat 0.14534552378635432
den 0.09369348965921107
den dun 0.1371804122450896
den dun work 0.14534552378635432
dun 0.07973513928427492
dun work 0.1371804122450896
dun work frm 0.14534552378635432
ex 0.1201178305324089
ex wat 0.14534552378635432
ex wat ah 0.14534552378635432
frm 0.1371804122450896
frm tmr 0.14534552378635432
frm tmr onwards 0.14534552378635432
gd 0.10347047266154592
gd den 0.14534552378635432
gd den dun 0.14534552378635432
liao 0.08605146318637753
liao gd 0.14534552378635432
liao gd den 0.14534552378635432
mei 0.1371804122450896
mei ex 0.14534552378635432
mei ex wat 0.14534552378635432
onwards 

la 0.11738997918437784
la ex 0.13846634079051529
la ex oso 0.13846634079051529
lor 0.05615588825794953
lor haha 0.13068768281872106
lor haha best 0.13846634079051529
mon 0.10961132121258362
mon okie 0.13846634079051529
mon okie lor 0.13846634079051529
okie 0.19262723515648078
okie depends 0.13846634079051529
okie depends wana 0.13846634079051529
okie lor 0.1208877327587912
okie lor haha 0.13846634079051529
oso 0.09079457191569139
oso okie 0.13846634079051529
oso okie depends 0.13846634079051529
prefer 0.13068768281872106
wana 0.11443267067492177
wana eat 0.13846634079051529
wana eat western 0.13846634079051529
western 0.13846634079051529
western chinese 0.13846634079051529
western chinese food 0.13846634079051529
Message 295
beehoon 0.1536867577322393
beehoon eat 0.1536867577322393
beehoon eat cheap 0.1536867577322393
cafe 0.1536867577322393
cafe tok 0.1536867577322393
cafe tok nydc 0.1536867577322393
cake 0.14505305862263607
cake fishhead 0.1536867577322393
cake fishhead beehoon 0.153

wan 0.11950988414013117
wan shun 0.218902163017549
wan shun bian 0.218902163017549
watch 0.15062290149184987
watch da 0.218902163017549
watch da glass 0.218902163017549
Message 358
browser 0.2191487653984028
browser surf 0.2191487653984028
da 0.11690725089961056
da browser 0.2191487653984028
da browser surf 0.2191487653984028
dial 0.2191487653984028
dial juz 0.2191487653984028
dial juz open 0.2191487653984028
dun 0.1202228790939832
dun need 0.20683759084693437
dun need use 0.2191487653984028
juz 0.14369931513033246
juz open 0.2191487653984028
juz open da 0.2191487653984028
need 0.14126897099951718
need use 0.2191487653984028
need use dial 0.2191487653984028
open 0.19810267349286884
open da 0.2191487653984028
open da browser 0.2191487653984028
surf 0.2191487653984028
use 0.18111100760285928
use dial 0.2191487653984028
use dial juz 0.2191487653984028
Message 359
ar 0.1711258232206943
did 0.14732823421218272
did use 0.21180807033768614
did use soc 0.21180807033768614
dunno 0.1313752746135

Message 430
come 0.2870820893146953
come come 0.25778558856524364
come come lor 0.25778558856524364
come lor 0.25778558856524364
come lor din 0.25778558856524364
din 0.20406561472544776
din stripes 0.25778558856524364
din stripes skirt 0.25778558856524364
lor 0.10454655350415072
lor din 0.25778558856524364
lor din stripes 0.25778558856524364
skirt 0.25778558856524364
stripes 0.25778558856524364
stripes skirt 0.25778558856524364
wan 0.14073833441270278
wan come 0.21854729967544143
wan come come 0.25778558856524364
Message 431
mum 0.454119082552853
mum repent 0.6299904201102406
repent 0.6299904201102406
Message 432
forgot 0.17547435564021296
forgot lect 0.22166772245132307
forgot lect saw 0.22166772245132307
got 0.10687606075481572
got lazy 0.22166772245132307
got lazy type 0.22166772245132307
lazy 0.18792703826684568
lazy type 0.22166772245132307
lazy type forgot 0.22166772245132307
lect 0.17909171933563367
lect saw 0.22166772245132307
lect saw pouch 0.22166772245132307
like 0.130440073

lor 0.4592062605335728
ok 0.47741549953433193
ok lor 0.7491355358619406
Message 503
movie 0.5879237953295854
movie wat 0.7276926547419472
wat 0.3532831882780059
Message 504
going 0.2705537136798484
going today 0.5048865481120323
going today meeting 0.5048865481120323
meeting 0.34381587942862835
today 0.3007823504814542
today meeting 0.4563994454166726
Message 505
ben 0.44377354644230493
ben going 0.44377354644230493
going 0.2519594592502865
noe 0.3170081484701602
noe ben 0.4701873794849575
noe ben going 0.4701873794849575
Message 506
abt 0.14915367991646034
abt leona 0.20137899849406793
abt leona oops 0.20137899849406793
ben 0.19006608054559912
ben going 0.19006608054559912
ben going msg 0.20137899849406793
decide 0.18203943752302798
decide lor 0.20137899849406793
decide lor abt 0.20137899849406793
dunno 0.12490657785787973
dunno lei 0.16642554778262575
dunno lei decide 0.20137899849406793
going 0.10791302739879878
going msg 0.20137899849406793
lei 0.13204739763247927
lei decide 0.2013

juz 0.18253689880517313
juz remembered 0.2783780561273043
juz remembered gotta 0.2783780561273043
remembered 0.2783780561273043
remembered gotta 0.2783780561273043
remembered gotta bathe 0.2783780561273043
today 0.16584162592077847
Message 571
lor 0.1696579913249272
lor thanx 0.3948332124333316
lor thanx school 0.41833406920254884
ok 0.17638556274094988
ok lor 0.2767750380771817
ok lor thanx 0.3948332124333316
school 0.365225634424503
thanx 0.2613748176484655
thanx school 0.41833406920254884
Message 572
care 0.3939282104294705
ok 0.21376129575830394
ok thanx 0.4298095724926873
ok thanx care 0.5069781863264715
thanx 0.31675959659568625
thanx care 0.5069781863264715
Message 573
check 0.21896453647765443
check tell 0.26495246462509287
dun 0.14535034255094897
dun haf 0.21896453647765443
dun haf enuff 0.26495246462509287
enuff 0.23950758515721118
enuff check 0.26495246462509287
enuff check tell 0.26495246462509287
haf 0.1737335258973673
haf enuff 0.2500681643010124
haf enuff check 0.2649524

going liao 0.20067814329037256
going liao lazy 0.20067814329037256
got 0.09675603284630337
got fringe 0.20067814329037256
got fringe thk 0.20067814329037256
lazy 0.17013234356540521
lazy dun 0.20067814329037256
lazy dun wan 0.20067814329037256
length 0.20067814329037256
length shorter 0.20067814329037256
length shorter got 0.20067814329037256
liao 0.1188110057317317
liao lazy 0.20067814329037256
liao lazy dun 0.20067814329037256
shorter 0.18940459745852195
shorter got 0.20067814329037256
shorter got fringe 0.20067814329037256
thk 0.11231558188605499
thk going 0.20067814329037256
thk going liao 0.20067814329037256
wan 0.10956045990357066
wan distract 0.20067814329037256
Message 644
ah 0.1405071882378402
ah failed 0.2123708272830336
ah failed quite 0.2123708272830336
cut 0.16811484171969418
cut short 0.19197565900121594
cut short leh 0.2123708272830336
dun 0.23300913646601285
dun cut 0.2123708272830336
dun cut short 0.2123708272830336
dun like 0.19197565900121594
dun like ah 0.2004404186

leh said 0.1766284360890905
leh said ding 0.1766284360890905
lor 0.07163276406069917
make 0.13982081228482754
make reservations 0.1766284360890905
make reservations said 0.1766284360890905
reservations 0.1766284360890905
reservations said 0.1766284360890905
reservations said wait 0.1766284360890905
ron 0.15966581121505222
ron say 0.1766284360890905
ron say fri 0.1766284360890905
said 0.24571637482157846
said ding 0.1766284360890905
said ding tai 0.1766284360890905
said wait 0.1766284360890905
said wait lor 0.1766284360890905
say 0.11204570278390404
say fri 0.1766284360890905
say fri leh 0.1766284360890905
tai 0.1766284360890905
tai feng 0.1766284360890905
tai feng make 0.1766284360890905
wait 0.11685957640806277
wait lor 0.15420506570506406
Message 705
believe 0.33232942242484165
believe leh 0.2056675385265033
believe leh tot 0.2056675385265033
dun 0.11282720928351231
dun believe 0.17955758989951856
dun believe leh 0.2056675385265033
leh 0.12850092625983525
leh tot 0.2056675385265033
l

eh remember spell 0.2266229870657439
make 0.17939699198810607
make wet 0.2266229870657439
naughty 0.21389193130976253
naughty make 0.2266229870657439
naughty make wet 0.2266229870657439
remember 0.2048591035000688
remember spell 0.2266229870657439
remember spell yes 0.2266229870657439
spell 0.2266229870657439
spell yes 0.2266229870657439
spell yes did 0.2266229870657439
wet 0.2266229870657439
yes 0.2266229870657439
yes did 0.2266229870657439
yes did naughty 0.2266229870657439
Message 787
finish 0.4682547138267257
finish liao 0.7603297174505255
liao 0.45015135647985705
Message 788
Message 789
10 0.3517712193481629
10 min 0.4605499300251782
10 min later 0.4605499300251782
later 0.2797646890605288
min 0.4020820984010387
min later 0.4605499300251782
Message 790
actually 0.1368327063270588
actually thk 0.16139984245003117
actually thk quite 0.16139984245003117
ask 0.0980436078559012
ask come 0.16139984245003117
ask come fetch 0.16139984245003117
come 0.08987120700482942
come fetch 0.1523328

lor 0.11502505404607082
lor expected 0.28362294368544716
lor expected having 0.28362294368544716
Message 850
car 0.15596472310563889
car going 0.2135350920931116
car going dinner 0.2135350920931116
dad 0.15596472310563889
dad car 0.2135350920931116
dad car going 0.2135350920931116
dinner 0.13881060900963174
dinner leh 0.2135350920931116
dinner leh free 0.2135350920931116
free 0.14396890927185788
free tonight 0.2135350920931116
going 0.11442711710738677
going dinner 0.20153927825933063
going dinner leh 0.2135350920931116
havent 0.14127748035809598
havent stuck 0.2135350920931116
havent stuck orchard 0.2135350920931116
leh 0.13341656792089282
leh free 0.2135350920931116
leh free tonight 0.2135350920931116
orchard 0.16309957905130848
orchard dad 0.2135350920931116
orchard dad car 0.2135350920931116
stuck 0.20153927825933063
stuck orchard 0.2135350920931116
stuck orchard dad 0.2135350920931116
tonight 0.15201415449971586
Message 851
izzit 0.541449792169829
izzit raining 0.6325219292779553


bathe 0.1081405130582613
bathe dun 0.14158095654942104
bathe dun disturb 0.14158095654942104
bathing 0.12360694116250391
bathing bathe 0.14158095654942104
bathing bathe dun 0.14158095654942104
card 0.12003051028397867
card dunno 0.14158095654942104
card dunno network 0.14158095654942104
cleaning 0.14158095654942104
cleaning ur 0.14158095654942104
cleaning ur room 0.14158095654942104
disturb 0.11700668106172761
disturb liao 0.12360694116250391
disturb liao cleaning 0.14158095654942104
dun 0.07766993434942522
dun disturb 0.12360694116250391
dun disturb liao 0.12360694116250391
dunno 0.08781646996300005
dunno network 0.14158095654942104
dunno network gd 0.14158095654942104
finish 0.08719368554297906
finish bathing 0.13362732803558916
finish bathing bathe 0.14158095654942104
gd 0.10079050329458956
gd thk 0.14158095654942104
gd thk waiting 0.14158095654942104
jus 0.0905264355047044
jus testing 0.14158095654942104
jus testing phone 0.14158095654942104
lar 0.08382266032707668
lar jus 0.141580

say change suntec 0.2796017061104825
shuhui 0.2258982486162319
shuhui say 0.26389445171387793
shuhui say change 0.2796017061104825
steamboat 0.2796017061104825
steamboat noe 0.2796017061104825
suntec 0.2527499773633572
suntec steamboat 0.2796017061104825
suntec steamboat noe 0.2796017061104825
Message 980
working 1.0
Message 981
change 0.2505413735915
change suntec 0.2861304500871734
change suntec wat 0.3031612127308954
coming 0.2214270925517308
lor 0.12294892093679681
lor change 0.3031612127308954
lor change suntec 0.3031612127308954
suntec 0.27404693169112615
suntec wat 0.3031612127308954
suntec wat time 0.3031612127308954
time 0.16631128976644186
time coming 0.2861304500871734
wat 0.14717993798326537
wat time 0.20860013830769994
wat time coming 0.2861304500871734
Message 982
dat 0.18214915322886696
dat route 0.28713925898449244
lor 0.11645111765030547
lor noe 0.28713925898449244
lor noe used 0.28713925898449244
noe 0.19359406231506454
noe used 0.28713925898449244
noe used dat 0.2871

In [29]:
response = tf.transform(corpus_ham)
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]
top_n = feature_names_ham[tfidf_sorting][:100]impo
top_n

array(['horrible bf', 'horrible bf hungry', 'bf hungry', 'bf', 'hungry',
       'horrible', 'got improve wat', 'got lessons sch', 'got lessons',
       'got lesson lei', 'got lesson', 'got lect 12', 'got lect',
       'got lazy type', 'got lazy', 'got improve', 'got fringe thk',
       'got home stuff', 'got home', 'got hip hop', 'got hip',
       'got got colours', 'got got', 'got gd', 'got fujitsu ibm',
       'got like', 'got like mb', 'got lo', 'got new job', 'got place',
       'got pist love', 'got pist', 'got payed2day havbeen',
       'got payed2day', 'got paper da', 'got paper', 'got pages',
       'got offer 2000', 'got offer', 'got new', 'got lo oso', 'got meh',
       'got mc', 'got lunch today', 'got lunch', 'got lots hair',
       'got lots', 'got lot work', 'got lot pple', 'got lot model',
       'got lot', 'got fujitsu', 'èn', 'got porridge', 'got card',
       'got car', 'got bus8 22', 'got bus8', 'got bout 2mrw', 'got bout',
       'got bao ur', 'got bao', 'got bak ne

In [30]:
response = tf.transform(corpus_spam)
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

## Разделим на train/test sets

In [31]:
N = len(df)

inds = np.arange(N)
inds = np.arange(N)
np.random.seed(0)
np.random.shuffle(inds)

N1 = int(0.2*N)

train_inds = inds[:N1]
test_inds = inds[N1:]

In [32]:
train = df.loc[train_inds]
test = df.loc[test_inds]

## Модель классификации

Некоторые вспомогательные функции

In [None]:
def split_stopwords(message):
    message = message.lower()
    words = TextBlob(message).words
    return [word.lemma for word in words if word not in STOPWORDS]

In [None]:
def split_lemmas(message):
    message = message.lower()
    words = TextBlob(message).words
    return [word.lemma for word in words]

In [None]:
def split_tokens(message):
    return TextBlob(message).words

Naive Bayes Classifier

In [None]:
pipeline = Pipeline([('bow', CountVectorizer(analyzer=split_stopwords)),
                     ('tfidf', TfidfTransformer()),('classifier', MultinomialNB())])

In [None]:
params = {'tfidf__use_idf': (True, False),
        'bow__analyzer': (split_lemmas, split_tokens, split_stopwords),}

In [None]:
grid = GridSearchCV(pipeline, params, refit=True, n_jobs=-1,
        scoring='accuracy',
        cv=StratifiedKFold(train.label, n_folds=5),
        )

In [None]:
nb_detector = grid.fit(train.text, train.label)

In [None]:
predicted = nb_detector.predict(test.text)

In [None]:
confusion_matrix(test.label, predicted)

In [None]:
nb_detector.score(test.text, test.label)

In [None]:
print(metrics.classification_report(test.label, predicted,
    target_names=test.label.unique()))

Linear SVM Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
spam_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),
])

In [None]:
spam_clf.fit(train.text, train.label)

In [None]:
confusion_matrix(test.label, predicted)

In [None]:
spam_clf.score(test.text, test.label)

In [None]:
predicted = spam_clf.predict(test.text)

In [None]:
print(metrics.classification_report(test.label, predicted,
    target_names=test.label.unique()))