### P087 - 邮件文本分类 - 加载文本数据

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read the e-mail texts and their class labels from two separate CSV files,
# resolved relative to the current working directory.
# NOTE(review): the rows of the two files are presumably aligned one-to-one — confirm.
data_train = pd.read_csv("email_data_train.csv")
target_train = pd.read_csv("email_target_train.csv")

In [5]:
# List the column names of the text DataFrame.
data_train.columns

Index(['text'], dtype='object')

In [6]:
# Show the raw text stored in the row labelled 1, using one label-based lookup
# rather than chained column-then-row indexing.
data_train.loc[1, "text"]

"From: stephens@geod.emr.ca (Dave Stephenson)\r\nSubject: Re: Clementine Science Team Selected\r\nNntp-Posting-Host: ngis.geod.emr.ca\r\nOrganization: Dept. of Energy, Mines, and Resources, Ottawa\r\nLines: 32\r\n\r\nnickh@CS.CMU.EDU (Nick Haines) writes:\r\n\r\n>In article <stephens.734792933@ngis> stephens@geod.emr.ca (Dave Stephenson) writes:\r\n\r\n>   Remember the first government scientist in the British Empire was\r\n>   the Astronomer Royal, who was paid [...] from the Department\r\n>   of Ordinance Budget (i.e. the military). Flamsteed House (the original\r\n>   RGO) was built out of Army Surplus Scrap ( A gate house at the Tower of\r\n>   London ?), and paid for by the sale of time expired gunpowder [...]\r\n\r\n>At the time, astronomy was vital to the military, in that navigation\r\n>and cartography were of primary impoortance to the military, and good\r\n>cartography was impossible without good astronomy.\r\n\r\n>The relevance these daysis somewhat less obvious.\r\n\r\n>Nic

In [7]:
# List the column names of the label DataFrame.
target_train.columns

Index(['target'], dtype='object')

In [8]:
# Show the label stored in the row labelled 1, using one label-based lookup
# rather than chained column-then-row indexing.
target_train.loc[1, "target"]

1

### P088 - 邮件文本分类 - 输入数据转换成list

In [9]:
# Preview the first rows of the text DataFrame before it is converted to a list.
data_train.head()

Unnamed: 0,text
0,From: ab@nova.cc.purdue.edu (Allen B)\r\nSubje...
1,From: stephens@geod.emr.ca (Dave Stephenson)\r...
2,From: dotzlaw@ccu.umanitoba.ca (Helmut Dotzlaw...
3,"From: flb@flb.optiplan.fi (""F.Baube[tm]"")\r\nS..."
4,From: cchung@sneezy.phy.duke.edu (Charles Chun...


In [10]:
# Preview the first rows of the label DataFrame before it is converted to a list.
target_train.head()

Unnamed: 0,target
0,0
1,1
2,0
3,1
4,1


In [11]:
# Replace the DataFrame with a plain Python list holding its "text" column.
# The name is rebound to a different type, so the conversion is guarded: running
# this cell a second time is a no-op instead of a TypeError from indexing a list
# with a string.
if isinstance(data_train, pd.DataFrame):
    data_train = data_train["text"].tolist()

In [12]:
# Confirm that data_train is now a list rather than a DataFrame.
type(data_train)

list

In [13]:
# Number of training texts.
len(data_train)

50

In [14]:
# Replace the DataFrame with a plain Python list holding its "target" column.
# The name is rebound to a different type, so the conversion is guarded: running
# this cell a second time is a no-op instead of a TypeError from indexing a list
# with a string.
if isinstance(target_train, pd.DataFrame):
    target_train = target_train["target"].tolist()

In [15]:
# Confirm that target_train is now a list rather than a DataFrame.
type(target_train)

list

In [16]:
# Number of training labels; compare it with the number of training texts.
len(target_train)

50

### P089 - 邮件文本分类 - 使用计数器向量化文本

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Create a CountVectorizer; no arguments are passed, so library defaults apply.
vectorizer = CountVectorizer()

In [19]:
# Check the type of the object that is about to be passed to the vectorizer.
type(data_train)

list

In [20]:
# Fit the vectorizer on the training texts and transform them in a single call.
data_train_vectorized = vectorizer.fit_transform(data_train)

In [21]:
# Inspect the container type returned by fit_transform.
type(data_train_vectorized)

scipy.sparse.csr.csr_matrix

In [23]:
# Dimensions of the vectorized training data.
data_train_vectorized.shape

(50, 3225)

In [25]:
# Number of feature names exposed by the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# use `get_feature_names_out()` when it exists and fall back to the old name only
# on releases that predate it.
get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
len(get_names())

3225

In [27]:
# Print feature names 1000-1099 of the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# use `get_feature_names_out()` when it exists and fall back to the old name only
# on releases that predate it. The slice is converted to a list so the printed
# form stays a plain list of strings with either API.
get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
print(list(get_names()[1000:1100]))

['diving', 'division', 'dizzying', 'dkauni2', 'dmorf', 'dmorfx', 'dmorph', 'dnw', 'do', 'documentation', 'does', 'doesn', 'doing', 'dollars', 'domain', 'dominik', 'don', 'donations', 'done', 'dos', 'dotzlaw', 'double', 'douglas', 'down', 'downloaded', 'downsizing', 'downwards', 'dr', 'drawn', 'drive', 'driver', 'dropped', 'dry', 'dryden', 'dseg', 'dsp', 'dt', 'dta', 'dtax', 'dtek', 'du', 'duc', 'due', 'duke', 'dumb', 'dundee', 'dunn', 'durham', 'during', 'durned', 'dutch', 'dux', 'dwestner', 'dxf', 'dynamic', 'dynamically', 'dynamics', 'each', 'earlham', 'earlier', 'early', 'earth', 'easier', 'easily', 'easy', 'eberhart', 'economically', 'ecpdsharmony', 'ed', 'edge', 'edged', 'edges', 'edimg', 'edinburgh', 'edition', 'ednobles', 'edu', 'educate', 'education', 'educational', 'edward', 'edwards', 'efectively', 'efficiency', 'efficient', 'efforts', 'either', 'electronic', 'electronics', 'else', 'elson', 'elvis', 'email', 'embarrassed', 'emory', 'empire', 'emr', 'enclosed', 'encod', 'encod

### P090 - 邮件文本分类 - 计数器数据训练分类模型

In [28]:
from sklearn.naive_bayes import MultinomialNB

In [29]:
# Create a MultinomialNB classifier; no arguments are passed, so library defaults apply.
classifier = MultinomialNB()

In [30]:
# Train the classifier on the vectorized training texts and their labels.
classifier.fit(data_train_vectorized, target_train)

MultinomialNB()

In [31]:
# Two short, hand-written sentences used to try out the trained classifier.
docs = [
    'The graphic designer requires a good processor to work',
    'Flights into space',
]

In [32]:
# Vectorize the new sentences with the already-fitted vectorizer (transform only, no refit).
data_new = vectorizer.transform(docs)

In [33]:
# Predict a label for each vectorized sentence.
data_pred = classifier.predict(data_new)

In [34]:
# Pair every input sentence with its predicted label for display.
list(zip(docs, data_pred))

[('The graphic designer requires a good processor to work', 0),
 ('Flights into space', 1)]

### P091 - 邮件文本分类 - 使用TFIDF向量化文本

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# Create a TfidfVectorizer; no arguments are passed, so library defaults apply.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer created earlier;
# cells from the count-based section must not be re-run after this point without
# re-creating their own vectorizer first.
vectorizer = TfidfVectorizer()

In [37]:
# Fit the vectorizer on the training texts and transform them in a single call.
# NOTE: this rebinds `data_train_vectorized`, discarding the count-based matrix
# produced earlier in the notebook.
data_train_vectorized = vectorizer.fit_transform(data_train)

In [38]:
# Dimensions of the TF-IDF vectorized training data.
data_train_vectorized.shape

(50, 3225)

In [43]:
# Show row 1 of the vectorized data in dense form.
data_train_vectorized[1].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [45]:
# Show feature names 1000-1099 of the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# use `get_feature_names_out()` when it exists and fall back to the old name only
# on releases that predate it. The slice is converted to a list so the displayed
# value stays a plain list of strings with either API.
get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
list(get_names()[1000:1100])

['diving',
 'division',
 'dizzying',
 'dkauni2',
 'dmorf',
 'dmorfx',
 'dmorph',
 'dnw',
 'do',
 'documentation',
 'does',
 'doesn',
 'doing',
 'dollars',
 'domain',
 'dominik',
 'don',
 'donations',
 'done',
 'dos',
 'dotzlaw',
 'double',
 'douglas',
 'down',
 'downloaded',
 'downsizing',
 'downwards',
 'dr',
 'drawn',
 'drive',
 'driver',
 'dropped',
 'dry',
 'dryden',
 'dseg',
 'dsp',
 'dt',
 'dta',
 'dtax',
 'dtek',
 'du',
 'duc',
 'due',
 'duke',
 'dumb',
 'dundee',
 'dunn',
 'durham',
 'during',
 'durned',
 'dutch',
 'dux',
 'dwestner',
 'dxf',
 'dynamic',
 'dynamically',
 'dynamics',
 'each',
 'earlham',
 'earlier',
 'early',
 'earth',
 'easier',
 'easily',
 'easy',
 'eberhart',
 'economically',
 'ecpdsharmony',
 'ed',
 'edge',
 'edged',
 'edges',
 'edimg',
 'edinburgh',
 'edition',
 'ednobles',
 'edu',
 'educate',
 'education',
 'educational',
 'edward',
 'edwards',
 'efectively',
 'efficiency',
 'efficient',
 'efforts',
 'either',
 'electronic',
 'electronics',
 'else',
 'elso

### P092 - 邮件文本分类 - TFIDF数据训练分类模型

In [46]:
from sklearn.naive_bayes import MultinomialNB

In [47]:
# Create a fresh MultinomialNB classifier with library defaults.
# NOTE: this rebinds `classifier`, discarding the model trained on count features
# earlier in the notebook.
classifier = MultinomialNB()

In [48]:
# Train the classifier on the vectorized training texts and their labels.
classifier.fit(data_train_vectorized, target_train)

MultinomialNB()

In [49]:
# Two short, hand-written sentences used to try out the trained classifier.
docs = [
    'The graphic designer requires a good processor to work',
    'Flights into space',
]

In [50]:
# Vectorize the new sentences with the already-fitted vectorizer (transform only, no refit).
data_new = vectorizer.transform(docs)

In [51]:
# Check which vectorizer class `vectorizer` currently refers to.
type(vectorizer)

sklearn.feature_extraction.text.TfidfVectorizer

In [52]:
# Predict a label for each vectorized sentence.
data_pred = classifier.predict(data_new)

In [53]:
# Pair every input sentence with its predicted label for display.
list(zip(docs, data_pred))

[('The graphic designer requires a good processor to work', 0),
 ('Flights into space', 1)]