In [1]:
# 引入相關套件
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# 用pandas導入訓練集，分割符號採用table的方式，最後儲存成df1
df1 = pd.read_csv('train.csv', sep='\t') 
df1

Unnamed: 0,text,label
0,Get the latest from TODAY Sign up for our news...,1
1,2d Conan On The Funeral Trump Will Be Invited...,1
2,It’s safe to say that Instagram Stories has fa...,0
3,Much like a certain Amazon goddess with a lass...,0
4,At a time when the perfect outfit is just one ...,0
...,...,...
4982,The storybook romance of WWE stars John Cena a...,0
4983,The actor told friends he’s responsible for en...,0
4984,Sarah Hyland is getting real. The Modern Fami...,0
4985,Production has been suspended on the sixth and...,0


In [3]:
df1.columns # 看一下df1的columns有哪些

Index(['text', 'label'], dtype='object')

In [4]:
df1.index # 看一下df1的index有哪些

RangeIndex(start=0, stop=4987, step=1)

In [5]:
# 發現df1某一個label出錯
df1["label"].value_counts()

0        2972
1        2014
label       1
Name: label, dtype: int64

In [6]:
# 發現錯誤的行
df1[df1["label"] == 'label']

Unnamed: 0,text,label
1615,content,label


In [7]:
# 刪掉該行
df1 = df1.drop([1615], axis=0)

In [8]:
# 確認df1的label還有無出錯
df1["label"].value_counts()

0    2972
1    2014
Name: label, dtype: int64

In [9]:
# 看一下更新後df1的長相和長度
print(df1)
print(len(df1))

                                                   text label
0     Get the latest from TODAY Sign up for our news...     1
1     2d  Conan On The Funeral Trump Will Be Invited...     1
2     It’s safe to say that Instagram Stories has fa...     0
3     Much like a certain Amazon goddess with a lass...     0
4     At a time when the perfect outfit is just one ...     0
...                                                 ...   ...
4982  The storybook romance of WWE stars John Cena a...     0
4983  The actor told friends he’s responsible for en...     0
4984  Sarah Hyland is getting real.  The Modern Fami...     0
4985  Production has been suspended on the sixth and...     0
4986  A jury ruled against Bill Cosby in his sexual ...     0

[4986 rows x 2 columns]
4986


In [10]:
# 引入自然語言處理套件
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

## 導入文字轉換成向量套件
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
#将文本中的token轉換成詞頻矩陣
vectorizer = CountVectorizer()

In [None]:
[]

In [19]:
X = vectorizer.fit_transform(['adin rdejiocjm wssjdonwd wkmskm', 'EDDjdij adin demkm ekof', 'edkeor KOSDF.'])

In [21]:
print(vectorizer.get_feature_names())

['adin', 'demkm', 'eddjdij', 'edkeor', 'ekof', 'kosdf', 'rdejiocjm', 'wkmskm', 'wssjdonwd']


In [15]:
transformer = TfidfTransformer()

In [17]:
Y = transformer.fit_transform(X)
Y

<3x9 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [18]:
Y.toarray()

array([[0.40204024, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.52863461, 0.52863461, 0.52863461],
       [0.40204024, 0.52863461, 0.52863461, 0.        , 0.52863461,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.70710678, 0.        , 0.        , 0.        ]])

In [6]:
# 將每一個文本都變成小寫，避免同一個字因大寫與小寫而電腦有所區別
for i in range(len(df1)):
    df1.iloc[i, 0] = df1.iloc[i, 0].lower()

In [7]:
# 確認一下結果，發現都已變成小寫
df1

Unnamed: 0,text,label
0,get the latest from today sign up for our news...,1
1,2d conan on the funeral trump will be invited...,1
2,it’s safe to say that instagram stories has fa...,0
3,much like a certain amazon goddess with a lass...,0
4,at a time when the perfect outfit is just one ...,0
...,...,...
4982,the storybook romance of wwe stars john cena a...,0
4983,the actor told friends he’s responsible for en...,0
4984,sarah hyland is getting real. the modern fami...,0
4985,production has been suspended on the sixth and...,0


```
接下來會用兩種方式取stopwords，一種是找出在所有文章中時常出現的字詞當成stopwords；另一種是直接透過nltk裡的預設stopowords，兩種方式都會試試看！
```

## 自創stopwords

In [11]:
# 將所有文本透過word_tokenize進行斷詞，接著都放入all_words這個串列內，因為很大就先不print出來
all_words = []
for i in range(len(df1)):
    A = word_tokenize(df1.iloc[i, 0]) 
    all_words.extend(A)

In [12]:
# 觀察all_words的長度，發現一共快400萬個，但裡面有很多重複的字
len(all_words)

3988782

In [14]:
# 利用Counter計數器計算all_words中各種token的數量，並儲存成c
c = Counter(all_words)
c = dict(c)
c

{'get': 4447,
 'the': 165293,
 'latest': 662,
 'from': 12539,
 'today': 1320,
 'sign': 606,
 'up': 7611,
 'for': 32087,
 'our': 4237,
 'newsletter': 372,
 'no': 4577,
 'one': 8020,
 'ever': 1426,
 'truly': 284,
 'gets': 558,
 'over': 3978,
 'losing': 182,
 'a': 72536,
 'loved': 410,
 ',': 192906,
 'and': 86876,
 'blake': 517,
 'shelton': 486,
 'is': 27731,
 'exception': 49,
 '.': 134721,
 'he': 20027,
 'was': 29193,
 'just': 6346,
 '14': 956,
 'when': 7866,
 'his': 15407,
 'older': 335,
 'brother': 703,
 'richie': 205,
 'died': 558,
 'on': 33000,
 'nov.': 123,
 '13': 1036,
 '1990.': 11,
 'as': 19502,
 'noted': 382,
 'in': 66306,
 'tweet': 318,
 'monday': 630,
 '``': 21138,
 'it': 26635,
 'changed': 459,
 'my': 6961,
 'life': 3528,
 'forever': 236,
 "''": 17971,
 '24': 705,
 'car': 537,
 'accident': 177,
 'sheltons': 1,
 "'": 5904,
 'home': 2057,
 'state': 862,
 'of': 67880,
 'oklahoma': 71,
 'two': 4660,
 'years': 4106,
 'ago': 735,
 'sent': 386,
 'out': 7650,
 'message': 388,
 '25th':

In [15]:
# 看一下token的數量
len(c)

90058

In [16]:
# 將c裡面次數小於400的token剔除(因為我們想抓stopwords)
for i in list(c):
    if c[i] < 400:
        del c[i]

In [17]:
# 看一下更新之後的語料庫
c

{'get': 4447,
 'the': 165293,
 'latest': 662,
 'from': 12539,
 'today': 1320,
 'sign': 606,
 'up': 7611,
 'for': 32087,
 'our': 4237,
 'no': 4577,
 'one': 8020,
 'ever': 1426,
 'gets': 558,
 'over': 3978,
 'a': 72536,
 'loved': 410,
 ',': 192906,
 'and': 86876,
 'blake': 517,
 'shelton': 486,
 'is': 27731,
 '.': 134721,
 'he': 20027,
 'was': 29193,
 'just': 6346,
 '14': 956,
 'when': 7866,
 'his': 15407,
 'brother': 703,
 'died': 558,
 'on': 33000,
 '13': 1036,
 'as': 19502,
 'in': 66306,
 'monday': 630,
 '``': 21138,
 'it': 26635,
 'changed': 459,
 'my': 6961,
 'life': 3528,
 "''": 17971,
 '24': 705,
 'car': 537,
 "'": 5904,
 'home': 2057,
 'state': 862,
 'of': 67880,
 'two': 4660,
 'years': 4106,
 'ago': 735,
 'out': 7650,
 ':': 19603,
 'who': 10057,
 "'s": 24680,
 '(': 15866,
 'they': 12327,
 'shared': 1473,
 'mother': 1582,
 ')': 15930,
 'that': 40698,
 'with': 28634,
 'school': 1177,
 'city': 1395,
 'boy': 549,
 'all': 7812,
 'during': 3426,
 'or': 6065,
 'after': 8392,
 'while': 

In [18]:
# 發現更新之後token的數量，從90000多個少到1000多個
len(c)

1061

In [19]:
# 看一下更新後的token出現在各個文本中的數量，並用字典的形式表示
box = []
dict = {}
for i in c:
    num = 0
    for j in range(len(df1)):
        if i in df1.iloc[j, 0]:
            num += 1
    a = {i:num}
    dict.update(a)
print(dict)

{'get': 3180, 'the': 4898, 'latest': 543, 'from': 3231, 'today': 712, 'sign': 1251, 'up': 3992, 'for': 4545, 'our': 3839, 'no': 4514, 'one': 3748, 'ever': 3254, 'gets': 415, 'over': 2699, 'a': 4977, 'loved': 424, ',': 4801, 'and': 4764, 'blake': 135, 'shelton': 93, 'is': 4819, '.': 4918, 'he': 4938, 'was': 3767, 'just': 2658, '14': 947, 'when': 2604, 'his': 4161, 'brother': 489, 'died': 396, 'on': 4903, '13': 916, 'as': 4762, 'in': 4938, 'monday': 436, '``': 0, 'it': 4832, 'changed': 380, 'my': 2539, 'life': 1859, "''": 18, '24': 560, 'car': 2299, "'": 2737, 'home': 1146, 'state': 1416, 'of': 4705, 'two': 2170, 'years': 1715, 'ago': 849, 'out': 4052, ':': 3359, 'who': 3135, "'s": 2398, '(': 2569, 'they': 2749, 'shared': 943, 'mother': 879, ')': 2567, 'that': 4138, 'with': 4272, 'school': 481, 'city': 743, 'boy': 956, 'all': 4220, 'during': 1550, 'or': 4874, 'after': 2778, 'while': 2048, 'were': 2227, 'according': 1213, 'to': 4884, 'police': 327, 'reports': 562, 'has': 3396, 'told': 184

In [20]:
# 確認一下dict的數量有沒有錯誤
len(dict)

1061

In [21]:
# 把出現在少於400篇文本的token去掉，剩下的字當成stopwords
for i in list(dict):
    if dict[i] < 400:
        del dict[i]

In [23]:
# 看一下再度更新後token的數量，發現有只剩768個
len(dict)

768

In [25]:
# 建立stopwords，然後檢查一下stopwords的長度
stop_words1 = []
for i in dict:
    stop_words1.append(i)

len(stop_words1)

768

In [70]:
# 先把df1裡的文本都先斷詞化再放回原本的dataframe，為了避免覆蓋df1，所以選擇使用df2=df1，然後對df2進行操作
df2 = df1
for i in range(len(df2)):
    df2.iloc[i, 0] = word_tokenize(df2.iloc[i, 0])
    
df2

Unnamed: 0,text,label
0,"[newsletter, truly, losing, blake, shelton, ex...",1
1,"[2d, conan, funeral, invited, conan, tbs]",1
2,"[safe, surpassed, competitor, snapchat, popula...",0
3,"[certain, amazon, goddess, lasso, heights, pat...",0
4,"[outfit, click, demand, trendy, clothing, prob...",0
...,...,...
4982,"[storybook, romance, wwe, cena, nikki, bella, ...",0
4983,"[responsible, encouraging, brad, reignite, rom...",0
4984,"[sarah, hyland, modern, candid, hospital, gown...",0
4985,"[production, suspended, sixth, netflix, cards,...",0


In [71]:
# 將所有文本中的stopwords去除
for i in range(len(df2)):
    box = []
    for j in df2.iloc[i, 0]:
        if j not in stop_words1:
            box.append(j)
    df2.iloc[i, 0] = box 

df2

Unnamed: 0,text,label
0,"[newsletter, truly, losing, blake, shelton, ex...",1
1,"[2d, conan, funeral, invited, conan, tbs]",1
2,"[safe, surpassed, competitor, snapchat, popula...",0
3,"[certain, amazon, goddess, lasso, heights, pat...",0
4,"[outfit, click, demand, trendy, clothing, prob...",0
...,...,...
4982,"[storybook, romance, wwe, cena, nikki, bella, ...",0
4983,"[responsible, encouraging, brad, reignite, rom...",0
4984,"[sarah, hyland, modern, candid, hospital, gown...",0
4985,"[production, suspended, sixth, netflix, cards,...",0


In [72]:
Counter(df2.loc[0])

TypeError: unhashable type: 'list'

In [39]:
# 然後要將每個文本都變成str的形式，所以再做個轉換
for i in range(len(df2)):
    str0 = ' '.join(df2.iloc[i, 0])
    df2.iloc[i, 0] = str0

df2

Unnamed: 0,text,label
0,newsletter truly losing blake shelton exceptio...,1
1,2d conan funeral invited conan tbs,1
2,safe surpassed competitor snapchat popularity ...,0
3,certain amazon goddess lasso heights patty jen...,0
4,outfit click demand trendy clothing problem ea...,0
...,...,...
4982,storybook romance wwe cena nikki bella fell ap...,0
4983,responsible encouraging brad reignite romance ...,0
4984,sarah hyland modern candid hospital gown honor...,0
4985,production suspended sixth netflix cards amid ...,0


In [40]:
corpus = []
for i in range(len(df2)):
    corpus.append(df2.iloc[i, 0])

corpus

["newsletter truly losing blake shelton exception older richie died nov. 1990. shelton noted tweet `` changed forever '' richie died accident sheltons oklahoma shelton sent message 25th anniversary loss richie blake half-brother passenger collided bus ada south oklahoma richie driver redena mcmanus 3-year-old christopher mcmanus died shortly collision bus driver passengers uninjured police accident clearly remained blake 60 minutes `` remember picking picking constantly shock '' blake shelton playing halloween extravaganza oct. 31. 2011 blake then-wife miranda lambert `` '' inspired richie brothers bonded difference `` bedroom across hallway mine '' blake `` listening hank williams jr. waylon lynyrd skynyrd bob seeger whatever popular richie `` sitting 'man hero coolest '' follow randee dawn",
 '2d conan funeral invited conan tbs',
 'safe surpassed competitor snapchat popularity inception ago—and celebrities hopped trend unlike highly curated feed celebrities seem comfortable raw waiti

In [42]:
#将文本中的token轉換成詞頻矩陣
vectorizer = CountVectorizer()

In [44]:
X = vectorizer.fit_transform(corpus)
X

<4987x63904 sparse matrix of type '<class 'numpy.int64'>'
	with 761638 stored elements in Compressed Sparse Row format>

In [48]:
transformer = TfidfTransformer()

TfidfTransformer()


In [49]:
tfidf = transformer.fit_transform(X)

In [55]:
A = tfidf.toarray()

In [56]:
A.shape

(4987, 63904)

In [58]:
df3 = pd.DataFrame(A)
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63894,63895,63896,63897,63898,63899,63900,63901,63902,63903
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4982,0.0,0.007038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4983,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4984,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4985,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
stop_words = set(stopwords.words('english')) 
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [23]:
len(stop_words)

179