# 第8章 機械学習の適用1 - 感情分析



In [5]:
# Unpack the movie-review archive into the current working directory.
import tarfile

with tarfile.open('datasets/aclImdb_v1.tar.gz', 'r:gz') as tar:
    # Extraction filters make tarfile refuse members that would land outside
    # the destination (absolute paths, '..' components, unsafe links).
    # Feature-detect instead of checking the Python version, because the
    # filter API was also back-ported to security releases.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        # Interpreter without filter support: fall back to the legacy call.
        tar.extractall()
    print('Extracted')

Extracted


In [37]:
# Parse data and put into dataframe
import pyprind
import pandas as pd 
import os

# Root directory of the extracted dataset.
# NOTE(review): the aclImdb_v1 archive is expected to unpack into 'aclImdb'
# (mixed case); the previous lowercase spelling only resolves on
# case-insensitive filesystems. Confirm against the extracted tree.
basepath = 'aclImdb'

# Sub-directory name -> integer class label.
labels = {'pos': 1, 'neg': 0}

# Progress bar sized for the 50000 iterations the loop is expected to make.
pbar = pyprind.ProgBar(50000)

# Collect [text, label] rows in a plain list and build the frame once at the
# end. DataFrame.append was removed in pandas 2.0, and calling it per row
# copied the whole frame on every iteration.
rows = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_name)
        for fname in os.listdir(path):
            with open(os.path.join(path, fname), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[label_name]])
            pbar.update()

# Columns are left as the default integer labels 0 and 1, exactly as before.
df = pd.DataFrame(rows)



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:11


In [10]:
# Give the two columns descriptive names, then preview the first five rows.
df.columns = pd.Index(['review', 'sentiment'])
df.head(5)

Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [36]:
import numpy as np

# Seed NumPy's global random generator so the shuffle below is repeatable.
np.random.seed(0)

# np.random.permutation returns a shuffled copy of the array it is given.
# https://kaisk.hatenadiary.com/entry/2014/10/30/170522
#
# Per the original note, df.index at this point is
# RangeIndex(start=0, stop=50000, step=1).
df = df.reindex(index=np.random.permutation(df.index))

# Write the shuffled rows to CSV without the index column.
df.to_csv(
    'datasets/movie_data.csv',
    encoding='utf-8',
    index=False,
)



In [35]:
# Small experiment: reindexing a frame by a permutation of its own index
# returns the same rows in shuffled order.
import numpy as np

df_temp = pd.DataFrame(data=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]])
df_temp.reindex(index=np.random.permutation(df_temp.index))


Unnamed: 0,0,1,2,3,4
1,5,4,3,2,1
0,1,2,3,4,5


## BoW モデル

1. 文書の集合全体から、例えば単語という一意なトークンからなる語彙を作成する
2. 各文書での各単語の出現回数を含んだ特徴ベクトルを構築する


In [38]:
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings; fitting it builds the vocabulary.
count = CountVectorizer()

# Three toy documents used to illustrate the bag-of-words representation.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary and produce the sparse document-term count matrix.
bag = count.fit_transform(docs)

# First the term -> column-index mapping, then the dense count matrix.
print(count.vocabulary_, bag.toarray(), sep='\n')


{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [39]:
import numpy as np 
# Show NumPy floats with two digits of precision when arrays are printed.
np.set_printoptions(precision=2)

In [41]:
from sklearn.feature_extraction.text import TfidfTransformer

# tf-idf transformer: idf weighting on, smoothed idf, l2 row normalisation.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

# Re-fit the count vectorizer on the toy documents, weight the raw counts,
# and print the result as a dense array.
term_counts = count.fit_transform(docs)
tfidf_matrix = tfidf.fit_transform(term_counts)
print(tfidf_matrix.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [42]:
# Manual check of the idf term for the word "is" in the toy corpus.
n_docs = 3  # number of documents in the corpus
tf_is = 3   # raw count of "is" in the third document

# The literal 3 in the denominator is the number of documents containing "is".
# Adding 1 to numerator and denominator presumably mirrors smooth_idf=True;
# confirm against the scikit-learn documentation.
idf_is = np.log((1 + n_docs) / (1 + 3))


In [4]:


import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the stem of every token.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one ``PorterStemmer.stem`` result per whitespace-separated
        token, in the original order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, text.split()))

# Fetch the NLTK stop-word corpus and load the English list.
nltk.download('stopwords')
stop = stopwords.words('english')

# Stem a sample sentence, keep at most its last ten tokens, and drop stop words.
# NOTE(review): 'likse' in the sample looks like a typo for 'likes'; it is left
# unchanged so the recorded output stays valid.
[
    token
    for token in tokenizer_porter('a runner likse running and runs a lot')[-10:]
    if token not in stop
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/s12723/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'liks', 'run', 'run', 'lot']