In [1]:
'''
Amazonに投稿された映画のレビュー(英語)を分析し、レビューがPositiveかNegativeかの判別を行う

Training_data (positive用)、文章数 : 700
Training_data (negative用)、文章数 : 700
Test_data (positive用)、文章数 : 3
Test_data (negative用)、文章数 : 3
（ ※学習用データ：1400、　テスト用データ：6、　合計 : 1406 の文章です ）

'''
from pathlib import Path
import re

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load Training_data
datapath = Path('Training_data')
review_pattern = re.compile(r'cv\d+')  # matches file names that start with "cv" followed by digits
test_size = 0.25
split_seed = 539167  # fixed seed so the train/verify partition is identical on every run

data_orig  = dict(neg=[], pos=[])
data_train = dict(neg=[], pos=[])    # training data
data_verify  = dict(neg=[], pos=[])    # verification data

# Read every review into data_orig, then split each class into train/verify
for cls, review in data_orig.items():
    # iterdir() yields entries in arbitrary order; sort so that, together with
    # the fixed seed, the same files always land in the same split.
    for path in sorted((datapath / cls).iterdir()):
        if review_pattern.match(path.name):
            with open(path, 'r', encoding='latin') as src:
                review.append(src.read())
    print(f"{cls:>8}: loaded {len(review)} reviews.")

    data_train[cls], data_verify[cls] = train_test_split(
        review, test_size=test_size, random_state=split_seed
    )

     neg: loaded 700 reviews.
     pos: loaded 700 reviews.


In [3]:
# Load Test_data
datapath_test = Path('Test_data')
review_test_pattern = re.compile(r'amazon_review_\d+')  # matches file names that start with "amazon_review_" followed by digits
data_test  = dict(neg=[], pos=[])    # test data

# Read every matching file into data_test
for cls, review in data_test.items():
    # iterdir() yields entries in arbitrary order; sort so data_test[cls][i]
    # refers to the same file on every run.
    for path in sorted((datapath_test / cls).iterdir()):
        if review_test_pattern.match(path.name):
            with open(path, 'r', encoding='latin') as src:
                review.append(src.read())
    print(f"{cls:>8}: loaded {len(review)} reviews.")

     neg: loaded 3 reviews.
     pos: loaded 3 reviews.


In [4]:
# Define get_values_and_targets, a helper that reshapes the per-class data
def get_values_and_targets(data):
    """Flatten a per-class mapping into a document list and a label array.

    Parameters
    ----------
    data : mapping with keys ``'neg'`` and ``'pos'``, each holding a list.

    Returns
    -------
    values : list
        All ``'neg'`` items followed by all ``'pos'`` items.
    target : numpy.ndarray of bool
        One label per item in ``values``: ``True`` for items taken from
        ``'neg'``, ``False`` for items taken from ``'pos'``.
    """
    values = data['neg'] + data['pos']
    target = [True]*len(data['neg']) + [False]*len(data['pos'])
    # Force bool: without an explicit dtype an empty label list becomes float64.
    target = np.array(target, dtype=bool)
    return values, target

# Flatten the training split into a document list and its label array
values_train, target_train = get_values_and_targets(data=data_train)

In [5]:
# Show the content of a loaded file (test data, negative, first one)
print(data_test['neg'][0])

The Bad: Disregard for plot points from the lead up movies, including the first Avengers (mentioned, but stripped of all layers). Characters having sudden new and unexplained personalities (bound to happen when they're juggling so damned many!). CGI is a wonderful tool, but when you see Spider-Man's head floating above his CGI body (no he did not get decapitated), it pulls you out of the narrative. The focus of the film was on too many characters who were not relevant to the central plot, maybe the excuse was to have an epic Final Battle scene in Wakanda (SPOILER: Some of those dead characters are needed for a Black Panther sequel, further proving my point).

The Ugly: One character death after another, with so many dying that they often skip to the aftermath or just state that Thanos killed them, it really pulls at the heartstrings... Except, as this review title implies, the movie has zero lasting effects. They wasted no time in throwing out all credibility (SPOILER: Thor caused the 

In [6]:
# Specify the preprocessing and build the feature vector
def nullify_symbols(text):
    """Return ``text`` with every listed punctuation mark, symbol and digit replaced by a space."""
    symbols = ".,:;!?-+*/=()[]{}<>~^#$@%&'\"_0123456789"
    # One translation table maps each listed character to a single space.
    blank_out = str.maketrans(symbols, ' ' * len(symbols))
    return text.translate(blank_out)
def format_words(words, min_length=3):
    """Lower-case the words whose original length is at least ``min_length``; drop the rest."""
    kept = []
    for token in words:
        # The length test is applied to the word as given, before lower-casing.
        if len(token) < min_length:
            continue
        kept.append(token.lower())
    return kept
# Hand-rolled tokenisation of the first negative training review; this is not
# the cell's last expression, so the value is computed but never shown.
first_neg_review = data_train['neg'][0]
format_words(nullify_symbols(first_neg_review).split())

# Learn a vocabulary of alphabetic tokens (3+ letters) from that single review
vocab = CountVectorizer(token_pattern=r'[a-zA-Z]{3,}')
vocab.fit([first_neg_review])
# Last expression: the review's term counts as a sparse row
vocab.transform([first_neg_review])

<1x274 sparse matrix of type '<class 'numpy.int64'>'
	with 274 stored elements in Compressed Sparse Row format>

In [7]:
# Show the feature data with print()
print(vocab.transform([data_train['neg'][0]]))

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	2
  (0, 5)	1
  (0, 6)	3
  (0, 7)	1
  (0, 8)	1
  (0, 9)	2
  (0, 10)	2
  (0, 11)	14
  (0, 12)	1
  (0, 13)	1
  (0, 14)	7
  (0, 15)	1
  (0, 16)	1
  (0, 17)	1
  (0, 18)	1
  (0, 19)	1
  (0, 20)	2
  (0, 21)	1
  (0, 22)	2
  (0, 23)	1
  (0, 24)	1
  :	:
  (0, 249)	1
  (0, 250)	1
  (0, 251)	1
  (0, 252)	1
  (0, 253)	1
  (0, 254)	1
  (0, 255)	1
  (0, 256)	1
  (0, 257)	1
  (0, 258)	1
  (0, 259)	4
  (0, 260)	1
  (0, 261)	1
  (0, 262)	3
  (0, 263)	1
  (0, 264)	1
  (0, 265)	1
  (0, 266)	4
  (0, 267)	1
  (0, 268)	1
  (0, 269)	1
  (0, 270)	1
  (0, 271)	1
  (0, 272)	1
  (0, 273)	5


In [8]:
# Build binary (presence/absence) features for the whole training set
vocab    = CountVectorizer(binary=True, token_pattern=r'[a-zA-Z]{3,}')
features = vocab.fit_transform(values_train)
# Show the features in array form. Only the first few rows are densified:
# toarray() on the full sparse matrix allocates rows x vocabulary cells just
# to print a truncated repr.
features[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
# Check the shape of the feature matrix
features.shape

(1050, 30982)

In [10]:
# Fit a logistic-regression model on the training features.
# LogisticRegression is already imported in the notebook's import cell, so it is
# not re-imported here.
model = LogisticRegression(solver='saga', max_iter=1000, random_state=539167).fit(features, target_train)

In [11]:
# Run get_values_and_targets on data_verify
values_verify, target_verify = get_values_and_targets(data_verify)

# Vectorise once with the already-fitted vocabulary, then let the model classify
features_verify = vocab.transform(values_verify)
pred_verify     = model.predict(features_verify)

In [12]:
# Check the accuracy on the verification split
validation = np.equal(pred_verify, target_verify)   # element-wise hit/miss mask
correct, size = np.count_nonzero(validation), validation.size
print(f"{correct}/{size} correct ({correct*100/size:.3f}%)")

283/350 correct (80.857%)


In [13]:
# Check the accuracy on data_test for each category
for cls in ('neg', 'pos'):
    _docs     = data_test[cls]
    # Expected label for every document of this class: True for 'neg', False for 'pos'
    _expected = [cls == 'neg'] * len(_docs)
    _pred     = model.predict(vocab.transform(_docs))
    _valid    = (_pred == _expected)
    _correct, _size = np.count_nonzero(_valid), _valid.size
    print(f"{cls:>8s}: {_correct:>3d}/{_size:>3d} correct ({_correct*100/_size:.3f}%)")

     neg:   3/  3 correct (100.000%)
     pos:   3/  3 correct (100.000%)
