In [None]:
# 日本語のAmazonのレビュー（の一部）をダウンロード・解凍します
!mkdir data
!wget https://github.com/tealgreen0503/introduction_to_nlp/raw/master/data/amazon_reviews_multilingual_JP_v1_00_20000.tsv.gz -P data/
!gunzip -d ./data/amazon_reviews_multilingual_JP_v1_00_20000.tsv.gz

--2020-05-16 09:04:29--  https://github.com/tealgreen0503/introduction_to_nlp/raw/master/data/amazon_reviews_multilingual_JP_v1_00_20000.tsv.gz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tealgreen0503/introduction_to_nlp/master/data/amazon_reviews_multilingual_JP_v1_00_20000.tsv.gz [following]
--2020-05-16 09:04:29--  https://raw.githubusercontent.com/tealgreen0503/introduction_to_nlp/master/data/amazon_reviews_multilingual_JP_v1_00_20000.tsv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7451914 (7.1M) [application/octet-stream]
Saving to: ‘data/amazon_reviews_multilingual_JP_v1_00_20000.ts

In [None]:
#!sudo sed -i -e 's|disco|eoan|g' /etc/apt/sources.list
#!sudo apt update

In [None]:
!apt-get install mecab
!apt-get install libmecab-dev
!apt-get install mecab-ipadic-utf8
!apt-get -q -y install swig
# ここまでMeCabのインストール
#!apt-get install file
#!git clone https://github.com/neologd/mecab-ipadic-neologd.git
#!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n
# ここまでNEologdのインストール
!pip install mecab-python3
# MeCabをPythonで使うためのツールのインストール

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libmecab2 mecab-jumandic mecab-jumandic-utf8 mecab-utils
The following NEW packages will be installed:
  libmecab2 mecab mecab-jumandic mecab-jumandic-utf8 mecab-utils
0 upgraded, 5 newly installed, 0 to remove and 31 not upgraded.
Need to get 16.5 MB of archives.
After this operation, 219 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libmecab2 amd64 0.996-5 [257 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 mecab-utils amd64 0.996-5 [4,856 B]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 mecab-jumandic-utf8 all 7.0-20130310-4 [16.2 MB]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 mecab-jumandic all 7.0-20130310-4 [2,212 B]
Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 mecab amd64 0.996-5 [132 kB]
Fetched 16.5 MB in 1

In [None]:
import re
import urllib
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup

import MeCab

# 前処理関数のリスト
# ここから好きなものを使ってみましょう

# HTML removal
def clean_html(html, strip=True):
    """Return the text content of ``html`` with the markup removed.

    The input is parsed with BeautifulSoup's ``"html.parser"`` backend and
    the result of ``get_text`` is returned.

    Args:
        html: Markup to parse.
        strip: Forwarded unchanged as the ``strip`` argument of ``get_text``.

    Returns:
        Whatever ``BeautifulSoup.get_text`` returns for the parsed input.
    """
    return BeautifulSoup(html, "html.parser").get_text(strip=strip)

# Word segmentation
class Tokenizer:
    """Callable wrapper around a ``MeCab.Tagger`` created with ``-Owakati``.

    Calling an instance parses the given text with the tagger and returns
    the parser output split on whitespace.
    """

    def __init__(self, neologd=False):
        """Create the underlying tagger.

        Args:
            neologd: When true, the tagger is additionally pointed (``-d``) at
                the hard-coded mecab-ipadic-neologd dictionary directory;
                otherwise only ``-Owakati`` is passed.
        """
        tagger_options = (
            "-Owakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
            if neologd
            else "-Owakati"
        )
        self.mecab_tagger = MeCab.Tagger(tagger_options)

    def __call__(self, text):
        """Parse ``text`` and return the whitespace-separated pieces as a list."""
        # str.split() with no arguments already discards leading/trailing
        # whitespace, so no separate strip() is required.
        return self.mecab_tagger.parse(text).split()

# Number normalization
def normalize_number(text, reduce=False):
    """Replace digits in ``text`` with ``"0"``.

    Args:
        text: String to normalize.
        reduce: When true, each run of consecutive digits collapses to a
            single ``"0"``; otherwise every digit is replaced one-for-one.

    Returns:
        The normalized string.
    """
    digit_pattern = r"\d+" if reduce else r"\d"
    return re.sub(digit_pattern, "0", text)

# Stop-word removal
def remove_stopwords(text, stopwords):
    """Return the items of ``text`` that are not contained in ``stopwords``.

    Args:
        text: Iterable of tokens (the notebook passes a list of words).
        stopwords: Container supporting ``in`` that holds the words to drop.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept_tokens = []
    for token in text:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Lower-casing
def lower_case(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

In [None]:
def load_dataset(filename, n=10000):
    """Read a tab-separated review file and return texts and labels.

    Args:
        filename: Path of the TSV file; it must contain the columns
            ``review_body`` and ``binay_star_rating``.
        n: Number of leading rows to keep.

    Returns:
        A tuple ``(review_body values, binay_star_rating values)`` taken
        from the first ``n`` rows.
    """
    frame = pd.read_csv(filename, sep='\t')
    head = frame[["review_body", "binay_star_rating"]].iloc[:n]
    bodies = head["review_body"].values
    labels = head["binay_star_rating"].values
    return bodies, labels

In [None]:
def train_and_eval(x_train, y_train, x_test, y_test, x_data=None):
    '''
    Train a logistic-regression classifier and print its test accuracy.

    How scikit-learn is used here:
    1. Instantiate the estimator            # clf = LogisticRegression(solver='liblinear')
    2. Train by calling fit(x, y)           # clf.fit(x_train, y_train)
    3. Predict on the test data             # y_pred = clf.predict(x_test)
    4. Compute the score                    # score = accuracy_score(y_test, y_pred)

    Args:
        x_train: Training features passed to ``clf.fit``.
        y_train: Training labels passed to ``clf.fit``.
        x_test: Test features passed to ``clf.predict``.
        y_test: True test labels compared against the predictions.
        x_data: Optional extra features. When not ``None``, predictions for
            them are printed too (presumably they must be vectorized the
            same way as ``x_train`` -- confirm with the caller).

    Returns:
        None. The accuracy is printed with four decimal places.
    '''

    # Use the arguments, not same-named notebook globals: the function must
    # evaluate exactly the data it was handed, and must work on a fresh kernel.
    clf = LogisticRegression(solver='liblinear', C=1.0)  # try changing the value of C
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    print('{:.4f}'.format(score))

    if x_data is not None:
        y_pred = clf.predict(x_data)
        print("prediction:", y_pred)

In [None]:
# `import urllib` alone does not guarantee that the `urllib.request`
# submodule is loaded, so import it explicitly before using it.
import urllib.request

# Instantiate the Tokenizer.
tokenize = Tokenizer()

# Build the stop-word set from the SlothLib Japanese stop-word list.
slothlib_path = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
# The context manager closes the HTTP response once every line has been read.
with urllib.request.urlopen(slothlib_path) as slothlib_file:
    slothlib_stopwords = [line.decode("utf-8").strip() for line in slothlib_file]
# Drop blank lines and deduplicate in one step.
slothlib_stopwords = {word for word in slothlib_stopwords if word}

In [None]:
filename = "./data/amazon_reviews_multilingual_JP_v1_00_20000.tsv"
x, y = load_dataset(filename, n=20000)
x = [clean_html(text, strip=True) for text in x]  # remove HTML tags
#x = [normalize_number(text) for text in x]
x = [tokenize(text) for text in x]                # word segmentation (output: list)
# Try inserting some preprocessing of your own here:
# stop-word removal, lower-casing, number normalization, etc.
#x = [remove_stopwords(text, slothlib_stopwords) for text in x]
x = [" ".join(text) for text in x]

# Hold out part of the data for evaluation. The vectorizer cells below read
# x_train / x_test / y_train / y_test, which no other cell defines, so the
# notebook cannot run top-to-bottom without this split.
# NOTE(review): test_size=0.2 and random_state=42 are chosen here for a
# reproducible split; the accuracies recorded in the saved outputs may come
# from a different split -- confirm against the original run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Experiment: CountVectorizer features with binary=True.
print('Binary')
vectorizer = CountVectorizer(binary=True)
# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)
train_and_eval(
    x_train=x_train_vec,
    y_train=y_train,
    x_test=x_test_vec,
    y_test=y_test,
)

Binary
0.8273


In [None]:
# Experiment: CountVectorizer features with binary=False.
print('Count')
vectorizer = CountVectorizer(binary=False)
# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)
train_and_eval(
    x_train=x_train_vec,
    y_train=y_train,
    x_test=x_test_vec,
    y_test=y_test,
)

Count
0.8300


In [None]:
# Experiment: TfidfVectorizer features with default settings.
print('TF-IDF')
vectorizer = TfidfVectorizer()
# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)
train_and_eval(
    x_train=x_train_vec,
    y_train=y_train,
    x_test=x_test_vec,
    y_test=y_test,
)

TF-IDF
0.8390


In [None]:
# Experiment: TfidfVectorizer features with ngram_range=(1, 2).
print('Bigram')
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)
train_and_eval(
    x_train=x_train_vec,
    y_train=y_train,
    x_test=x_test_vec,
    y_test=y_test,
)

Bigram
0.8383


In [None]:
# できたら自分で作った文章がどちらに分類されるか試してみましょう（難しめ）