### 前処理の実践

In [None]:
#データセットの準備
import string
import pandas as pd
# Printable ASCII characters, built once instead of on every call.
_PRINTABLE_ASCII = frozenset(string.printable)


def filter_by_ascii_rate(text, threshold=0.9):
    """Return True when *text* contains few enough printable-ASCII characters.

    The share of characters in *text* that belong to ``string.printable`` is
    computed and compared with *threshold*. A low share means the text is
    mostly non-ASCII (the caller uses this to keep Japanese reviews).

    Args:
        text: The string to inspect.
        threshold: Largest printable-ASCII share (0.0-1.0) still accepted.

    Returns:
        True if the printable-ASCII share is less than or equal to
        *threshold*. False otherwise, and also False for empty or
        non-string input.
    """
    # An empty string would divide by zero, and a non-string cell (for
    # example a NaN standing in for a missing value) cannot be iterated;
    # neither can be classified, so reject them.
    if not isinstance(text, str) or not text:
        return False
    rate = sum(c in _PRINTABLE_ASCII for c in text) / len(text)
    return rate <= threshold

def load_dataset(filename, n=5000, state=6):
    """Load a tab-separated review file and sample reviews per star rating.

    Args:
        filename: Path or URL handed to ``pandas.read_csv``.
        n: Maximum number of rows kept for each ``star_rating`` value.
        state: Random seed used when shuffling the rows.

    Returns:
        A tuple ``(review_body values, star_rating values)`` for the
        sampled rows.
    """
    reviews = pd.read_csv(filename, sep='\t')

    # Keep only the rows whose review body passes filter_by_ascii_rate
    # (i.e. texts with a low share of printable-ASCII characters).
    japanese_mask = reviews.review_body.apply(filter_by_ascii_rate)
    japanese_reviews = reviews[japanese_mask]

    # Shuffle all rows, then take the first n rows of every star rating.
    shuffled = japanese_reviews.sample(frac=1, random_state=state)
    sampled = shuffled.groupby('star_rating').head(n=n)
    return sampled.review_body.values, sampled.star_rating.values


# Location of the gzip-compressed TSV of Japanese Amazon reviews.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz'
# x holds the review bodies and y the star ratings (up to 1000 per rating).
x, y = load_dataset(url, n=1000) # the URL can be passed directly to the loader

In [None]:
!pip install janome beautifulsoup4

Collecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
     |████████████████████████████████| 19.7 MB 1.3 MB/s
Installing collected packages: janome
Successfully installed janome-0.4.1


In [9]:
#前処理
from janome.tokenizer import Tokenizer
import re
from  bs4 import BeautifulSoup
# Single Janome tokenizer instance shared by the tokenization helpers below.
t = Tokenizer()

def tokenize(text):
    """Split *text* into tokens using the shared Janome tokenizer ``t``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the tokens produced by Janome in ``wakati`` mode.
    """
    # Janome 0.4+ hands back a one-shot generator here; materialize it so
    # the result can be iterated more than once and has the same type as
    # the list returned by tokenize_base_form.
    return list(t.tokenize(text, wakati=True))

def clean_html(html, strip=False):
    """Return the text content of *html* with all markup removed.

    Args:
        html: Markup string parsed with BeautifulSoup's ``html.parser``.
        strip: Forwarded to ``get_text(strip=...)``.

    Returns:
        The extracted text.
    """
    return BeautifulSoup(html, 'html.parser').get_text(strip=strip)
# Return the base (dictionary) form of every word.
def tokenize_base_form(text):
    """Tokenize *text* with the shared tokenizer and list each token's base form.

    Args:
        text: The string to tokenize.

    Returns:
        A list containing the ``base_form`` attribute of every token.
    """
    return [morpheme.base_form for morpheme in t.tokenize(text)]
# Replace every digit with 0.
def normalize_number(text, reduce=False):
    """Normalize the digits in *text* to ``0``.

    Args:
        text: The string to normalize.
        reduce: When True, each run of consecutive digits collapses into a
            single ``0``; otherwise every digit is replaced one-for-one.

    Returns:
        The normalized string.
    """
    digit_pattern = r'\d+' if reduce else r'\d'
    return re.sub(digit_pattern, '0', text)

def remove_url(html):
    """Strip hyperlink markup from *html* while keeping the link contents.

    Every ``<a>`` element is replaced by its own children, so the anchor
    tag itself (including its attributes) disappears but whatever it
    wrapped stays in the document.

    Args:
        html: Markup string parsed with BeautifulSoup's ``html.parser``.

    Returns:
        The modified document serialized back to a string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all / unwrap are the Beautiful Soup 4 names of the deprecated
    # findAll / replaceWithChildren aliases.
    for anchor in soup.find_all('a'):
        anchor.unwrap()
    return str(soup)

In [1]:
from janome.tokenizer import Tokenizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ModuleNotFoundError: ignored

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Training and evaluation helper.
def train_and_eval(x_train, y_train, x_test, y_test,
                   lowercase=False, tokenize=None, preprocessor=None):
    """Train a count-vector logistic-regression model and print its accuracy.

    The texts are vectorized with ``CountVectorizer`` (fitted on the
    training texts only), a ``LogisticRegression`` classifier is fitted on
    the training vectors, and the accuracy on the test set is printed with
    four decimal places.

    Args:
        x_train: Training texts.
        y_train: Training labels.
        x_test: Test texts.
        y_test: Test labels.
        lowercase: Forwarded to ``CountVectorizer(lowercase=...)``.
        tokenize: Callable forwarded as ``CountVectorizer(tokenizer=...)``.
        preprocessor: Callable forwarded as
            ``CountVectorizer(preprocessor=...)``.
    """
    bow = CountVectorizer(
        lowercase=lowercase,  # when True, everything is counted in lowercase
        tokenizer=tokenize,
        preprocessor=preprocessor,
    )
    train_features = bow.fit_transform(x_train)
    test_features = bow.transform(x_test)

    model = LogisticRegression(solver='liblinear')
    model.fit(train_features, y_train)

    accuracy = accuracy_score(y_test, model.predict(test_features))
    print('{:.4f}'.format(accuracy))

In [11]:
# Prepare the data: hold out 20% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Baseline: tokenization only.
train_and_eval(x_train, y_train, x_test, y_test, tokenize=tokenize)

0.4020




In [12]:
# Same evaluation with HTML removed (clean_html passed as the preprocessor).
train_and_eval(x_train, y_train, x_test, y_test, tokenize=tokenize, preprocessor=clean_html)

0.4090


