<a href="https://colab.research.google.com/github/tomoyaima/Atcoder/blob/master/04_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
# Colab line magic that selects the TensorFlow 2.x runtime.
# NOTE(review): none of the cells shown here import tensorflow — confirm this is still needed.
%tensorflow_version 2.x

In [2]:
# Install the third-party packages imported by the preprocessing cell:
# janome (Tokenizer) and beautifulsoup4 (BeautifulSoup).
!pip install janome beautifulsoup4

Collecting janome
[?25l  Downloading https://files.pythonhosted.org/packages/79/f0/bd7f90806132d7d9d642d418bdc3e870cfdff5947254ea3cab27480983a7/Janome-0.3.10-py2.py3-none-any.whl (21.5MB)
[K     |████████████████████████████████| 21.5MB 1.3MB/s 
Installing collected packages: janome
Successfully installed janome-0.3.10


In [3]:
# Download the gzipped review TSV into data/ and decompress it there.
# main() later reads the resulting data/amazon_reviews_multilingual_JP_v1_00.tsv.
!wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz -P data/
!gunzip -d data/amazon_reviews_multilingual_JP_v1_00.tsv.gz

--2020-08-14 07:48:25--  https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.245.190
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.245.190|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94688992 (90M) [application/x-gzip]
Saving to: ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’


2020-08-14 07:48:27 (58.2 MB/s) - ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’ saved [94688992/94688992]



# utils.py

In [4]:
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def filter_by_ascii_rate(text, threshold=0.9):
    """Tell whether ``text`` is mostly non-ASCII (used to pick Japanese reviews).

    The share of characters that belong to ``string.printable`` is computed
    and compared with ``threshold``.

    Args:
        text: The text to inspect. Values without a length (for example
            ``None`` or a float NaN standing in for a missing value) and
            empty text are rejected instead of raising.
        threshold: Maximum allowed share of printable-ASCII characters,
            inclusive.

    Returns:
        bool: True when the printable-ASCII share is ``<= threshold``;
        False otherwise, including for empty or unsized input.
    """
    # Unsized values (None, NaN, ...) used to raise TypeError here.
    try:
        length = len(text)
    except TypeError:
        return False

    # Empty text used to raise ZeroDivisionError; it carries no Japanese content.
    if length == 0:
        return False

    printable = set(string.printable)
    ascii_count = sum(c in printable for c in text)
    return ascii_count / length <= threshold


def load_dataset(filename, n=5000, state=6):
    """Load a tab-separated review file and sample reviews per star rating.

    Args:
        filename: Path of the TSV file; it must provide ``review_body`` and
            ``star_rating`` columns.
        n: Maximum number of rows kept for each ``star_rating`` value.
        state: Seed passed to the shuffle so the sample is repeatable.

    Returns:
        A pair ``(review_body values, star_rating values)`` taken from the
        sampled rows.
    """
    reviews = pd.read_csv(filename, sep='\t')

    # Keep only the rows whose body passes the ASCII-rate filter.
    japanese_mask = reviews.review_body.apply(filter_by_ascii_rate)
    japanese_reviews = reviews[japanese_mask]

    # Shuffle every row, then take the first ``n`` rows of each rating group.
    shuffled = japanese_reviews.sample(frac=1, random_state=state)
    sampled = shuffled.groupby('star_rating').head(n=n)

    bodies = sampled.review_body.values
    ratings = sampled.star_rating.values
    return bodies, ratings


def train_and_eval(x_train, y_train, x_test, y_test,
                   lowercase=False, tokenize=None, preprocessor=None):
    """Fit a bag-of-words logistic regression and print its test accuracy.

    Args:
        x_train, y_train: Training texts and their labels.
        x_test, y_test: Held-out texts and their labels.
        lowercase: Forwarded to ``CountVectorizer(lowercase=...)``.
        tokenize: Callable forwarded as the vectorizer's ``tokenizer``.
        preprocessor: Callable forwarded as the vectorizer's ``preprocessor``.

    The accuracy is printed with four decimal places; nothing is returned.
    """
    bow = CountVectorizer(tokenizer=tokenize,
                          preprocessor=preprocessor,
                          lowercase=lowercase)

    # The vocabulary is learned from the training texts only.
    train_features = bow.fit_transform(x_train)
    test_features = bow.transform(x_test)

    model = LogisticRegression(solver='liblinear')
    model.fit(train_features, y_train)

    predictions = model.predict(test_features)
    print('{:.4f}'.format(accuracy_score(y_test, predictions)))


# preprocessing.py

In [5]:
"""
Preprocessings.
"""
import re

from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
# Module-level janome tokenizer, created once and shared by tokenize()
# and tokenize_base_form() below.
t = Tokenizer()


def clean_html(html, strip=False):
    """Return the text content of ``html`` with the markup removed.

    Args:
        html: HTML source string, parsed with the ``html.parser`` backend.
        strip: Forwarded to ``BeautifulSoup.get_text(strip=...)``.

    Returns:
        The text extracted from the parsed document.
    """
    return BeautifulSoup(html, 'html.parser').get_text(strip=strip)


def tokenize(text):
    """Split ``text`` into surface-form tokens with the shared tokenizer.

    Args:
        text: The string to tokenize.

    Returns:
        list: The tokens produced by ``t.tokenize(text, wakati=True)``.
    """
    # Newer janome releases return a generator from tokenize(); materialise
    # it so callers always get a list they can slice, measure and reuse.
    return list(t.tokenize(text, wakati=True))


def tokenize_base_form(text):
    """Tokenize ``text`` and return each token's ``base_form``.

    Args:
        text: The string to tokenize with the shared tokenizer ``t``.

    Returns:
        list: The ``base_form`` attribute of every token, in order.
    """
    base_forms = []
    for morpheme in t.tokenize(text):
        base_forms.append(morpheme.base_form)
    return base_forms


def normalize_number(text, reduce=False):
    """Replace digits in ``text`` with ``'0'``.

    Args:
        text: The string to normalize.
        reduce: When true, each run of consecutive digits collapses to a
            single ``'0'``; otherwise every digit is replaced one-for-one.

    Returns:
        The normalized string.
    """
    digit_pattern = r'\d+' if reduce else r'\d'
    return re.sub(digit_pattern, '0', text)


def truncate(sequence, maxlen):
    """Return the leading part of ``sequence``, cut with ``sequence[:maxlen]``.

    Args:
        sequence: Any object that supports slicing.
        maxlen: Stop index of the slice.

    Returns:
        The sliced sequence.
    """
    head = slice(None, maxlen)
    return sequence[head]


def remove_url(html):
    """Unwrap every ``<a>`` element in ``html``, keeping the content it wraps.

    Only the anchor tag itself (and therefore its ``href``) is dropped; the
    children of each anchor stay in place in the document.

    Args:
        html: HTML source string, parsed with the ``html.parser`` backend.

    Returns:
        str: The document serialized back to a string via ``str(soup)``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all()/unwrap() are the current Beautiful Soup 4 names for the
    # deprecated findAll()/replaceWithChildren() aliases.
    for anchor in soup.find_all('a'):
        anchor.unwrap()
    return str(soup)


# train.py

In [8]:
from sklearn.model_selection import train_test_split


def main():
    """Compare preprocessing variants on the Japanese review dataset.

    Loads up to 1000 reviews per star rating, holds out 20% as a test set,
    then prints a label and the test accuracy for each preprocessing setup.
    """
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv', n=1000)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    # (label printed before the score, keyword options for train_and_eval)
    experiments = [
        # Tokenization only.
        ('Tokenization only.', dict(tokenize=tokenize)),
        # Tokenization + HTML cleaning.
        ('Clean html.', dict(tokenize=tokenize, preprocessor=clean_html)),
        # Tokenization + number normalization.
        ('Normalize number.', dict(tokenize=tokenize, preprocessor=normalize_number)),
        ('Base form.', dict(tokenize=tokenize_base_form)),
        ('Lower text.', dict(tokenize=tokenize, lowercase=True)),
        ('Base form & Clean', dict(tokenize=tokenize_base_form, preprocessor=clean_html)),
        ('Base form & Normalize number ', dict(tokenize=tokenize_base_form, preprocessor=normalize_number)),
    ]

    for label, options in experiments:
        print(label)
        train_and_eval(x_train, y_train, x_test, y_test, **options)


# Run the experiments only when this code executes as the top-level module.
if __name__ == '__main__':
    main()


Tokenization only.




0.4020
Clean html.
0.4090
Normalize number.
0.3940
Base form.
0.3930
Lower text.
0.3980
Base form & Clean
0.3940
Base form & Normalize number 
0.3930
