<a href="https://colab.research.google.com/github/yujiimt/NLP/blob/master/book/NLP_prepro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz -P data/
!gunzip -d data/amazon_reviews_multilingual_JP_v1_00.tsv.gz

--2020-04-28 05:58:58--  https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.85.77
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.85.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94688992 (90M) [application/x-gzip]
Saving to: ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz.1’


2020-04-28 05:58:59 (79.6 MB/s) - ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz.1’ saved [94688992/94688992]



In [0]:
import os.path, urllib.request as request

In [0]:
import gzip
import shutil

# Pure-Python download of the review corpus.
# NOTE(review): this writes to the current directory, while main() reads
# '/content/data/...' -- confirm which copy is meant to be used.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz"
localfile = "amazon_reviews_multilingual_JP_v1_00.tsv"

if not os.path.exists(localfile):
  print("ファイルをダウンロード")
  # The URL serves a gzip archive: store it under a .gz name first, then
  # decompress it, so that `localfile` really contains tab-separated text.
  archive = localfile + ".gz"
  request.urlretrieve(url, archive)
  with gzip.open(archive, "rb") as src, open(localfile, "wb") as dst:
    shutil.copyfileobj(src, dst)
  os.remove(archive)

ファイルをダウンロード


In [0]:
import string
import pandas as pd

# Printable ASCII characters, built once instead of on every call.
_PRINTABLE_ASCII = frozenset(string.printable)

def filter_by_ascii_rate(text, threshold = 0.9):
  """Tell whether `text` is mostly non-ASCII (used here to keep Japanese reviews).

  Args:
    text: The string to inspect.
    threshold: Maximum allowed fraction of printable-ASCII characters.

  Returns:
    True when the fraction of characters of `text` found in
    `string.printable` is less than or equal to `threshold`.
    False for an empty string or a non-string value (e.g. NaN or None),
    which cannot be classified and previously raised an exception.
  """
  # Guard against missing values and against division by zero on "".
  if not isinstance(text, str) or not text:
    return False
  ascii_count = sum(c in _PRINTABLE_ASCII for c in text)
  return ascii_count / len(text) <= threshold

def load_dataset(filename, n=5000, state=6):
  """Load the review TSV and return a per-rating sample of Japanese reviews.

  Args:
    filename: Path to the tab-separated review file. It must provide the
      `review_body` and `star_rating` columns.
    n: Maximum number of reviews kept for each `star_rating` value.
    state: Random seed used to shuffle the rows before sampling.

  Returns:
    A tuple `(review_body values, star_rating values)` of arrays with
    matching order.
  """
  df = pd.read_csv(filename, sep = "\t")

  # Missing review bodies are loaded as NaN rather than str; drop them so
  # that only real text reaches the character-based filter below.
  df = df.dropna(subset=["review_body"])

  # Keep only the rows whose text passes filter_by_ascii_rate.
  # astype(bool) keeps the mask boolean even when no rows are left.
  is_jp = df.review_body.apply(filter_by_ascii_rate).astype(bool)
  df = df[is_jp]

  # Shuffle everything, then take the first n rows of every rating group.
  df = df.sample(frac=1, random_state = state)
  df = df.groupby("star_rating").head(n=n)
  return df.review_body.values, df.star_rating.values

In [0]:
!pip install janome

import re

from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer



In [0]:
# Module-level Janome tokenizer instance, created once and shared by the
# tokenize helpers defined in this cell.
t = Tokenizer()

def clean_html(html, strip = False):
  """Return the text content of `html`, parsed with 'html.parser'.

  `strip` is forwarded unchanged to BeautifulSoup's `get_text`.
  """
  return BeautifulSoup(html, 'html.parser').get_text(strip=strip)

def tokenize(text):
  """Split `text` into surface-form tokens with the shared Janome tokenizer.

  Args:
    text: The string to tokenize.

  Returns:
    A list of the tokens produced by `t.tokenize(text, wakati=True)`.
  """
  # Depending on the installed Janome release, tokenize() may return a lazy
  # generator instead of a list. Materialize it so callers always get a
  # reusable list, consistent with tokenize_base_form.
  return list(t.tokenize(text, wakati = True))

def tokenize_base_form(text):
  """Tokenize `text` and return the list of each token's `base_form`."""
  base_forms = []
  for token in t.tokenize(text):
    base_forms.append(token.base_form)
  return base_forms

def normalize_number(text, reduce = False):
  """Replace digits in `text` with '0'.

  With `reduce` false every single digit becomes '0', so the length of a
  number is kept; with `reduce` true each whole run of digits collapses
  into one '0'.
  """
  pattern = r'\d+' if reduce else r'\d'
  return re.sub(pattern, '0', text)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def train_and_eval(x_train, y_train, x_test, y_test, lowercase = False, tokenize = None, preprocessor = None):
  """Train a bag-of-words logistic regression and report its test accuracy.

  Args:
    x_train, y_train: Training texts and their labels.
    x_test, y_test: Held-out texts and their labels.
    lowercase: Passed to `CountVectorizer(lowercase=...)`.
    tokenize: Callable passed to `CountVectorizer(tokenizer=...)`.
    preprocessor: Callable passed to `CountVectorizer(preprocessor=...)`.

  Returns:
    The accuracy on the test split. It is also printed with four decimals,
    as before; returning it lets callers collect and compare the scores.
  """
  vectorizer = CountVectorizer(lowercase = lowercase,
                               tokenizer = tokenize,
                               preprocessor = preprocessor)
  # Fit the vocabulary on the training texts only, then reuse it for the test texts.
  x_train_vec = vectorizer.fit_transform(x_train)
  x_test_vec = vectorizer.transform(x_test)

  clf = LogisticRegression(solver = 'liblinear')
  clf.fit(x_train_vec, y_train)

  y_pred = clf.predict(x_test_vec)
  score = accuracy_score(y_test, y_pred)
  print('{:.4f}'.format(score))
  return score

In [0]:
from sklearn.model_selection import train_test_split

def main(filename = '/content/data/amazon_reviews_multilingual_JP_v1_00.tsv'):
  """Compare the effect of several preprocessing steps on classification accuracy.

  Args:
    filename: Path of the review TSV to load. The default is the location
      previously hard-coded here; pass another path to run elsewhere.

  Loads up to 1000 reviews per star rating, holds out 20% as a test split,
  and for each preprocessing variant prints its title followed by the
  accuracy reported by `train_and_eval`.
  """
  x,y = load_dataset(filename, n = 1000)
  x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

  # (title, keyword arguments for train_and_eval), run in this order.
  experiments = [
    ('Tokenization only', dict(tokenize = tokenize)),
    ('Clean html', dict(tokenize = tokenize, preprocessor = clean_html)),
    ('Normalize number', dict(tokenize = tokenize, preprocessor = normalize_number)),
    ('Base form', dict(tokenize = tokenize_base_form)),
    ('Lower text', dict(tokenize = tokenize, lowercase = True)),
  ]
  for title, options in experiments:
    print(title)
    train_and_eval(x_train, y_train, x_test, y_test, **options)


# Run the experiments when this code is executed as the main module.
if __name__ == '__main__':
    main()


Tokenization only




0.4020
Clean html
0.4090
Normalize number
0.3940
Base form
0.3930
Lower text
0.3980
