<a href="https://colab.research.google.com/github/yujiimt/NLP/blob/master/book/feature-estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Fetch the JP Amazon reviews archive into data/ and decompress it so later
# cells can read the .tsv file.
!wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz -P data/
!gunzip -d data/amazon_reviews_multilingual_JP_v1_00.tsv.gz

--2020-04-28 08:37:19--  https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.146.69
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.146.69|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94688992 (90M) [application/x-gzip]
Saving to: ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’


2020-04-28 08:37:22 (28.6 MB/s) - ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’ saved [94688992/94688992]



In [0]:
import string
import pandas as pd

# Characters counted as "ASCII" by filter_by_ascii_rate; built once rather than
# on every call, since the function is applied to each row of the dataset.
_PRINTABLE_ASCII = frozenset(string.printable)


def filter_by_ascii_rate(text, threshold = 0.9):
  """Tell whether `text` is mostly non-ASCII (used here to keep Japanese reviews).

  Args:
    text: The string to inspect. Missing values (None, NaN) and other
      non-string inputs are accepted and treated as "not Japanese".
    threshold: Maximum allowed fraction of printable-ASCII characters.

  Returns:
    True when the share of `string.printable` characters in `text` is less
    than or equal to `threshold`; False otherwise, including for empty or
    non-string input.
  """
  # An empty string would divide by zero and a NaN/None cell cannot be
  # iterated; neither can be Japanese text, so reject them up front.
  if not isinstance(text, str) or not text:
    return False
  ascii_count = sum(c in _PRINTABLE_ASCII for c in text)
  return ascii_count / len(text) <= threshold


def load_dataset(filename, n=5000, state = 6):
  """Load the review TSV and return a shuffled, class-capped binary sentiment sample.

  Args:
    filename: Path to a tab-separated file with `star_rating` and
      `review_body` columns.
    n: Maximum number of reviews to keep per class.
    state: Random seed used when shuffling the rows.

  Returns:
    A tuple `(texts, labels)` of arrays: the `review_body` values and the
    matching `star_rating` values after mapping (0 for 1-2 stars, 1 for
    4-5 stars).
  """
  df = pd.read_csv(filename, sep = '\t')

  # Rows without a review body cannot be classified and would break the
  # per-row language filter below, so drop them first.
  df = df.dropna(subset = ['review_body'])

  # Convert to binary labels: discard neutral 3-star reviews, then map
  # 1-2 stars to 0 (negative) and 4-5 stars to 1 (positive).
  mapping = {1: 0, 2: 0, 4: 1, 5: 1}
  df = df[df.star_rating != 3]
  # assign() returns a new frame, avoiding a write into a filtered slice
  # (which triggers pandas' SettingWithCopyWarning).
  df = df.assign(star_rating = df.star_rating.map(mapping))

  # Keep only reviews that are predominantly non-ASCII (i.e. Japanese text).
  is_jp = df.review_body.apply(filter_by_ascii_rate)
  df = df[is_jp]

  # Sampling: shuffle reproducibly, then take the first n rows of each class.
  df = df.sample(frac = 1, random_state = state)
  df = df.groupby('star_rating').head(n = n)
  return df.review_body.values, df.star_rating.values

In [11]:
# Install janome into the notebook runtime before importing it below.
!pip install janome
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer

# Single module-level tokenizer instance, reused by tokenize() for every review.
# NOTE(review): wakati=True presumably makes tokenize() yield plain token
# strings (main() joins them with spaces) -- confirm against the janome docs.
t = Tokenizer(wakati = True)



In [0]:
def clean_html(html, strip = False):
  """Strip markup from `html` and return only its text content.

  Args:
    html: Markup string to parse with BeautifulSoup's 'html.parser'.
    strip: Forwarded to `get_text(strip=...)`.

  Returns:
    The extracted text.
  """
  return BeautifulSoup(html, 'html.parser').get_text(strip = strip)
  
def tokenize(text):
  """Tokenize `text` with the shared module-level tokenizer `t` and return its result."""
  tokens = t.tokenize(text)
  return tokens

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def train_and_eval(x_train, y_train, x_test, y_test, vectorizer):
  """Vectorize the texts, fit a logistic regression and report test accuracy.

  Args:
    x_train: Training texts.
    y_train: Training labels.
    x_test: Test texts.
    y_test: Test labels.
    vectorizer: Object exposing `fit_transform` and `transform`; it is
      fitted by this call.

  Returns:
    The accuracy on the test set. The same value is also printed with four
    decimal places, so callers can either read the output or collect the
    returned scores for comparison.
  """
  # Fit the vectorizer on the training texts only; the test texts are
  # transformed with the already-fitted vectorizer.
  x_train_vec = vectorizer.fit_transform(x_train)
  x_test_vec = vectorizer.transform(x_test)

  clf = LogisticRegression(solver = 'liblinear')
  clf.fit(x_train_vec, y_train)

  y_pred = clf.predict(x_test_vec)
  score = accuracy_score(y_test, y_pred)
  print('{:.4f}'.format(score))
  return score

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

def main(filename = 'data/amazon_reviews_multilingual_JP_v1_00.tsv'):
  """Compare binary, count and TF-IDF text features on the review dataset.

  Args:
    filename: Path to the decompressed review TSV. The default is relative
      to the working directory, matching the `data/` folder the download
      cell writes to, instead of being tied to an absolute `/content/...` path.

  Prints a label followed by the test accuracy for each feature type.
  """
  x, y = load_dataset(filename, n = 5000)

  print('Tokenization')
  x = [clean_html(text, strip = True) for text in x]
  # Vectorizers expect whitespace-delimited text, so join the tokens with spaces.
  x = [' '.join(tokenize(text)) for text in x]
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

  # NOTE(review): the vectorizers' default token_pattern keeps only tokens of
  # two or more characters, so single-character Japanese tokens are dropped;
  # confirm this is intended.
  experiments = [
    ('Binary', CountVectorizer(binary = True)),
    ('Counter', CountVectorizer(binary = False)),
    ('TF-IDF', TfidfVectorizer(ngram_range = (1, 2))),
  ]
  for label, vectorizer in experiments:
    print(label)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)


if __name__ == '__main__':
  main()

Tokenization
Binary
0.8385
Counter
0.8365
TF-IDF
0.8545
