<a href="https://colab.research.google.com/github/yujiimt/NLP/blob/master/book/scale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from sklearn.preprocessing import MinMaxScaler


data = [[-1,2],[-0.5,6],[0,10],[1,18]]
scaler = MinMaxScaler()
scaler.fit_transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [13]:
from sklearn.preprocessing import StandardScaler

data = [[0, 10],[0, 15], [1, 20], [1, 25]]
scaler = StandardScaler()
scaler.fit_transform(data)

array([[-1.        , -1.34164079],
       [-1.        , -0.4472136 ],
       [ 1.        ,  0.4472136 ],
       [ 1.        ,  1.34164079]])

In [14]:
!wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz -P data/
!gunzip -d data/amazon_reviews_multilingual_JP_v1_00.tsv.gz

--2020-04-30 02:46:40--  https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.76.110
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.76.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94688992 (90M) [application/x-gzip]
Saving to: ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’


2020-04-30 02:46:41 (81.2 MB/s) - ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’ saved [94688992/94688992]

gzip: data/amazon_reviews_multilingual_JP_v1_00.tsv already exists; do you wish to overwrite (y or n)? ^C


In [15]:
import string
import pandas as pd
!pip install janome
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer

t = Tokenizer(wakati = True)

def filter_by_ascii_rate(text, threshold = 0.9):
  ascii_letters = set(string.printable)
  rate = sum(c in ascii_letters for c in text) / len(text)
  return rate <= threshold


def load_dataset(filename, n=5000, state = 6):
  df = pd.read_csv(filename, sep = '\t')

  # マルチクラスに変更
  mapping = {1: 0, 2: 0, 4: 1, 5: 1}
  df = df[df.star_rating != 3]
  df.star_rating = df.star_rating.map(mapping)

  #日本語処理
  is_jp = df.review_body.apply(filter_by_ascii_rate)
  df = df[is_jp]

  # sampling
  df = df.sample(frac = 1, random_state = state)
  grouped = df.groupby('star_rating')
  df = grouped.head(n=n)
  return df.review_body.values, df.star_rating.values
def clean_html(html, strip = False):
  soup = BeautifulSoup(html, 'html.parser')
  text = soup.get_text(strip =  strip)
  return text
  
def tokenize(text):
  return t.tokenize(text)

Collecting janome
[?25l  Downloading https://files.pythonhosted.org/packages/79/f0/bd7f90806132d7d9d642d418bdc3e870cfdff5947254ea3cab27480983a7/Janome-0.3.10-py2.py3-none-any.whl (21.5MB)
[K     |████████████████████████████████| 21.5MB 1.2MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.3.10


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def main():
  print('Loading ..... ')
  x, y = load_dataset('/content/data/amazon_reviews_multilingual_JP_v1_00.tsv', n = 5000)
  x = [clean_html(text, strip = True) for text in x]
  x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)


  print('Vectorizing....')
  vectorizer = CountVectorizer(tokenizer = tokenize)
  x_train = vectorizer.fit_transform(x_train)
  x_test = vectorizer.transform(x_test)
  print(x_train.shape)
  print(x_test.shape)

  print('selecting feature ....')
  selector = SelectKBest(k = 7000, score_func = mutual_info_classif)
  selector.fit(x_train, y_train)
  x_train_new = selector.transform(x_train)
  x_test_new = selector.transform(x_test)
  print(x_train_new.shape)
  print(x_test_new.shape)

  print("Evaluting .... ")
  clf = LogisticRegression(solver = 'liblinear')
  clf.fit(x_train_new, y_train)
  y_pred = clf.predict(x_test_new)
  score = accuracy_score(y_test, y_pred)
  print('{:.4f}'.format(score))

if __name__ == '__main__':
    main()

Loading ..... 
Vectorizing....
(8000, 40980)
(2000, 40980)
selecting feature ....
(8000, 7000)
(2000, 7000)
Evaluting .... 
0.8370
