<a href="https://colab.research.google.com/github/tomonari-masada/course2021-nlp/blob/main/assignment_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 課題４
* 春学期に習った分類手法を使って、IMDbデータセットの感情分析をしてみよう。
  * training set / test setの分割は、そのまま使う。
  * training setをどのように使うかはお任せします。（交差検証など。）
  * test setでの分類性能をArea under the ROC curveで報告。

## fasttextの単語ベクトルを使う

* 授業で紹介したデータファイルを読み込む
  * fasttextの単語ベクトルを使ってIMDbデータセットの各文書をベクトル化したデータ

In [None]:
import numpy as np

PATH = '/content/drive/MyDrive/2021Courses/NLP/'

# Pre-computed document vectors (`texts`) and their labels (`labels`),
# each keyed by split name ('train' / 'test').
texts, labels = {}, {}
for tag in ('train', 'test'):
  # '<tag>.npy' holds the vectors, '<tag>_labels.npy' the matching labels.
  for suffix, store in (('', texts), ('_labels', labels)):
    with open(f'{PATH}{tag}{suffix}.npy', 'rb') as f:
      store[tag] = np.load(f)

In [None]:
from sklearn.linear_model import LogisticRegression
# cross_val_score is needed by the sweeps in this cell and the later ones;
# without this import the cell fails with NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# RNG object passed as `random_state` to every LogisticRegression below.
random_state = np.random.RandomState(0)

# Coarse sweep over C (inverse regularisation strength), scored by the mean
# ROC AUC across 5 cross-validation folds of the training split.
for C in [0.1, 1, 10, 100]:
  classifier = LogisticRegression(C=C, random_state=random_state, max_iter=1000)
  roc_auc = cross_val_score(classifier, texts['train'], labels['train'], cv=5, scoring='roc_auc').mean()
  print(f'C={C} | roc auc : {roc_auc:.4f}')

In [None]:
# Continue the sweep with larger C values (weaker regularisation),
# again reporting the mean ROC AUC over 5 cross-validation folds.
for C in (200, 500, 1000):
  classifier = LogisticRegression(max_iter=1000, random_state=random_state, C=C)
  fold_scores = cross_val_score(
      classifier, texts['train'], labels['train'], scoring='roc_auc', cv=5)
  roc_auc = fold_scores.mean()
  print(f'C={C} | roc auc : {roc_auc:.4f}')

In [None]:
from sklearn.metrics import roc_auc_score

# Refit on the whole training split with C=200, then score the test split.
classifier = LogisticRegression(C=200, random_state=random_state, max_iter=1000).fit(
    texts['train'], labels['train'])
# Column 1 of predict_proba is the probability of the second entry in classifier.classes_.
test_scores = classifier.predict_proba(texts['test'])[:, 1]
test_roc_auc = roc_auc_score(labels['test'], test_scores)
print(f"test roc auc : {test_roc_auc:.4f}")

## TF-IDFで文書をベクトル化する

* IMDbデータセットのテキストを取得し直す

In [None]:
!pip install ml_datasets

In [None]:
from ml_datasets import imdb
# The loader's two return values are used as the train and test splits.
imdb_splits = imdb()
train_data, test_data = imdb_splits

In [None]:
# Transpose each split from a sequence of (text, label) pairs into a tuple of
# texts and a tuple of labels.
(train_texts, train_labels), (test_texts, test_labels) = (
    zip(*split) for split in (train_data, test_data)
)

In [None]:
# Display the first training text as a quick look at the raw input.
train_texts[0]

* TF-IDFで文書ベクトルを得る

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only terms that occur in at least 10 training documents (min_df=10)
# and in at most 20% of them (max_df=0.2).
vectorizer = TfidfVectorizer(min_df=10, max_df=0.2)
# fit_transform learns the vocabulary / IDF weights and vectorises the training
# texts in a single pass, instead of tokenising the corpus twice with
# fit() followed by transform().
X = vectorizer.fit_transform(train_texts)
# The test texts are only transformed with the vocabulary and IDF weights
# learned from the training texts.
X_test = vectorizer.transform(test_texts)

In [None]:
# Show the shapes of the train and test TF-IDF matrices.
train_shape, test_shape = X.shape, X_test.shape
print(train_shape, test_shape)

In [None]:
# Sweep C (inverse regularisation strength) on the TF-IDF features, scored by
# the mean ROC AUC across 5 cross-validation folds of the training split.
for C in [0.01, 0.1, 1, 10]:
  classifier = LogisticRegression(C=C, random_state=random_state, max_iter=1000)
  mean_roc_auc = cross_val_score(classifier, X, train_labels, cv=5, scoring='roc_auc').mean()
  # Print the score computed in this iteration; `roc_auc` is a leftover from
  # the fasttext section and would show the same stale number for every C.
  print(f'C={C} | roc auc : {mean_roc_auc:.4f}')

In [None]:
# Continue the TF-IDF sweep with larger C values (weaker regularisation).
for C in [100, 200]:
  classifier = LogisticRegression(C=C, random_state=random_state, max_iter=1000)
  mean_roc_auc = cross_val_score(classifier, X, train_labels, cv=5, scoring='roc_auc').mean()
  # Print the score computed in this iteration; `roc_auc` is a leftover from
  # the fasttext section and would show the same stale number for every C.
  print(f'C={C} | roc auc : {mean_roc_auc:.4f}')

In [None]:
from sklearn.metrics import roc_auc_score

# Refit on all TF-IDF training vectors and score the test split.
# No C is passed here, so the estimator's default regularisation strength is used.
classifier = LogisticRegression(random_state=random_state, max_iter=1000).fit(X, train_labels)
# Column 1 of predict_proba is the probability of the second entry in classifier.classes_.
test_scores = classifier.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(test_labels, test_scores)
print(f"test roc auc : {test_roc_auc:.4f}")