## 7.8　より進んだトークン分割、語幹処理、見出し語化

In [1]:
# Uncomment the next two lines when running on Google Colab so that
# Google Drive is mounted under /content/drive.
# from google.colab import drive
# drive.mount('/content/drive')
# Directory holding the text_*.txt / y_*.txt data files that are opened
# further down as f'{path}/{name}.txt'.
path = '/content/drive/MyDrive/working'

In [2]:
# !pip install mglearn
# !python -m spacy download en_core_web_sm

In [3]:
import re
import nltk
import spacy
import mglearn
import warnings
import numpy as np
from spacy.tokens import Doc
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit

In [4]:
# Suppress every Python warning from this point on (the 'ignore' action is
# installed without any category or module restriction).
warnings.simplefilter(action='ignore')
# Default size for all figures created later in the notebook.
plt.rcParams.update({'figure.figsize': (5, 5)})

In [5]:
# Each data set lives in '<path>/<name>.txt' with one record per line.
# The second element of every pair is the converter applied to each
# whitespace-stripped line: str for the texts, int for the labels.
file_specs = (
    ('text_train', str),
    ('y_train', int),
    ('text_test', str),
    ('y_test', int),
)

loaded_data = {}
for name, cast in file_specs:
    with open(f'{path}/{name}.txt', 'r', encoding='utf-8') as f:
        loaded_data[name] = list(map(cast, map(str.strip, f)))

# Expose the four lists under the names used by the rest of the notebook.
text_train, y_train = loaded_data['text_train'], loaded_data['y_train']
text_test, y_test = loaded_data['text_test'], loaded_data['y_test']

In [6]:
# NLTK Porter stemmer, used by the stemming half of the comparison below.
stemmer = nltk.stem.PorterStemmer()
# spaCy pipeline loaded from the 'en_core_web_sm' model; it supplies the
# tokens and their lemma_ / norm_ attributes.
en_nlp = spacy.load('en_core_web_sm')

In [7]:
def compare_normalization(doc):
  """Print two normalizations of ``doc`` side by side.

  The text is run through the module-level ``en_nlp`` pipeline. For the
  resulting tokens the function prints:

  * under ``Lemmatization:`` the list of each token's ``lemma_``;
  * under ``Stemming:`` the list obtained by passing each token's
    lower-cased ``norm_`` to the module-level ``stemmer.stem``.

  Args:
    doc: the text to analyze.

  Returns:
    None; the results are written to standard output only.
  """
  tokens = en_nlp(doc)

  lemmas = [tok.lemma_ for tok in tokens]
  print("Lemmatization:", lemmas, sep="\n")

  # The stemmer receives the lower-cased norm_ string of each token.
  stems = list(map(stemmer.stem, (tok.norm_.lower() for tok in tokens)))
  print("Stemming:", stems, sep="\n")

In [8]:
# Sample sentence containing "meeting" twice plus the irregular forms
# "was" and "worse", so the two normalizations can be compared.
sample_text = ("Our meeting today was worse than yesterday, "
               "I'm scared of meeting the clients tomorrow.")
compare_normalization(sample_text)

Lemmatization:
['our', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', 'I', 'be', 'scared', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
Stemming:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', 'i', 'am', 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']


In [9]:
# Load the 'en_core_web_sm' pipeline again; an identical load already
# happens earlier in the notebook, so this rebinds en_nlp to a new instance.
# NOTE(review): presumably done so later cells start from an unmodified
# pipeline - confirm before removing the duplicate load.
en_nlp = spacy.load('en_core_web_sm')
# Token pattern: runs of two or more word characters delimited by word
# boundaries ((?u) is the inline Unicode flag). This is the same pattern
# string scikit-learn documents as CountVectorizer's default token_pattern.
regexp = re.compile(r"(?u)\b\w\w+\b")

In [10]:
# Bag-of-words features for the training texts.
# min_df=5 drops every term that appears in fewer than five documents.
vect = CountVectorizer(min_df=5)
# fit_transform learns the vocabulary and builds the document-term matrix
# in one pass. The previous fit(...) followed by transform(...) on the same
# corpus tokenized every document twice to produce the same matrix.
X_train = vect.fit_transform(text_train)
print("X_train.shape: {}".format(X_train.shape))

X_train.shape: (25000, 27271)


In [11]:
# Candidate values of C to evaluate for LogisticRegression.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Five stratified random splits. Each split fits on 1% of the rows and
# scores on the other 99%; random_state=0 makes the splits repeatable.
cv = StratifiedShuffleSplit(
    n_splits=5,
    train_size=0.01,
    test_size=0.99,
    random_state=0,
)

# Search over param_grid using the splits defined above.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=cv,
)
grid.fit(X_train, y_train)

# Report the best mean score found across the splits.
best_cv_score = grid.best_score_
print("Best cross-validation score (standard CountVectorizer): {:.3f}".format(best_cv_score))

Best cross-validation score (standard CountVectorizer): 0.719
