## NLP
使用するライブラリやメソッドを読み込む  
グローバル変数を宣言

In [None]:
import gc
import numpy as np
import pandas as pd
from itertools import chain
from tqdm import tqdm
import datetime
import re
import glob
import sys
import pickle
import os
from gensim import corpora, matutils
HOME = os.path.expanduser('~')

# Original Library
sys.path.append(f'{HOME}/kaggle/data_analysis/library/')
import utils
from pararell_utils import pararell_process

# NLP Library
from nlp_utils import stems, corpus_word_id
from wordnet import search_similar_words

# Shared logger, plus a run timestamp (YYYYMMDD_HHMMSS) whose prefix is used
# in the names of the files written by later cells.
logger = utils.logger_func()
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())
# Use fully-qualified option names: a bare 'max_columns' is treated as a
# pattern and is rejected once it matches more than one registered option.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

def get_pararell_stems(args):
    """Call ``stems`` with the keyword arguments packed in ``args``.

    Args:
        args (dict): Keyword arguments forwarded to ``stems`` unchanged.

    Returns:
        Whatever ``stems`` returns for those arguments.
    """
    stem_result = stems(**args)
    return stem_result


def get_pararell_words_dict(i, tx):
    """Collect the distinct stems of one text over every entry of ``morph_list``.

    One ``stems`` call is prepared per part of speech in the module-level
    ``morph_list`` and the calls are dispatched through ``pararell_process``.

    Args:
        i: Identifier used as the single key of the returned dict.
        tx: Text passed to ``stems`` as ``text``.

    Returns:
        dict: ``{i: word_list}`` where ``word_list`` is the de-duplicated
        union of all returned stems (order is arbitrary because it comes
        from a ``set``).
    """
    arg_list = [{'text': tx, 'morph': morph, 'regex': True} for morph in morph_list]
    stem_list = pararell_process(get_pararell_stems, arg_list)
    word_list = list(set(chain.from_iterable(stem_list)))
    return {i: word_list}

#========================================================================
# Global Variables
#========================================================================
# Name of the row-identifier column and of the class-label column.
key = 'unique_id'
target = 'target'
# Columns that are not word features: skipped when column ids are mapped to
# words, and handed to the LightGBM wrapper as its ignore_list.
ignore_list = [key, target]
# Matches one half-width digit, full-width digit, or ASCII letter.
# Not referenced by the code visible in this notebook.
regex_abcABC123 = re.compile(u"[0-9０-９a-zA-Z]")
# Part-of-speech tags (noun, adjective, verb) passed to stems() as `morph`.
morph_list = ['名詞', '形容詞', '動詞']
#  morph_list = ['名詞']

### データセットロード

In [None]:
# Year of the first training file; also used in the word-split pickle names.
year = 2015
# Keep only the four needed columns and drop rows with any missing value.
train = pd.read_excel(f'../input/jr_train{year}.xls')[['unique_id', 'contents', 'date', 'target']].dropna()
train_2016 = pd.read_excel('../input/jr_train2016.xls')[['unique_id', 'contents', 'date', 'target']].dropna()
# Training data = the `year` file followed by the 2016 file.
train = pd.concat([train, train_2016], axis=0)
test = pd.read_excel('../input/jr_test2017.xls')[['unique_id', 'contents', 'date', 'target']].dropna()
# The unique_id read from the files is replaced by a positional id (0..n-1),
# assigned independently for train and test.
train[key] = np.arange(len(train))
test[key] = np.arange(len(test))

print(train.shape)
print(train.head())
print(test.shape)
print(test.head())

**unique_id別にテキストを形態素解析にかけ、指定した品詞の単語リストを作成する**  
**合わせて全テキストでの単語カウントを行う**

In [2]:
save = True


def get_stems(df):
    """Build a ``{unique_id: word list}`` mapping for every row of ``df``.

    Each row's ``contents`` text is passed to ``get_pararell_words_dict``
    together with its ``unique_id``; the per-row dicts are merged into one.
    Every processed id is written to the logger.

    Args:
        df (pd.DataFrame): Must contain ``contents`` and ``unique_id`` columns.

    Returns:
        dict: unique_id -> list of words extracted from that row's text.
    """
    contents = df['contents'].values
    unique_ids = df['unique_id'].values

    # The dict ends up holding one word list per unique_id.
    words_by_id = {}
    for uid, text in zip(unique_ids, contents):
        logger.info(uid)
        words_by_id.update(get_pararell_words_dict(uid, text))
    return words_by_id
    
""" ここで作成したtrain_word_dictは後で使用する """
# Extract the per-text word lists for both datasets.
train_word_dict = get_stems(train)
test_word_dict = get_stems(test)
# Honour this cell's `save` switch, like the other cells that write files,
# so the extraction can be re-run without overwriting the stored pickles.
if save:
    utils.to_pkl_gzip(obj=train_word_dict, path=f'../input/train_word_split_{year}_2016')
    utils.to_pkl_gzip(obj=test_word_dict, path='../input/test_word_split_2017')

NameError: name 'sys' is not defined

#### 単語の出現頻度をカウントして保存→これを見てキーワードを選別

In [None]:
save = False
# NOTE(review): the cell that creates these pickles writes the training one
# under the stem `train_word_split_{year}_2016`, while this reads
# `train_word_split_{year}.gz`. Confirm which file is intended before
# changing it: the later cells that reload the raw Excel data must cover the
# same rows as the dict loaded here.
train_word_dict = utils.read_pkl_gzip(path=f'../input/train_word_split_{year}.gz')
test_word_dict = utils.read_pkl_gzip(path='../input/test_word_split_2017.gz')

# One (word, unique_id) frame per text, concatenated once at the end instead
# of growing `result` inside the loop (which re-copies all rows every pass).
frames = []
for ui, texts in train_word_dict.items():
    tmp = pd.Series(texts, name='word').to_frame()
    tmp['unique_id'] = ui
    frames.append(tmp)
result = pd.concat(frames, axis=0) if frames else pd.DataFrame()
id_word = result.drop_duplicates()
# df_cnt = result.groupby('word').size().reset_index().rename(columns={0:'word_freq'}).sort_values(by='word_freq', ascending=False)
# df_cnt = result.merge(id_word, on='word', how='inner').merge(train[[key, target]], on=key, how='inner')
# Attach the class label of each text to every one of its words.
df_cnt = result.merge(train[[key, target]], on=key, how='inner')
if save:
    df_cnt.to_csv(f'../output/{start_time[:11]}_jrw_train_word_count.csv', index=False)

logger.info("\nComplete Get Words From Texts!!")
display(df_cnt.shape)
display(df_cnt.head(10))

#### 作成したキーワードリストを読み込む
#### 単語リストをキーワードのみに絞る
#### 前の実行部分で取得したtrain_word_dictを使用しないとunique_idと対応させるのが手間

In [None]:
def get_keyword_list(keyword_path):
    """Return the distinct keywords found in all CSV files matching a pattern.

    Every file matched by ``keyword_path`` is read as a header-less CSV and
    the values of its first column are collected.

    Args:
        keyword_path (str): Glob pattern selecting the keyword CSV files.

    Returns:
        list: De-duplicated keywords in arbitrary order; empty when no file
        matches the pattern.
    """
    keywords = set()
    for csv_file in glob.glob(keyword_path):
        first_column = pd.read_csv(csv_file, header=None).iloc[:, 0]
        keywords.update(first_column.values)
    return list(keywords)

def keyword_filter(id_word_dict, keyword_list):
    """Restrict every word list to the words that are also keywords.

    Args:
        id_word_dict (dict): unique_id -> iterable of words.
        keyword_list (iterable): Keywords to keep.

    Returns:
        dict: unique_id -> list of the words present in both collections
        (de-duplicated, arbitrary order).
    """
    # Build the keyword set once instead of once per text.
    keyword_set = set(keyword_list)
    return {
        i: list(set(word_list) & keyword_set)
        for i, word_list in tqdm(id_word_dict.items())
    }

# Glob pattern selecting every keyword CSV file.
keyword_path = '../keyword/*.csv'
keyword_list = get_keyword_list(keyword_path)
# keyword_list = pd.read_csv('../keyword/20181114_08_jrw_train_key.csv', header=None)[0].values

# Keep only the keywords among the words extracted from the original data.
train_keyword_dict = keyword_filter(train_word_dict, keyword_list)
print(train_keyword_dict)

#### 各テキストの単語リストをループで流し、各単語の類語を取得していく
#### WordNet DBを使用
#### 類語リストを出力したら、再度キーワードリストを作成する

In [None]:
save = True
test_word = 'うるさい'
# Set wn_test to True to run one WordNet lookup for `test_word` and then stop.
wn_test = False


def get_similar_word(id_word_dict, keyword_list, original=False):
    """Look up WordNet similar words for every word of every text.

    Args:
        id_word_dict (dict): unique_id -> list of words.
        keyword_list (iterable): Keywords; only used when ``original`` is True.
        original (bool): When False, each text gets the flat list of all
            similar words found. When True, the similar words are restricted
            to ``keyword_list`` (de-duplicated) and the text's own word list
            is appended after them.

    Returns:
        tuple: ``(sim_dict, no_wn_list)`` where ``sim_dict`` maps unique_id to
        the resulting word list and ``no_wn_list`` holds the distinct words
        for which the lookup result compared equal to 0.
    """
    # Loop-invariant: build the keyword set once.
    keyword_set = set(keyword_list)
    sim_dict = {}
    no_wn_list = []
    for i, word_list in tqdm(id_word_dict.items()):
        feature_words = []

        for word in word_list:
            # WordNet DB test mode: show the result for test_word and exit.
            if wn_test:
                search_similar_words(test_word, syn_num=3, smw_num=10, display=True)
                sys.exit()
            similar_words = search_similar_words(word, syn_num=3, smw_num=10, display=False)

            # Some words have no similar words (the lookup result equals 0).
            if similar_words == 0:
                no_wn_list.append(word)
                continue
            # Flatten as we go instead of nesting and chaining afterwards.
            feature_words.extend(similar_words)

        # With original=True, combine the keyword-filtered similar words with
        # the text's own word list.
        if original:
            feature_words = list(set(feature_words) & keyword_set) + word_list
        # Store the word list of this text.
        sim_dict[i] = feature_words

    logger.info("Complete Get Similar Words!!")

    return sim_dict, list(set(no_wn_list))

# 類語のみのリストを出力
train_similarword_dict, no_wn_list = get_similar_word(train_keyword_dict, keyword_list)

#========================================================================
# Save the similar words that were found
#========================================================================
if save:
    # Distinct similar words over all texts.
    tmp_list = list(train_similarword_dict.values())
    similar_keywords = list(set(chain.from_iterable(tmp_list)))
    print(f"Count Similar Word: {len(similar_keywords)}")
    df_cnt = pd.Series(similar_keywords, name='word').sort_values(ascending=False)
    df_cnt.to_csv(f'../output/{start_time[:11]}_similar_word_list.csv', index=False)
    # Words for which no similar word was found.
    df_no_similar = pd.Series(no_wn_list, name='word')
    # The path was a plain string with a misplaced brace, so the file was
    # written under the literal name "{start_time[:11]_jrw_no_similar_word.csv}".
    df_no_similar.to_csv(f'../output/{start_time[:11]}_jrw_no_similar_word.csv', index=False)

    display(df_cnt)

**類語から作成したキーワードリストを追加で読み込む**  
**単語リストをキーワードのみに絞る**  
**前の実行部分で取得したtrain_word_dictを使用しないとunique_idと対応させるのが手間**  

In [None]:
save = True
# Re-read the keyword files (they are expected to now include the keywords
# chosen from the similar-word list saved by the previous cell).
new_keyword_list = get_keyword_list(keyword_path)

new_train_dict = keyword_filter(train_word_dict, new_keyword_list)
new_test_dict = keyword_filter(test_word_dict, new_keyword_list)

#========================================================================
# Only keywords are kept in this similar-word pass.
# Each text's word list is looped over and the similar words of every word
# are fetched from the WordNet DB.
#========================================================================
new_train_dict, train_no_wn_list = get_similar_word(new_train_dict, new_keyword_list, original=True)
new_test_dict, _ = get_similar_word(new_test_dict, new_keyword_list, original=True)

#========================================================================
# Save the resulting datasets
#========================================================================
if save:
    train = pd.Series(new_train_dict, name='word').to_frame().reset_index().rename(columns={'index':'unique_id'})
    test = pd.Series(new_test_dict, name='word').to_frame().reset_index().rename(columns={'index':'unique_id'})

    train_path = f'../output/{start_time[:11]}_train_dataset'
    test_path = f'../output/{start_time[:11]}_test_dataset'
    utils.to_pkl_gzip(obj=train, path=train_path)
    utils.to_pkl_gzip(obj=test, path=test_path)
    print("Train")
    display(train.head(10))
    print("Test")
    display(test.head(10))
    # Words without similar words in THIS pass. The previous cell's
    # `no_wn_list` was saved here while `train_no_wn_list` was never used.
    df_no_similar = pd.Series(train_no_wn_list, name='word')
    # The path was a plain string with a misplaced brace; make it a real f-string.
    df_no_similar.to_csv(f'../output/{start_time[:11]}_jrw_no_similar_allword.csv', index=False)

### 機械学習モデルに入力するため、Bag of Wordsに変換する為のgensimオブジェクトを作成

In [None]:
from gensim import corpora
from nlp_utils import word_dense

# Load the pickled datasets and turn each into {unique_id: word list}.
train_dict = (
    utils.read_pkl_gzip(train_path+'.gz')
    .dropna()
    .set_index('unique_id')['word']
    .to_dict()
)
test_dict = (
    utils.read_pkl_gzip(test_path+'.gz')
    .dropna()
    .set_index('unique_id')['word']
    .to_dict()
)
dict_path = f'../output/{start_time[:11]}_jrw_gensim_dict_2016.txt'

# Build the gensim Dictionary from a single "document" holding every distinct
# training word, so Bag of Words vectors can be produced; save it as text.
all_words = [list(set(chain.from_iterable(train_dict.values())))]
logger.info("Gensim make dictionary Start!!")
gs_dict = corpora.Dictionary(all_words)
gs_dict.save_as_text(dict_path)

### 単語リストになっている特徴セットをBag of Wordsでベクトル変換
#### 合わせてdate系のfeatureも作成する

In [None]:
# Reload the gensim dictionary from the text file at dict_path.
gs_dict = corpora.Dictionary.load_from_text(dict_path)
# NOTE(review): only the 2015 training file is loaded here, whereas the
# dataset-load cell concatenates 2015 and 2016. Confirm this covers the same
# rows as train_dict, because `target` is later assigned to the feature frame
# by position.
raw_train = pd.read_excel('../input/jr_train2015.xls')[['unique_id', 'date', 'contents', 'target']].dropna()
# raw_train2016 = pd.read_excel('../input/jr_train2016.xls')[['unique_id', 'date', 'contents', 'target']].dropna()
# raw_train = pd.concat([raw_train, raw_train2016], axis=0)
raw_test = pd.read_excel('../input/jr_test2017.xls')[['unique_id', 'date', 'contents', 'target']].dropna()


def make_featureset(words_dict, gs_dict):
    """Convert a dict of word lists into a Bag-of-Words DataFrame.

    Args:
        words_dict (dict): key: unique_id, value: list of words of that text.
        gs_dict: gensim dictionary; passed to ``word_dense`` and indexed by
            column id to obtain the column's word.

    Returns:
        pd.DataFrame: One row per unique_id. The first column (named by the
        module-level ``key``) holds the id, the remaining columns are named
        after the dictionary word for each vector position.
    """
    dense_list = []
    for word_list in words_dict.values():
        # Work on a copy so the caller's lists are never modified. A text
        # without words is represented by the single empty-string token.
        words = list(word_list) or ['']
        dense_list.append(word_dense(words, gs_dict))

    df = pd.DataFrame(dense_list, index=list(words_dict.keys())).reset_index().rename(columns={'index': key})
    # Translate numeric column ids to words; leave the id/target columns alone.
    df.columns = [col if col in ignore_list else gs_dict[col] for col in df.columns]
    return df


# Per-month lookup tables (key: month number 1-12). The values are the
# constants hard-coded by the original implementation; presumably monthly
# climate normals (units are not stated in the source).
_MAX_TEMPERATURE_BY_MONTH = {
    1: 9, 2: 10, 3: 13, 4: 20, 5: 24, 6: 27,
    7: 31, 8: 33, 9: 29, 10: 23, 11: 17, 12: 12,
}
_MIN_TEMPERATURE_BY_MONTH = {
    1: 2, 2: 2, 3: 5, 4: 10, 5: 15, 6: 20,
    7: 24, 8: 25, 9: 21, 10: 15, 11: 9, 12: 4,
}
_PRECIPITATION_BY_MONTH = {
    1: 46, 2: 60, 3: 102, 4: 134, 5: 139, 6: 206,
    7: 157, 8: 95, 9: 172, 10: 108, 11: 65, 12: 34,
}


def feature_month(df):
    """Replace the ``date`` column with month-derived numeric features.

    Adds ``max_temperature``, ``min_temperature``, ``precipitation_amount``
    (table lookups by month) and ``cos_month`` (cosine of the month mapped
    onto a 360-degree circle, 30 degrees per month), then drops ``date``.

    Args:
        df (pd.DataFrame): Must contain a ``date`` column whose values
            support ``strftime``. Modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object, without ``date``.
    """
    df['month'] = df['date'].map(lambda x: int(x.strftime("%m")))
    # Fallback values (20 / 20 / 100) are kept from the original for any
    # month outside 1-12.
    df['max_temperature'] = df['month'].map(lambda m: _MAX_TEMPERATURE_BY_MONTH.get(m, 20))
    df['min_temperature'] = df['month'].map(lambda m: _MIN_TEMPERATURE_BY_MONTH.get(m, 20))
    df['precipitation_amount'] = df['month'].map(lambda m: _PRECIPITATION_BY_MONTH.get(m, 100))
    # month * 30 is an angle in degrees; np.cos expects radians.
    df['cos_month'] = df['month'].map(lambda m: np.cos(np.deg2rad(m * 30)))
    df.drop(['date', 'month'], axis=1, inplace=True)

    return df

class_list = list(raw_train.dropna()['target'].drop_duplicates().values)
y_train = raw_train['target'].values
y_test = raw_test['target'].values

# Bag-of-Words features, then date and label attached by POSITION.
# raw_* keep their pre-dropna index labels, so assigning the date Series
# directly would align on labels and disagree with the positional `target`.
train = make_featureset(train_dict, gs_dict)
train['date'] = raw_train['date'].values
train[target] = y_train

test = make_featureset(test_dict, gs_dict)
test['date'] = raw_test['date'].values
test[target] = y_test
# Drop rows without a label, then rows with any other missing value.
train = train.loc[train[target].dropna().index, :].dropna()
test = test.loc[test[target].dropna().index, :].dropna()
del raw_train, raw_test
train = feature_month(train)
test = feature_month(test)
# Remove the placeholder column produced for texts without words, if present.
# errors='ignore' works whether pandas would raise ValueError or KeyError.
train.drop('', axis=1, inplace=True, errors='ignore')
test.drop('', axis=1, inplace=True, errors='ignore')
gc.collect()

### ベクトル化したデータを可視化

In [None]:
# train.drop('date', axis=1, inplace=True)
# test.drop('date', axis=1, inplace=True)
# Show the size and the first rows of the vectorized train / test frames.
print(train.shape)
display(train.head())
print(test.shape)
display(test.head())

### 機械学習による分類を実施

In [None]:
%load_ext autoreload
%autoreload 2
# Original Library
sys.path.append(f'{HOME}/kaggle/data_analysis/library/')
sys.path.append(f'{HOME}/kaggle/data_analysis')
from model.lightgbm_ex import lightgbm_ex as lgb_ex
from model.params_lgbm import train_params_nlp
from preprocessing import factorize_categoricals, get_dummies, ordinal_encode, get_ordinal_mapping

#========================================================================
# ML Args
#========================================================================
model_type = 'lgb'
params = train_params_nlp()
multiclass = True
# metric = 'auc'
metric = 'accuracy'
early_stopping_rounds = 50
num_boost_round = 1000
# In the code visible here learning_rate is only used in the output file name.
learning_rate = 0.1
fold=4
fold_type='stratified'
group_col_name=''
dummie=1
oof_flg=True
#  oof_flg=False
LGBM = lgb_ex(logger=logger, metric=metric, model_type=model_type, ignore_list=ignore_list)

#========================================================================
# Train & Prediction Start
#========================================================================
# Overrides the per-class list built earlier: a single multiclass model is trained.
class_list = ['multiclass']

# Keep the raw class labels so they can be restored before encoding.
train_class = train[target].values
test_class = test[target].values
# Not called anywhere in the code visible here.
label_method = lambda x:1 if x==text_class else 0

#========================================================================
# Lifted forcibly from the original code, so it contains a lot of
# unnecessary processing
#========================================================================
for i, text_class in enumerate(class_list):
    params['objective'] = 'multiclass'
    # NOTE(review): the number of classes is hard-coded — confirm it matches the data.
    params['num_class'] = 13
    del params['metric']
    LGBM.metric = 'accuracy'
    
    # Encode the class labels
    # target is taken out of ignore_list only for the data_check call;
    # presumably so that it gets encoded as well — TODO confirm.
    ignore_list.remove(target)
    train[target] = train_class
    train, _, _ = LGBM.data_check(train=train, test=[], target=target, encode='ordinal')
    ignore_list.append(target)
    # Map: encoded label -> original class label.
    category_map = train[target].to_frame()
    category_map['text_class'] = train_class
    category_map = category_map.drop_duplicates().set_index(target).to_dict()['text_class']
    train[target] += -1 # classes come from ordinal encoding, so they start at 1

    logger.info(f'''
#========================================================================
# Text Class: {text_class} Classifier Start!!
#========================================================================''')

    if len(test)==0:
        # No test rows: cross-validation only.
        LGBM = LGBM.cross_validation(
            train=train
            ,key=''
            ,target=target
            ,fold_type=fold_type
            ,fold=fold
            ,group_col_name=group_col_name
            ,params=params
            ,num_boost_round=num_boost_round
            ,early_stopping_rounds=early_stopping_rounds
        )
    else:
        test[target] = test_class
        # NOTE(review): fit_transform is called on test; if this re-fits the
        # encoder, test labels may be numbered differently from train — confirm.
        test = LGBM.decoder.fit_transform(test)
        test[target] += -1

        LGBM = LGBM.cross_prediction(
            train=train
            ,test=test
            ,key=key
            ,target=target
            ,fold_type=fold_type
            ,fold=fold
            ,group_col_name=group_col_name
            ,params=params
            ,num_boost_round=num_boost_round
            ,early_stopping_rounds=early_stopping_rounds
            ,oof_flg=oof_flg
        )

    # Save the cross-validation feature importance table.
    LGBM.cv_feim.to_csv(f'../valid/{start_time[4:12]}_{model_type}_jrw_{text_class}_feat{len(LGBM.use_cols)}_CV{LGBM.cv_score}_lr{learning_rate}.csv', index=False)

    # The objective is always set to 'multiclass' above, so the loop runs once.
    if params['objective']=='multiclass':
        break

#========================================================================
# DataFrame comparing the training target with the prediction
# Column: unique_id | target | prediction
#========================================================================
train_stack = LGBM.train_stack
# +1 undoes the shift to 0-based labels before looking up the original class.
train_stack[target] = train_stack[target].map(lambda x: category_map[x+1])
train_stack['prediction'] = train_stack['prediction'].map(lambda x: category_map[x+1])
train_stack.to_csv(f'../output/{start_time[:11]}_jrw_train_stack_CV{LGBM.cv_score}.csv', index=False)

In [None]:
from matplotlib import pyplot as plt
import japanize_matplotlib
%matplotlib inline
import seaborn as sns
# Bar chart of the 50 rows of LGBM.cv_feim with the highest avg_importance.
plt.figure(figsize=(12, 16))
sns.barplot(data=LGBM.cv_feim.sort_values(by='avg_importance', ascending=False).iloc[:50, :], x='avg_importance', y='feature')
plt.show()
# viz.set_xticklabels(df_cnt['index'], rotation=90)