## 必要なライブラリをインポート

In [None]:
import sys
import os
import time
import pandas as pd
import MeCab as mc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge,Lasso,ElasticNet
import matplotlib.pyplot as pyp
from sklearn.linear_model import RidgeCV,LassoCV,ElasticNetCV,RidgeClassifierCV
from sklearn.model_selection import GridSearchCV
import numpy as np
from operator import itemgetter

## データの取得

In [None]:
# Load the review table from df.csv. Later cells read its "comment", "title"
# and "score" columns.
review_details = pd.read_csv("df.csv")
# Preview the first rows as the cell output.
review_details.head()

## 形態素解析
### タイトルはコメントの要約に過ぎないので今回はコメントの分析だけ行う

In [None]:
# Feature tags that mark a token as a keyword. In the IPADIC feature layout
# ("pos,sub-category 1,...") the first three values are noun sub-categories,
# while "形容詞" (adjective) is a top-level part of speech.
_KEYWORD_TAGS = frozenset(["一般", "固有名詞", "サ変接続", "形容詞"])

# Tagger shared by every call; building a new one per review is needlessly slow.
_mecab_tagger = None


def _get_mecab_tagger():
    """Return the shared MeCab tagger, creating it on first use."""
    global _mecab_tagger
    if _mecab_tagger is None:
        # NOTE(review): MeCab usually selects a dictionary with "-d <path>";
        # confirm that passing the bare path really loads mecab-ipadic-neologd.
        tagger = mc.Tagger('/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
        # Workaround: an initial empty parse keeps node surface strings from
        # being garbage-collected while the node list is walked.
        tagger.parse('')
        _mecab_tagger = tagger
    return _mecab_tagger


def mecab_analysis(text):
    """Extract keyword tokens from *text* with MeCab.

    A token is kept when either of its first two feature fields (part of
    speech, sub-category 1) is one of "一般", "固有名詞", "サ変接続" or
    "形容詞". Checking the first field as well is what lets adjectives
    through: "形容詞" is a top-level part of speech and never appears as a
    sub-category.

    Args:
        text: The sentence to analyse. Anything that is not a non-empty
            string (for example the NaN pandas uses for a missing comment)
            yields an empty result.

    Returns:
        The surface forms of the kept tokens, each followed by a single
        space (so a non-empty result ends with a trailing space). If MeCab
        fails, a warning is issued and the tokens collected so far are
        returned.
    """
    if not isinstance(text, str) or not text:
        return ""

    words = []
    try:
        node = _get_mecab_tagger().parseToNode(text)
        while node:
            # Only the first two feature fields are relevant for the filter.
            leading_fields = node.feature.split(",")[:2]
            if any(tag in _KEYWORD_TAGS for tag in leading_fields):
                words.append(node.surface)
            node = node.next
    except (RuntimeError, UnicodeError) as err:
        # Stay best-effort per row, but do not hide the failure completely.
        import warnings

        warnings.warn(f"MeCab analysis failed: {err}", RuntimeWarning)

    return "".join(word + " " for word in words)

# Tokenise every review comment into a space-separated keyword string.
review_details["comment_mecab"] = review_details["comment"].apply(mecab_analysis)
# Titles are intentionally not analysed; kept for reference:
#review_details["title_mecab"] = review_details["title"].apply(lambda x:mecab_analysis(x))

In [None]:
# Preview the table again, now including the "comment_mecab" column.
review_details.head()

## tf-idfでベクトル化

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the keyword strings; the token pattern matches runs of one or
# more word characters.
tfidf_reviews = review_details["comment_mecab"]
reviews_vectorizer = TfidfVectorizer(
    use_idf=True,
    token_pattern=u'(?u)\\b\\w+\\b',
)
reviews_vecs = reviews_vectorizer.fit_transform(tfidf_reviews)

# Densify the TF-IDF matrix and place its columns next to the original ones.
reviews_vecs_df = pd.DataFrame(reviews_vecs.toarray())
concat_df = pd.concat([review_details, reviews_vecs_df], axis=1)

# Remove the raw-text columns so they do not reach the regression cells.
drop_col = ["comment", "title", "comment_mecab"]
for_reg_df = concat_df.drop(columns=drop_col)
for_reg_df.head()

## 機械学習モデル構築

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge,Lasso,ElasticNet
import matplotlib.pyplot as pyp
from sklearn.linear_model import RidgeCV,LassoCV,ElasticNetCV,RidgeClassifierCV
from sklearn.model_selection import GridSearchCV

In [None]:
# Split into 80% training rows and a validation remainder; random_state pins
# the shuffle so the split is reproducible.
# NOTE(review): iloc[:, 1:] uses every column after the first as a feature —
# presumably "score" is column 0; confirm, otherwise the target leaks into x.
x_train, x_val, y_train, y_val = train_test_split(
    for_reg_df.iloc[:, 1:],
    for_reg_df["score"],
    train_size=0.8,
    random_state=1,
)

In [None]:
# Ridge regression with RidgeCV's default settings, fitted on the training split.
ridge_clf = RidgeCV().fit(x_train, y_train)

In [None]:
# Lasso regression whose regularisation strength is chosen by 10-fold CV.
lasso_clf = LassoCV(cv=10).fit(x_train, y_train)

In [None]:
# Exhaustive search over the ElasticNet penalty strength and L1/L2 mix.
param_grid = {
    'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'l1_ratio': [0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
}
elastic_grid = GridSearchCV(ElasticNet(), param_grid=param_grid)
elastic_grid.fit(x_train, y_train)
# Show the winning parameter combination as the cell output.
elastic_grid.best_params_

In [None]:
# Use the model the grid search refit on the whole training split with its
# best alpha / l1_ratio (GridSearchCV refits by default), rather than
# hard-coded values that can drift from the search result when the data changes.
elastic_clf = elastic_grid.best_estimator_

In [None]:
# Score the held-out validation rows with each fitted model.
ridge_pred = ridge_clf.predict(x_val)
lasso_pred = lasso_clf.predict(x_val)
elastic_pred = elastic_clf.predict(x_val)

In [None]:
# Print the heading, the intercept and the coefficient vector, one per line.
print("\nLassoでの係数", lasso_clf.intercept_, lasso_clf.coef_, sep="\n")

In [None]:
print("\nridge_RMS")
# NOTE: despite the "RMS" name, this is the mean squared error on the
# validation split (no square root is taken).
ridge_RMS = np.mean(np.square(ridge_pred - y_val))
print(ridge_RMS)

In [None]:
# Print the heading, the intercept and the coefficient vector, one per line.
print("\nRidgeでの係数", ridge_clf.intercept_, ridge_clf.coef_, sep="\n")

In [None]:
print("\nlasso_RMS")
# NOTE: despite the "RMS" name, this is the mean squared error on the
# validation split (no square root is taken).
lasso_RMS = np.mean(np.square(lasso_pred - y_val))
print(lasso_RMS)

In [None]:
# Print the heading, the intercept and the coefficient vector, one per line.
print("\nElasticNetでの係数", elastic_clf.intercept_, elastic_clf.coef_, sep="\n")

In [None]:
print("\nelasticnet_RMS")
# NOTE: despite the "RMS" name, this is the mean squared error on the
# validation split (no square root is taken).
elastic_RMS = np.mean(np.square(elastic_pred - y_val))
print(elastic_RMS)


## 平均二乗誤差が小さいRidge回帰を採用

In [None]:
# Position of the largest Ridge coefficient.
ridge_clf.coef_.argmax()

In [None]:
# Positions of all Ridge coefficients greater than 0.9.
np.nonzero(ridge_clf.coef_ > 0.9)

In [None]:
#リストから参照する方法
#https://qiita.com/supersaiakujin/items/d63c73bb7b5aac43898a

In [None]:
def get_keyword(idx):
    """Print the vocabulary entry whose TF-IDF column index equals *idx*.

    Args:
        idx: The column index, given either as an int or as its decimal
            string (e.g. 1416 or '1416').

    Prints the matching rows of the (term, index) table. The table is a
    string array, so every printed index is a string.
    """
    # Converting the (term, index) pairs to one array turns the indices into
    # strings; normalise idx the same way so integer arguments match too.
    wanted = str(idx)
    vocab_table = np.array(list(reviews_vectorizer.vocabulary_.items()))
    print(vocab_table[vocab_table[:, 1] == wanted])

In [None]:
# Show the vocabulary term stored at TF-IDF column index 1416.
get_keyword('1416')