In [3]:
import ast
import json
import math

import lightgbm as lgb
import matplotlib.pyplot as plt # for plotting
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Columns used in this analysis (the features plus the target, vote_average)
x_col = ["genres","original_language","production_companies","release_date","runtime","vote_average"]
df = pd.read_csv("movie/tmdb_5000_movies.csv")
# Take an explicit copy: later cells assign new columns into df_x, and writing
# into a selection of df would trigger SettingWithCopyWarning.
df_x = df[x_col].copy()

In [None]:
# Preview the first rows of the selected columns.
df_x.head()

In [None]:
# From here on, every column is converted to a numeric type, starting with genres.
genres = df_x["genres"]
# Collect the distinct genres (id -> name) and, per movie, its list of genre names.
movie_genre_dict = dict()
movie_genre_list = list()

for movie_genre in genres:
    # Each cell is the text of a list of {"id": ..., "name": ...} dicts.
    # Parsing the whole string (brackets included) always yields a list, so
    # movies with zero, one or many genres all go through the same code path.
    genre_list = []
    for per_genre in ast.literal_eval(movie_genre):
        movie_genre_dict[per_genre["id"]] = per_genre["name"]
        genre_list.append(per_genre["name"])
    # One list per movie (empty when the movie has no genre).
    movie_genre_list.append(genre_list)

In [None]:
# Print the number of rows and the number of parsed genre lists (one per line).
for sized in (genres, movie_genre_list):
    print(len(sized))

In [None]:
# We found that there are 20 distinct genres.
# Next, one-hot encode them.

In [None]:
# Column order of the one-hot genre frame (20 genres).
genres_list = [
    "Action",
    "Adventure",
    "Fantasy",
    "Science Fiction",
    "Crime",
    "Drama",
    "Thriller",
    "Animation",
    "Family",
    "Western",
    "Comedy",
    "Romance",
    "Horror",
    "Mystery",
    "History",
    "War",
    "Music",
    "Documentary",
    "Foreign",
    "TV Movie",
]

In [None]:
# All-zero one-hot frame: one row per movie, one column per genre.
# The shape is derived from the data instead of being hard-coded, and the
# index is shared with df_x so the later column-wise concat lines up.
df_genre = pd.DataFrame(0, index=df_x.index, columns=genres_list)
df_genre.head()

In [None]:
# Preview only the first few entries; the full list has one entry per movie.
movie_genre_list[:5]

In [None]:
# df.at[0,"Action"] reads or writes a single cell of the frame
def put_genre(df,movie_genre_list):
    """Mark each movie's genres with 1.0 in the one-hot frame ``df`` (in place).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per movie, one column per genre.
    movie_genre_list : sequence
        ``movie_genre_list[k]`` is an iterable of genre names for the k-th
        row of ``df``; an empty iterable leaves the row untouched.

    Raises
    ------
    ValueError
        If ``movie_genre_list`` does not have exactly one entry per row.
    """
    if len(movie_genre_list) != len(df):
        raise ValueError(
            "movie_genre_list has %d entries but df has %d rows"
            % (len(movie_genre_list), len(df))
        )
    known_genres = set(df.columns)
    # Walk the actual index labels so the function does not depend on df
    # having a 0..n-1 index.
    for row_label, genre_names in zip(df.index, movie_genre_list):
        for genre in genre_names:
            # Skip names without a column; writing them with .at would
            # silently append a new column to df.
            if genre in known_genres:
                df.at[row_label, genre] = 1.0
# Fill df_genre in place with each movie's genres.
put_genre(df_genre,movie_genre_list)

In [None]:
# Check the last rows of the filled one-hot genre frame.
df_genre.tail()

In [None]:
# Frequency of each original-language code in the raw data.
language = df.loc[:, "original_language"]
language.value_counts()

In [None]:
# Label encoder, applied to the original_language column in the next cell
# (the original comment described it as encoding the country where the movie was made).
le = LabelEncoder()

In [None]:
# Replace the language codes with the integer labels produced by the encoder.
df_x["original_language"] = le.fit_transform(df_x["original_language"])

In [None]:
# Inspect df_x after encoding original_language.
df_x.info()

In [None]:
# Inspect the frame as it looks without the raw genres column.
# drop() returns a new frame, so its result has to be used directly; df_x itself
# is left unchanged here because a later cell removes the raw columns for good.
df_x.drop(columns="genres").info()

In [None]:
# Next, convert the release date to numeric values.
# It is split into a year and a month.
df["release_date"].value_counts()

In [None]:
# Build lookup tables from the raw date string to its year and its month (as floats).
release_year = {}
release_month = {}
for date_text in df["release_date"]:
    if type(date_text) != str:
        # Non-string entries (presumably NaN for a missing date) get no mapping.
        continue
    year_part, month_part, _day_part = date_text.split("-")
    release_year[date_text] = float(year_part)
    release_month[date_text] = float(month_part)

In [None]:
# Look each release date up in the year / month dictionaries.
# Series.map does one hash lookup per row, whereas Series.replace with a dict
# of thousands of keys is far slower for the same result. Entries that are not
# a key of the dictionaries (non-string dates) become NaN.
df_x["release_year"] = df_x["release_date"].map(release_year).astype(float)
df_x["release_month"] = df_x["release_date"].map(release_month).astype(float)
df_x.info()

In [None]:
# Encode the production companies: start from the raw column.
production_companies = df_x["production_companies"]

In [None]:
# まずは空のものがいくつあるのかを確認
empty_list = list()
companies_dict = dict()
companies_list = list()
for i,companies in enumerate(production_companies):
    if companies == "[]":
        empty_list.append(i)
        companies_list.append("")
    else:
        companies = ast.literal_eval(companies[1:-1])
        append_list = []
        for j,company in enumerate(companies):
            if type(company) == dict:
                name = company["name"]
                if name in companies_dict:
                    companies_dict[name] += 1
                else:
                    companies_dict[name] = 1
                append_list.append(name)
            else:
                name = companies["name"]
                if name in companies_dict:
                    companies_dict[name] += 1
                else:
                    companies_dict[name] = 1
                append_list.append(name)
    companies_list.append(append_list)
    append_list = []
companies_list

In [None]:
production_companies[0]

In [None]:
# 流石に会社が多すぎるため,1つのものは消していきたい
new_companies_dict = dict()
new_companies_list = list()
for key,value in companies_dict.items():
    if value <= 50:
        continue
    else:
        new_companies_dict[key] = value
        new_companies_list.append(key)
len(new_companies_dict)

In [None]:
new_companies_dict

In [None]:
# では配給会社をもとにデータフレームを構築する
a = [0]*4803*25
df_companies = pd.DataFrame(np.array(a).reshape(4803, 25),
                  columns=new_companies_list)
df_companies.head()

In [None]:
# Store the values in the frame created above
def put_companies(df,companies_list):
    """Set ``df.at[i, company] = 1.0`` for every company of row ``i`` that has a column.

    ``companies_list[i]`` is iterated for each ``i`` in ``range(len(df))``;
    names that are not among the columns of ``df`` are skipped. ``df`` is
    modified in place and nothing is returned.
    """
    known_columns = df.columns.values
    for row in range(len(df)):
        for company in companies_list[row]:
            if company not in known_columns:
                continue
            df.at[row, company] = 1.0
# Fill df_companies in place, then display it.
put_companies(df_companies,companies_list)
df_companies

In [None]:
# Inspect df_x before the raw columns are dropped.
df_x.info()

In [None]:
# Inspect the one-hot genre frame.
df_genre.info()

In [None]:
# Inspect the one-hot production-company frame.
df_companies.info()

In [None]:
# Remove the raw columns that have been replaced by encoded features.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
df_x = df_x.drop(columns=["genres","production_companies","release_date"], errors="ignore")

In [None]:
# Inspect df_x after the raw columns were dropped.
df_x.info()

In [None]:
# Put the base features, the company indicators and the genre indicators side by side.
df_x = pd.concat((df_x, df_companies, df_genre), axis="columns")

In [None]:
# Inspect the combined feature frame.
df_x.info()

In [None]:
# Drop every row that contains a missing value (no imputation this time).
# dropna() defaults to rows (axis=0) with any NaN (how="any").
df_x = df_x.dropna()
df_x.info()

In [None]:
# Number of distinct values of the target column.
df_x["vote_average"].nunique()

In [None]:
# Split the data into a training set and a test set

# Explanatory variables (every column except the target) and the target itself
target_col = "vote_average"
X = df_x.drop(columns=target_col).to_numpy()
y = df_x[target_col].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2
)

# LightGBM datasets; the held-out split is also passed as the validation set
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# LightGBM parameters
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',  # objective: regression
    metric="mae",            # evaluation metric: mean absolute error
)

In [1]:
# Train the model
model = lgb.train(
    params,
    train_set=lgb_train,    # training data
    valid_sets=[lgb_eval],  # validation data
)

# Predict on the test data; for this regression objective predict() returns
# one predicted target value per row.
y_pred = model.predict(X_test)

NameError: name 'lgb' is not defined

In [None]:
# Compare the model's predictions with the true values on the test data

# Table of true and predicted values (columns named after the actual target)
df_pred = pd.DataFrame({'vote_average':y_test,'vote_average_pred':y_pred})
display(df_pred)

# Scatter plot of true vs predicted values
fig, ax = plt.subplots()
# Reference line y = x: points lie on it when the prediction equals the true value
ax.plot(y_test, y_test, color = 'red', label = 'x=y')
ax.scatter(y_test, y_pred)
ax.set_xlabel('y')       # x axis: true values
ax.set_ylabel('y_pred')  # y axis: predicted values
ax.set_title('y vs y_pred')
ax.legend();             # show the 'x=y' label of the reference line

In [None]:
# Model evaluation
# RMSE: square root of the mean squared error
mse = mean_squared_error(y_test, y_pred) # MSE (mean squared error)
rmse = np.sqrt(mse) # RMSE = sqrt(MSE)
print('RMSE :',rmse)

# MAE: mean absolute error, the metric the model was trained with
print('MAE :',mae(y_test, y_pred))

# R2: coefficient of determination
r2 = r2_score(y_test,y_pred)
print('R2 :',r2)