In [None]:
# レビュー値ではなく売り上げ予測の方向にシフト
# とりあえず回帰から
import math

import cols as cols
import pandas as pd
import numpy as np
import json
import ast
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Load the movie metadata table and the credits table.
df1 = pd.read_csv("../movie/tmdb_5000_movies.csv")
df2 = pd.read_csv("../movie/tmdb_5000_credits.csv")

# Join on the movie id instead of the title: titles are not guaranteed to be
# unique, and a title join pairs every same-named film with every other one,
# which duplicates rows. The credits table's own "title" column is dropped
# first so the merged frame keeps a single "title" column.
# NOTE(review): assumes "id" comes from the movies CSV and "movie_id" from the
# credits CSV, and that both identify the same film - confirm against the data.
df3 = pd.merge(df1, df2.drop(columns=["title"]), left_on="id", right_on="movie_id")

# Drop the columns that are not used as features.
df3 = df3.drop(columns=['homepage', 'id',"original_title","overview","status","tagline","vote_count","movie_id","popularity","original_language","title","production_countries","keywords","cast"])

# Keep only complete rows whose revenue is at least 1000 (the original intent
# was to remove films with no recorded revenue), then renumber the rows 0..n-1
# so that later positional loops line up with the index.
df3 = df3.dropna(axis=0, how="any")
df3 = df3.query("revenue >= 1000").reset_index(drop=True)
df3.shape

In [None]:
# Build the "director_sales" feature: the average revenue of each film's
# director(s), taken over all films in df3.
# NOTE(review): every film's own revenue is part of its director's average, and
# the averages are computed before the train/test split done in a later cell,
# so this feature carries target information - consider computing it from the
# training rows only.


def extract_directors(crew_json):
    """Return the distinct names of crew members whose job is "Director".

    `crew_json` is one raw cell of the "crew" column. The whole string is
    parsed as JSON, so a crew with a single member is handled exactly like a
    longer one and only people whose "job" is "Director" are returned.
    NOTE(review): assumes each cell is a JSON array of objects carrying "job"
    and "name" keys - confirm against the data.
    """
    return sorted({
        member["name"]
        for member in json.loads(crew_json)
        if member.get("job") == "Director"
    })


# One list of director names per film, in row order (empty when none is listed).
crews_list = [extract_directors(raw_crew) for raw_crew in df3["crew"]]

# Per director: (total revenue, number of films).
director_totals = dict()
for revenue, directors in zip(df3["revenue"], crews_list):
    for name in directors:
        total, count = director_totals.get(name, (0, 0))
        director_totals[name] = (total + revenue, count + 1)

# Average revenue per director (floor division, as before).
new_director_sales = {
    name: total // count for name, (total, count) in director_totals.items()
}

# Per film: the floor mean of its directors' averages, or 0 when it has none.
director_sum_sales = [
    sum(new_director_sales[name] for name in directors) // len(directors)
    if directors else 0
    for directors in crews_list
]

# Attach the feature and discard the raw crew column.
df3 = df3.assign(director_sales=director_sum_sales).drop(columns=["crew"])
df3.head()

In [None]:
# TODO: also build a production-company feature from past revenue (not implemented yet)


In [None]:
# Replace the raw "spoken_languages" column with the number of languages listed.


def count_spoken_languages(raw_languages):
    """Return how many entries one raw "spoken_languages" cell lists.

    The whole string is parsed as JSON so that a single-language film counts
    as 1 (taking len() of the lone dict would count its keys instead).
    An empty list is counted as 1, matching the original fallback.
    NOTE(review): assumes each cell is a JSON array - confirm against the data.
    """
    return max(len(json.loads(raw_languages)), 1)


language_list = [count_spoken_languages(raw) for raw in df3["spoken_languages"]]

# Overwrite the raw column with the counts.
df3["spoken_languages"] = language_list
df3.head()

In [None]:
# リリース日を月と年月に分けてそれぞれ代入
# また月日は円循環なので,sinとcosに分ける

# まずは月日を1/1からどれだけ経ったかに変換する関数を作成
def convert_date_to_number(month: int, day: int) -> int:
    """Return the number of days elapsed since the start of the year.

    The result is `day` plus the lengths (as reported by get_days_in_month)
    of every month from 1 up to, but not including, `month`.
    """
    days_in_earlier_months = sum(
        get_days_in_month(earlier) for earlier in range(1, month)
    )
    return days_in_earlier_months + day

def get_days_in_month(month: int) -> int:
    """Return the length of `month` in days, treating February as 28 days.

    Months 1, 3, 5, 7, 8, 10 and 12 give 31, month 2 gives 28, and every
    other value gives 30.
    """
    if month == 2:
        return 28
    return 31 if month in (1, 3, 5, 7, 8, 10, 12) else 30

# 次にsinとcosに変換する関数を作成
def encode(month, day):
    """Return [sin, cos] of the date's position on a 365-day circle.

    The day of the year comes from convert_date_to_number. The period is fixed
    at 365, so the encoding is only approximate for data whose cycle length
    varies.
    """
    angle = 2 * np.pi * convert_date_to_number(month, day) / 365
    return [np.sin(angle), np.cos(angle)]

# Derive the release-date features: the release year, plus the day of the year
# encoded as a sin/cos pair.
# Lookup tables keyed by the raw "YYYY-MM-DD" string.
release_year = dict()
release_day_sin = dict()
release_day_cos = dict()
for key in df3["release_date"].unique():
    # Non-string entries (e.g. missing values) get no table entry and therefore
    # map to NaN below.
    if isinstance(key, str):
        year, month, day = key.split("-")
        release_year[key] = float(year)
        release_day_sin[key], release_day_cos[key] = encode(int(month), int(day))

# Look each date up with .map(): a plain dictionary lookup per row, instead of
# Series.replace() scanning the whole substitution table.
df3["release_year"] = df3["release_date"].map(release_year).astype(float)
df3["release_day_sin"] = df3["release_date"].map(release_day_sin).astype(float)
df3["release_day_cos"] = df3["release_date"].map(release_day_cos).astype(float)
df3 = df3.drop(columns=["release_date"])
df3.head()

In [None]:
# One-hot encode the genres: create the indicator columns and parse each film's
# genre names.
genres_list = ["Action","Adventure","Fantasy","Science Fiction","Crime","Drama","Thriller","Animation","Family","Western","Comedy","Romance","Horror","Mystery","History","War","Music","Documentary","Foreign","TV Movie"]

# Start every indicator column at 0.
for genre in genres_list:
    df3[genre] = 0

genres = df3["genres"]
movie_genre_dict = dict()   # genre id -> genre name, for every genre seen
movie_genre_list = list()   # one list of genre names per film, in row order

for movie_genre in genres:
    # Parse the whole cell as JSON so that films with zero, one or many genres
    # all go through the same code path.
    # NOTE(review): assumes each cell is a JSON array of objects carrying "id"
    # and "name" keys - confirm against the data.
    genre_list = []
    for per_genre in json.loads(movie_genre):
        movie_genre_dict[per_genre["id"]] = per_genre["name"]
        genre_list.append(per_genre["name"])
    movie_genre_list.append(genre_list)

def put_genre(df, movie_genre_list):
    """Set the indicator column to 1 for every genre of every film, in place.

    `movie_genre_list` holds one iterable of genre names per row of `df`, in
    row order. Rows are addressed through `df.index`, so the pairing does not
    depend on the index being 0..n-1.
    """
    for row_label, row_genres in zip(df.index, movie_genre_list):
        for genre in row_genres:
            df.at[row_label, genre] = 1


# Fill the indicator columns once. A second definition of put_genre used to
# follow here; it read movie_genre_list[0] for every row and so stamped the
# first film's genres onto all films.
put_genre(df3, movie_genre_list)

# Finally drop the raw genre column, which is no longer needed.
df3 = df3.drop(columns=["genres"])

In [None]:
# Preview the first rows of df3 after the feature-engineering cells above.
df3.head()

In [None]:
# Separate the target (revenue) from the explanatory variables.
df3_y = df3["revenue"]
# Keep only numeric/boolean columns as features. Any column still holding raw
# text cannot be fed to the model as-is.
# NOTE(review): "production_companies" is never dropped or encoded in this
# notebook, so it is presumably the column removed here - confirm.
df3_x = df3.drop(columns=["revenue"]).select_dtypes(include=["number", "bool"])

In [None]:
# Split into training data (70%) and test data (30%) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df3_x,
    df3_y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Wrap the splits in LightGBM datasets.
# NOTE(review): the evaluation set is built from the test split, so anything
# tuned on it (such as early stopping) is tuned on the test data.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [None]:
# LightGBM parameters: gradient-boosted regression scored with RMSE.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    early_stopping_rounds=100,
    n_estimators=10000,
)

In [None]:
# Train the model, monitoring the evaluation dataset.
model = lgb.train(params, lgb_train, valid_sets=lgb_eval)

# Predict on the training data, then on the test data.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
# Scatter plot of true revenue against predicted revenue on the test split.
# NumPy reductions replace the element-by-element builtin min()/max().
x_test_min = min(np.min(y_test), np.min(y_test_pred))
x_test_max = max(np.max(y_test), np.max(y_test_pred))
x_test = np.linspace(x_test_min, x_test_max)

# Both axes share the same range, padded by 5% of the value span on each side.
margin = 0.05 * (x_test_max - x_test_min)
axis_limits = (x_test_min - margin, x_test_max + margin)

# Reference line where prediction equals the true value.
plt.plot(x_test, x_test, color="gray", label="y=x", linestyle="--")
# The test points themselves.
plt.scatter(y_test, y_test_pred, label='test_data', color="b")
plt.xlabel('revenue')
plt.ylabel('predicted_revenue')
plt.title('revenue_prediction')
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)
plt.grid()
plt.legend()

In [None]:
# Test-set error metrics.
mse = mean_squared_error(y_test, y_test_pred)  # MSE (mean squared error)
rmse = np.sqrt(mse)                            # RMSE = sqrt(MSE)
r2 = r2_score(y_test, y_test_pred)             # R2: coefficient of determination

print('RMSE :',rmse)
print('R2 :',r2)

In [None]:
# Split into training data (70%) and test data (30%) with a fixed seed.
# NOTE(review): this repeats the earlier split cell with identical arguments.
X_train, X_test, y_train, y_test = train_test_split(
    df3_x,
    df3_y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Wrap the splits in LightGBM datasets.
# NOTE(review): the evaluation set is built from the test split, so anything
# tuned on it (such as early stopping) is tuned on the test data.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [None]:
# LightGBM parameters: gradient-boosted regression scored with RMSE.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    early_stopping_rounds=100,
    n_estimators=10000,
)

In [None]:
# Train the model, monitoring the evaluation dataset.
model = lgb.train(params, lgb_train, valid_sets=lgb_eval)

# Predict on the training data, then on the test data.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
# Scatter plot of true revenue against predicted revenue on the test split.
# NumPy reductions replace the element-by-element builtin min()/max().
x_test_min = min(np.min(y_test), np.min(y_test_pred))
x_test_max = max(np.max(y_test), np.max(y_test_pred))
x_test = np.linspace(x_test_min, x_test_max)

# Both axes share the same range, padded by 5% of the value span on each side.
margin = 0.05 * (x_test_max - x_test_min)
axis_limits = (x_test_min - margin, x_test_max + margin)

# Reference line where prediction equals the true value.
plt.plot(x_test, x_test, color="gray", label="y=x", linestyle="--")
# The test points themselves.
plt.scatter(y_test, y_test_pred, label='test_data', color="b")
plt.xlabel('revenue')
plt.ylabel('predicted_revenue')
plt.title('revenue_prediction')
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)
plt.grid()
plt.legend()

In [None]:
# Test-set error metrics.
mse = mean_squared_error(y_test, y_test_pred)  # MSE (mean squared error)
rmse = np.sqrt(mse)                            # RMSE = sqrt(MSE)
r2 = r2_score(y_test, y_test_pred)             # R2: coefficient of determination

print('RMSE :',rmse)
print('R2 :',r2)

In [None]:
# Helper that plots feature importances as a horizontal bar chart.
def plot_feature_importance(df):
    """Draw one horizontal bar per row of `df`, in ascending order of importance.

    `df` must have a 'feature' column (used for the tick labels) and an
    'importance' column (used for the bar lengths). The chart is drawn on the
    current matplotlib figure; nothing is returned.
    """
    ordered = df.sort_values('importance')
    positions = np.arange(len(df))
    plt.barh(positions, ordered['importance'].values, align='center')
    plt.yticks(positions, ordered['feature'].values)  # label each bar with its feature
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

In [1]:
# Tabulate the trained model's feature importances, largest first.
f_importance = np.array(model.feature_importance())
f_importance = f_importance / np.sum(f_importance)  # normalise to sum to 1 (comment out if not wanted)
# Take the feature names from the model itself so they line up with the
# importances; `cols` is an imported module here, not a list of column names.
df_importance = pd.DataFrame({'feature': model.feature_name(), 'importance': f_importance})
df_importance = df_importance.sort_values('importance', ascending=False)
display(df_importance)

NameError: name 'np' is not defined

In [None]:
# Visualise the feature importances as a bar chart.
plot_feature_importance(df_importance)