In [5]:
import math

import pandas as pd
import numpy as np
import json
import ast
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
import matplotlib.pyplot as plt # グラフ描画用
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score # モデル評価用(決定係数)

In [6]:
# 今回使う説明変数を決める
x_col = ["genres","original_language","production_companies","release_date","runtime","vote_average"]
df = pd.read_csv("movie/tmdb_5000_movies.csv")
df_x = df[x_col]

In [7]:
df_x.head()

Unnamed: 0,genres,original_language,production_companies,release_date,runtime,vote_average
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",2009-12-10,162.0,7.2
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",en,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",2007-05-19,169.0,6.9
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",en,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",2015-10-26,148.0,6.3
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",en,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",2012-07-16,165.0,7.6
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",en,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",2012-03-07,132.0,6.1


In [8]:
#ではここから全てをfloat型に変えていく
genres = df_x["genres"]
# とりあえず種類が欲しいので種類を確認
# まずは辞書を作成
movie_genre_dict = dict()
movie_genre_list = list()

for movie_genre in genres:
    if movie_genre == "[]":
        movie_genre_list.append("")
    else:
        per_genre_list = ast.literal_eval(movie_genre[1:-1])
        genre_list = []
        for per_genre in per_genre_list:
            if type(per_genre) == dict:
                movie_id = per_genre["id"]
                movie_name = per_genre["name"]
                movie_genre_dict[movie_id] = movie_name
                genre_list.append(movie_name)
            else:
                movie_id = per_genre_list["id"]
                movie_name = per_genre_list["name"]
                movie_genre_dict[movie_id] = movie_name
                genre_list.append(movie_name)
                break
        movie_genre_list.append(genre_list)
        genre_list = []

In [9]:
print(len(genres))
print(len(movie_genre_list))

4803
4803


In [10]:
# 20種類のジャンルがあることがわかった
# ここからワンホットエンコーディングを行う

In [11]:
genres_list = ["Action","Adventure","Fantasy","Science Fiction","Crime","Drama","Thriller","Animation","Family","Western","Comedy","Romance","Horror","Mystery","History","War","Music","Documentary","Foreign","TV Movie"]

In [12]:
a = [0]*4803*20
df_genre = pd.DataFrame(np.array(a).reshape(4803, 20),
                  columns=genres_list)
df_genre.head()

Unnamed: 0,Action,Adventure,Fantasy,Science Fiction,Crime,Drama,Thriller,Animation,Family,Western,Comedy,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV Movie
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
movie_genre_list

[['Action', 'Adventure', 'Fantasy', 'Science Fiction'],
 ['Adventure', 'Fantasy', 'Action'],
 ['Action', 'Adventure', 'Crime'],
 ['Action', 'Crime', 'Drama', 'Thriller'],
 ['Action', 'Adventure', 'Science Fiction'],
 ['Fantasy', 'Action', 'Adventure'],
 ['Animation', 'Family'],
 ['Action', 'Adventure', 'Science Fiction'],
 ['Adventure', 'Fantasy', 'Family'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Adventure', 'Fantasy', 'Action', 'Science Fiction'],
 ['Adventure', 'Action', 'Thriller', 'Crime'],
 ['Adventure', 'Fantasy', 'Action'],
 ['Action', 'Adventure', 'Western'],
 ['Action', 'Adventure', 'Fantasy', 'Science Fiction'],
 ['Adventure', 'Family', 'Fantasy'],
 ['Science Fiction', 'Action', 'Adventure'],
 ['Adventure', 'Action', 'Fantasy'],
 ['Action', 'Comedy', 'Science Fiction'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Adventure'],
 ['Adventure', 'Fantasy'],
 ['Adventure', 'Fantasy'],
 ['Adventure', 'Drama', 'Action'],
 ['Drama', 'Romance

In [14]:
# df.at[0,"Action"]とすると任意の値を取得できる
def put_genre(df,movie_genre_list):
    for i in range(len(df)):
        for genre in movie_genre_list[i]:
            df.at[i,genre] = 1.0
put_genre(df_genre,movie_genre_list)

In [15]:
df_genre.tail()

Unnamed: 0,Action,Adventure,Fantasy,Science Fiction,Crime,Drama,Thriller,Animation,Family,Western,Comedy,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV Movie
4798,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4799,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
4800,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1
4801,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [16]:
language = df["original_language"]
language.value_counts()

en    4505
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
cn      12
ru      11
ko      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: original_language, dtype: int64

In [17]:
# 映画が造られた国をラベルエンコーディング
le = LabelEncoder()

In [18]:
df_x["original_language"] = le.fit_transform(df_x["original_language"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x["original_language"] = le.fit_transform(df_x["original_language"])


In [19]:
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres                4803 non-null   object 
 1   original_language     4803 non-null   int64  
 2   production_companies  4803 non-null   object 
 3   release_date          4802 non-null   object 
 4   runtime               4801 non-null   float64
 5   vote_average          4803 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 225.3+ KB


In [20]:
df_x.drop(columns="genres")
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres                4803 non-null   object 
 1   original_language     4803 non-null   int64  
 2   production_companies  4803 non-null   object 
 3   release_date          4802 non-null   object 
 4   runtime               4801 non-null   float64
 5   vote_average          4803 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 225.3+ KB


In [21]:
# 次はリリース日を数値型へと変換していく
# また今回は月と年数に分ける
df["release_date"].value_counts()

2006-01-01    10
2002-01-01     8
2004-09-03     7
1999-10-22     7
2013-07-18     7
              ..
2002-12-30     1
2002-08-20     1
1987-11-05     1
2004-11-11     1
2012-05-03     1
Name: release_date, Length: 3280, dtype: int64

In [22]:
# 変換するための辞書を作成していく
release_year = dict()
release_month = dict()
for key in df["release_date"]:
    if type(key) == str:
        year,month,day = key.split("-")
        release_year[key] = float(year)
        release_month[key] = float(month)

In [23]:
df_x["release_year"] = df_x["release_date"].replace(release_year).astype(float)
df_x["release_month"] = df_x["release_date"].replace(release_month).astype(float)
df_x.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x["release_year"] = df_x["release_date"].replace(release_year).astype(float)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres                4803 non-null   object 
 1   original_language     4803 non-null   int64  
 2   production_companies  4803 non-null   object 
 3   release_date          4802 non-null   object 
 4   runtime               4801 non-null   float64
 5   vote_average          4803 non-null   float64
 6   release_year          4802 non-null   float64
 7   release_month         4802 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 300.3+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x["release_month"] = df_x["release_date"].replace(release_month).astype(float)


In [27]:
#0.0という評価値を弾く
reviews = df_x["vote_average"]
review_dict = dict()
for review in reviews:
    if review <= 2:
        review_dict[review] = np.nan
    else:
        review_point = round(review, 1)
        review_dict[review] = review

# 上記の数値を埋め込む
df_x["vote_average"] = df_x["vote_average"].replace(review_dict).astype(float)
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres                4803 non-null   object 
 1   original_language     4803 non-null   int64  
 2   production_companies  4803 non-null   object 
 3   release_date          4802 non-null   object 
 4   runtime               4801 non-null   float64
 5   vote_average          4740 non-null   float64
 6   release_year          4802 non-null   float64
 7   release_month         4802 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 300.3+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x["vote_average"] = df_x["vote_average"].replace(review_dict).astype(float)


In [None]:
# 制作会社をラベルエンコーディング
production_companies = df_x["production_companies"]

In [None]:
# まずは空のものがいくつあるのかを確認
empty_list = list()
companies_dict = dict()
companies_list = list()
for i,companies in enumerate(production_companies):
    if companies == "[]":
        empty_list.append(i)
        companies_list.append("")
    else:
        companies = ast.literal_eval(companies[1:-1])
        append_list = []
        for j,company in enumerate(companies):
            if type(company) == dict:
                name = company["name"]
                if name in companies_dict:
                    companies_dict[name] += 1
                else:
                    companies_dict[name] = 1
                append_list.append(name)
            else:
                name = companies["name"]
                if name in companies_dict:
                    companies_dict[name] += 1
                else:
                    companies_dict[name] = 1
                append_list.append(name)
    companies_list.append(append_list)
    append_list = []
companies_list

In [None]:
production_companies[0]

In [None]:
# 流石に会社が多すぎるため,1つのものは消していきたい
new_companies_dict = dict()
new_companies_list = list()
for key,value in companies_dict.items():
    if value <= 50:
        continue
    else:
        new_companies_dict[key] = value
        new_companies_list.append(key)
len(new_companies_dict)

In [None]:
new_companies_dict

In [None]:
# では配給会社をもとにデータフレームを構築する
a = [0]*4803*25
df_companies = pd.DataFrame(np.array(a).reshape(4803, 25),
                  columns=new_companies_list)
df_companies.head()

In [None]:
# 作成したデータフレームに値を格納していく
def put_companies(df,companies_list):
    for i in range(len(df)):
        for company in companies_list[i]:
            if company in df.columns.values:
                df.at[i,company] = 1.0
put_companies(df_companies,companies_list)
df_companies

In [None]:
df_x.info()

In [None]:
df_genre.info()

In [None]:
df_companies.info()

In [None]:
df_x = df_x.drop(columns=["genres","production_companies","release_date",])

In [None]:
df_x.info()

In [None]:
df_x = pd.concat([df_x,df_companies,df_genre],axis=1)

In [None]:
df_x.info()

In [None]:
# まずは欠損値があるものを削除していく(今回はもう欠損値処理せずに消す)
df_x = df_x.dropna(axis=0,how="any")
df_x.info()

In [None]:
df_x["vote_average"].nunique()

In [None]:
# とりあえずテストデータと検証データに分ける

# 説明変数,目的変数
X = df_x.drop("vote_average",axis=1).values # 説明変数(target以外の特徴量)
y = df_x["vote_average"].values # 目的変数(target)

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=2)


lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# LightGBM parameters
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression', # 目的 : 回帰
        'metric': "mae", # 評価指標 : 誤り率(= 1-正答率)
}

In [1]:
# モデルの学習
model = lgb.train(params,
train_set=lgb_train, # トレーニングデータの指定
valid_sets=lgb_eval, # 検証データの指定
)

# テストデータの予測 ((各クラスの予測確率 [クラス0の予測確率,クラス1の予測確率,クラス2の予測確率] を返す))
y_pred = model.predict(X_test) #目的変数の予測値が帰ってくる

NameError: name 'lgb' is not defined

In [None]:
# テストデータにモデルを取り入れる

# 真値と予測値の表示
df_pred = pd.DataFrame({'CRIM':y_test,'CRIM_pred':y_pred})
display(df_pred)

# 散布図を描画(真値 vs 予測値)
plt.plot(y_test, y_test, color = 'red', label = 'x=y') # 直線y = x (真値と予測値が同じ場合は直線状に点がプロットされる)
plt.scatter(y_test, y_pred) # 散布図のプロット
plt.xlabel('y') # x軸ラベル
plt.ylabel('y_test') # y軸ラベル
plt.title('y vs y_pred') # グラフタイトル

In [None]:
# モデル評価
# rmse : 平均二乗誤差の平方根
mse = mean_squared_error(y_test, y_pred) # MSE(平均二乗誤差)の算出
rmse = np.sqrt(mse) # RSME = √MSEの算出
print('RMSE :',rmse)

#r2 : 決定係数
r2 = r2_score(y_test,y_pred)
print('R2 :',r2)