In [76]:
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import optuna
import pickle
import json
import gspread
from google.oauth2.service_account import Credentials
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe

In [77]:
## Googleスプレッドシートからデータ取得し格納
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from gspread_dataframe import get_as_dataframe
scope =['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name('grspread_key_mugisake.json', scope)
client = gspread.authorize(creds)
sheet = client.open("Mugisake_KA01").worksheet('sheet2')
df = get_as_dataframe(sheet)

In [78]:
#前処理
if_null_dell_column = ["name","price","address","layout","age","area","traffic_tx"]
df = df.dropna(subset=if_null_dell_column,axis = 0)

def get_station_info(item):
  dict = {}   
  if re.findall('バス',item) == ['バス']:
    dict["station"] = item.split('「')[1].split('」')[0]
    dict["time_walk"] = item.split('」')[1].split('歩')[1].split('分')[0]
    dict["time_bus"] = item.split('」')[1].split('バス')[1].split('分')[0]
  elif re.findall('km',item) == ['km']:
    dict["station"] = item.split('「')[1].split('」')[0]
    dict["time_walk"] = 0
    dict["time_bus"] = float(item.split('」')[1].split('車')[1].split('km')[0])/20*60
  else:
    dict["station"] = item.split('「')[1].split('」')[0]
    dict["time_walk"] = item.split('」')[1].split('歩')[1].split('分')[0]
    dict["time_bus"] = 0
  return dict

df["train_station"] = df["traffic_tx"].apply(lambda x :get_station_info(x)["station"])
df["time_walk"] = df["traffic_tx"].apply(lambda x :get_station_info(x)["time_walk"]).astype(float)
df["time_bus"] = df["traffic_tx"].apply(lambda x :get_station_info(x)["time_bus"]).astype(float)
df["age"] = df["age"].str[:4].replace("-","0").astype(float)
df["reform"] = df["reform"].str[:4].replace("-","0").astype(float)

def cal_room_num(text,s=0.5):
    if text.find("ワンルーム") >= 0:
        if text.find("S") >= 0:
            return 1+s
        else:
            return 1
    if text.find("1K") >= 0:
        if text.find("S") >= 0:
            return 1+s
        else:
            return 1
    if text.find("S") >=0:
        return int(re.sub(r'[^0-9]','',text))+1+s

    if text.find("2S") >=0:
        return int(re.sub(r'[^0-9]','',text))+2+s

    if text.find("3S") >=0:
        return int(re.sub(r'[^0-9]','',text))+3+s

    else:
        return int(re.sub(r'[^0-9]','',text))+1

df["room_num"] = df["layout"].apply(lambda x :cal_room_num(x))
def land_price(text):
    if text.find("秋葉原")>=0:
        return 742.9
    if text.find("新御徒町")>=0:
        return 441
    if text.find("浅草")>=0:
        return 390.3
    if text.find("本所吾妻橋")>=0:
        return 90
    if text.find("蔵前")>=0:
        return 103
    if text.find("南千住")>=0:
        return 218.6
    if text.find("北千住")>=0:
        return 238.7
    if text.find("青井")>=0:
        return 164.4
    if text.find("六町")>=0:
        return 127.8
    if text.find("八潮")>=0:
        return 77.8
    if text.find("三郷中央")>=0:
        return 50.9
    if text.find("南流山")>=0:
        return 62.9
    if text.find("流山セントラルパーク")>=0:
        return 60.1
    if text.find("流山おおたかの森")>=0:
        return 81
    if text.find("柏の葉キャンパス")>=0:
        return 60.1
    if text.find("柏たなか")>=0:
        return 56.5
    if text.find("守谷")>=0:
        return 41.2
df["train_station_price"] = df["train_station"].apply(lambda x: land_price(x)).astype(float)

def direction_price(text):
    if text.find("南")>=0:
        return 1
    if text.find("東")>=0:
        return 0.95
    if text.find("西")>=0:
        return 0.95
    if text.find("北")>=0:
        return 0.90
df["direction_price"] = df["direction"].apply(lambda x: direction_price(x)).astype(float)

#不要列の削除
drop_column = ['Unnamed: 19','Unnamed: 20','Unnamed: 21','Unnamed: 22','Unnamed: 23','Unnamed: 24','Unnamed: 25']
df = df.drop(drop_column,axis = 1)

In [79]:
#ランダムフォレストのモデル作成＆評価
x = df.loc[:,['price_kanri','price_tsumitate','age','floor','time_walk','time_bus','room_num','train_station_price','direction_price']]
y = df['price']
x_train, x_test, y_train, y_test = train_test_split(x,y)
model_random = RandomForestRegressor(n_estimators = 80, random_state = 0)
model_random.fit(x_train, y_train)
y_pred = model_random.predict(x_test)
print(model_random.score(x_train, y_train))
print(model_random.score(x_test, y_test))
mse = mean_squared_error(y_test, y_pred)
print(f'Test MSE: {mse}')

0.963073570158355
0.8371898444224384
Test MSE: 46550483629033.46


In [80]:
#optunaでランダムフォレストのハイパーパラメータ調整
def objective(trial):
    # ハイパーパラメータの設定
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)

    #ランダムフォレストモデル
    model_random_optuna = RandomForestRegressor(n_estimators = 80, random_state = 0)

    # モデルの学習
    model_random_optuna.fit(x_train, y_train)

    # テストデータでの予測
    y_pred_optuna = model_random_optuna.predict(x_test)

    # 平均二乗誤差の計算
    mse = mean_squared_error(y_test, y_pred_optuna)
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[32m[I 2023-03-04 11:05:46,659][0m A new study created in memory with name: no-name-3489c532-1f06-4d97-b512-f009d56a006a[0m
[32m[I 2023-03-04 11:05:46,978][0m Trial 0 finished with value: 46550483629033.46 and parameters: {'n_estimators': 155, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:05:47,258][0m Trial 1 finished with value: 46550483629033.46 and parameters: {'n_estimators': 104, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:05:47,541][0m Trial 2 finished with value: 46550483629033.46 and parameters: {'n_estimators': 122, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:05:47,806][0m Trial 3 finished with value: 46550483629033.46 and parameters: {'n_estimators': 215, 'max_depth': 9, 'min_samples_split': 6,

[32m[I 2023-03-04 11:05:56,366][0m Trial 34 finished with value: 46550483629033.46 and parameters: {'n_estimators': 139, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:05:56,716][0m Trial 35 finished with value: 46550483629033.46 and parameters: {'n_estimators': 119, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:05:56,997][0m Trial 36 finished with value: 46550483629033.46 and parameters: {'n_estimators': 147, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:05:57,343][0m Trial 37 finished with value: 46550483629033.46 and parameters: {'n_estimators': 189, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 5}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:05:57,617][0m Trial 38 fi

[32m[I 2023-03-04 11:06:06,389][0m Trial 69 finished with value: 46550483629033.46 and parameters: {'n_estimators': 251, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:06:06,656][0m Trial 70 finished with value: 46550483629033.46 and parameters: {'n_estimators': 274, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:06:06,925][0m Trial 71 finished with value: 46550483629033.46 and parameters: {'n_estimators': 65, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:06:07,196][0m Trial 72 finished with value: 46550483629033.46 and parameters: {'n_estimators': 91, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 0 with value: 46550483629033.46.[0m
[32m[I 2023-03-04 11:06:07,465][0m Trial 73 fin

In [81]:
best_params = study.best_params
print(f'Best parameters: {best_params}')

Best parameters: {'n_estimators': 155, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 4}


In [82]:
#optunaによるベストパラメータでのモデル作成＆評価
model_random_optuna = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                            max_depth=best_params['max_depth'],
                            min_samples_split=best_params['min_samples_split'],
                            min_samples_leaf=best_params['min_samples_leaf'],
                            random_state=42)
model_random_optuna.fit(x_train, y_train)
y_pred_optuna = rf.predict(x_test)
mse = mean_squared_error(y_test, y_pred_optuna)
print(model_random_optuna.score(x_train, y_train))
print(model_random_optuna.score(x_test, y_test))
print(f'Test MSE: {mse}')

0.7675662197491881
0.7523502665440851
Test MSE: 22531300787875.215


In [83]:
pickle.dump(model_random_optuna,open('SUUMO_random_forest_regressor_byoptuna_model.pkl','wb'))