In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the read_csv calls further down use relative paths
# ('drive/My Drive/...'), so they presumably rely on the working directory
# being /content - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re

In [None]:

# Load the training data: the genba and goto tables, outer-joined on pj_no.
df = pd.read_csv('drive/My Drive/signate/ichd/dataset/train_genba.tsv', sep='\t').merge(
    pd.read_csv('drive/My Drive/signate/ichd/dataset/train_goto.tsv', sep='\t'),
    on='pj_no',
    how='outer',
)

# Load the evaluation data the same way.
dftest = pd.read_csv('drive/My Drive/signate/ichd/dataset/test_genba.tsv', sep='\t').merge(
    pd.read_csv('drive/My Drive/signate/ichd/dataset/test_goto.tsv', sep='\t'),
    on='pj_no',
    how='outer',
)

# Keep the evaluation ids; they are re-attached to the predictions for submission.
dftest_id = dftest['id']

# Contract price (keiyaku_pr): set the column aside, then remove it from the
# training features.
df_keiyaku_pr = df['keiyaku_pr']
df = df.drop(columns=['keiyaku_pr'])

# Address (jukyo): split every address into prefecture / municipality / rest.
#
# The pattern names the four special prefectures explicitly and otherwise
# accepts two or three characters followed by 県.  A greedy `.*[都道府県]`
# would swallow later occurrences of those characters (for example
# 神奈川県横浜市都筑区 or 京都府京都市), and `|` inside a character class is a
# literal pipe, not an alternation.
_JUKYO_PATTERN = re.compile(
    r'(東京都|北海道|(?:京都|大阪)府|.{2,3}?県)'  # prefecture
    r'(.*?[市区町村])'                            # up to the first 市/区/町/村
    r'(.*)'                                       # remainder of the address
)
_JUKYO_COLUMNS = ['県', '市区町村', '場所']


def _split_jukyo(addresses):
    """Return a frame with one parsed row per address, aligned to the input.

    Parameters
    ----------
    addresses : pandas.Series
        Raw address strings; missing (non-string) entries are allowed.

    Returns
    -------
    pandas.DataFrame
        Columns '県', '市区町村', '場所' on the same index as `addresses`.
        Addresses that are missing or do not match the pattern produce a row
        of None values instead of being skipped, so the result can be
        concatenated column-wise without shifting the remaining rows.
    """
    rows = []
    for place in addresses:
        m = _JUKYO_PATTERN.match(place) if isinstance(place, str) else None
        rows.append(m.groups() if m else (None, None, None))
    return pd.DataFrame(rows, columns=_JUKYO_COLUMNS, index=addresses.index)


# Training data
df_new = _split_jukyo(df['jukyo'])
df = pd.concat(objs=[df.drop('jukyo', axis=1), df_new], axis=1)

# Evaluation data
dftest_new = _split_jukyo(dftest['jukyo'])
dftest = pd.concat(objs=[dftest.drop('jukyo', axis=1), dftest_new], axis=1)

# Zoning (yoto1 / yoto2): turn the two label columns into one 0/1 column per
# zoning label.
yoto_columns = ['yoto1', 'yoto2']
yoto_labels = ['第一種低層住居専用地域', '第二種低層住居専用地域', '第一種中高層住居専用地域',
               '第二種中高層住居専用地域', '第一種住居地域', '第二種住居地域', '準住居地域',
               '田園住居地域', '近隣商業地域', '商業地域', '準工業地域', '工業地域',
               '工業専用地域', '指定のない区域']

# Pull the raw columns out of both frames; missing entries become 0 so they
# never equal a label.
df_yoto = df[yoto_columns].fillna(value=0)
dftest_yoto = dftest[yoto_columns].fillna(value=0)
df = df.drop(yoto_columns, axis=1)
dftest = dftest.drop(yoto_columns, axis=1)

# A flag is 1 when the label appears in any of the row's zoning columns.
df_yoto_result = pd.DataFrame(
    {label: df_yoto.eq(label).any(axis=1).astype(int) for label in yoto_labels},
    index=df_yoto.index,
)
dftest_yoto_result = pd.DataFrame(
    {label: dftest_yoto.eq(label).any(axis=1).astype(int) for label in yoto_labels},
    index=dftest_yoto.index,
)

df = pd.concat(objs=[df, df_yoto_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_yoto_result], axis=1)

del df_yoto, dftest_yoto, df_yoto_result, dftest_yoto_result

# City-planning area (toshikuiki1 / toshikuiki2): one 0/1 column per label.
toshikuiki_columns = ['toshikuiki1', 'toshikuiki2']
toshikuiki_labels = ['市街化区域', '市街化調整区域', '非線引き区域', '準都市計画区域', '都市計画区域外']

# Pull the raw columns out of both frames; missing entries become 0 so they
# never equal a label.
df_toshikuiki = df[toshikuiki_columns].fillna(value=0)
dftest_toshikuiki = dftest[toshikuiki_columns].fillna(value=0)
df = df.drop(toshikuiki_columns, axis=1)
dftest = dftest.drop(toshikuiki_columns, axis=1)

# A flag is 1 when the label appears in any of the row's area columns.
df_toshikuiki_result = pd.DataFrame(
    {label: df_toshikuiki.eq(label).any(axis=1).astype(int) for label in toshikuiki_labels},
    index=df_toshikuiki.index,
)
dftest_toshikuiki_result = pd.DataFrame(
    {label: dftest_toshikuiki.eq(label).any(axis=1).astype(int) for label in toshikuiki_labels},
    index=dftest_toshikuiki.index,
)

df = pd.concat(objs=[df, df_toshikuiki_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_toshikuiki_result], axis=1)

del df_toshikuiki, dftest_toshikuiki, df_toshikuiki_result, dftest_toshikuiki_result

# Other regulations (hokakisei1..hokakisei4): one 0/1 column per label that
# occurs anywhere in the training or evaluation data.
hokakisei_columns = ['hokakisei1', 'hokakisei2', 'hokakisei3', 'hokakisei4']

# Pull the raw columns out of both frames; missing entries become 0 so they
# never equal a label.
df_hokakisei = df[hokakisei_columns].fillna(value=0)
dftest_hokakisei = dftest[hokakisei_columns].fillna(value=0)
df = df.drop(hokakisei_columns, axis=1)
dftest = dftest.drop(hokakisei_columns, axis=1)

# Collect the label vocabulary from both frames.  Rows are stacked (axis=0) so
# no NaN padding is introduced, only real strings are kept (this drops the 0
# fill value), and the result is sorted so the generated column order is the
# same on every run - iterating a bare set of strings is not.
df_all_hokakisei = pd.concat(objs=[df_hokakisei, dftest_hokakisei], axis=0)
hokakisei_list = sorted(
    {value for value in df_all_hokakisei.values.flatten() if isinstance(value, str)}
)

# A flag is 1 when the label appears in any of the row's regulation columns.
df_hokakisei_result = pd.DataFrame(
    {label: df_hokakisei.eq(label).any(axis=1).astype(int) for label in hokakisei_list},
    index=df_hokakisei.index,
)
dftest_hokakisei_result = pd.DataFrame(
    {label: dftest_hokakisei.eq(label).any(axis=1).astype(int) for label in hokakisei_list},
    index=dftest_hokakisei.index,
)

df = pd.concat(objs=[df, df_hokakisei_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_hokakisei_result], axis=1)

# Floors / layout (levelplan): derive 0/1 flags from substrings of the
# levelplan text, identically for the training and evaluation frames.
#
# Each pair is (new column name, literal substring to look for).  Matching is
# done with regex=False, so '+S' needs no escaping (the former '\+S' was an
# invalid escape sequence in a non-raw string).  na=False makes a missing
# levelplan yield 0 instead of NaN, which keeps the flag columns numeric
# rather than object dtype.
levelplan_flags = [
    ('1F', '1F'), ('2F', '2F'), ('3F', '3F'),
    ('1LDK', '1LDK'), ('2LDK', '2LDK'), ('3LDK', '3LDK'),
    ('4LDK', '4LDK'), ('5LDK', '5LDK'), ('4DK', '4DK'),
    ('S', '+S'), ('2S', '2S'),
]

for frame in (df, dftest):
    for flag_name, token in levelplan_flags:
        frame[flag_name] = (
            frame['levelplan'].str.contains(token, regex=False, na=False).astype(int)
        )

df = df.drop('levelplan', axis=1)
dftest = dftest.drop('levelplan', axis=1)

# Individual factors (kobetsu1..kobetsu4): one 0/1 column per factor label.
kobetsu_columns = ['kobetsu1', 'kobetsu2', 'kobetsu3', 'kobetsu4']
kobetsu_labels = ['高圧線下', '信号近い', '信号前', '横断歩道前', '踏切付近', 'ごみ置き場前',
                  '心理的瑕疵あり', '計画道路', '地役権有', '敷延2ｍ絞りあり', '宅内高低差あり',
                  '嫌悪施設隣接', 'アパート南隣', '街道沿い', '交通量多い', '裏道', '行き止まり',
                  '行き止まり途中', '車進入困難', '前面道が坂途中', '眺望良', '床暖房付',
                  'エネファーム付', '角地', '二方路', '三方路']

# Pull the raw columns out of both frames; missing entries become 0 so they
# never equal a label.
df_kobetsu = df[kobetsu_columns].fillna(value=0)
dftest_kobetsu = dftest[kobetsu_columns].fillna(value=0)
df = df.drop(kobetsu_columns, axis=1)
dftest = dftest.drop(kobetsu_columns, axis=1)

# A flag is 1 when the label appears in any of the row's factor columns.
df_kobetsu_result = pd.DataFrame(
    {label: df_kobetsu.eq(label).any(axis=1).astype(int) for label in kobetsu_labels},
    index=df_kobetsu.index,
)
dftest_kobetsu_result = pd.DataFrame(
    {label: dftest_kobetsu.eq(label).any(axis=1).astype(int) for label in kobetsu_labels},
    index=dftest_kobetsu.index,
)

df = pd.concat(objs=[df, df_kobetsu_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_kobetsu_result], axis=1)

del df_kobetsu, dftest_kobetsu, df_kobetsu_result, dftest_kobetsu_result

# Sanity check: both frames should end up with the same number of columns.
print(df.shape)
print(dftest.shape)

(6461, 236)
(4273, 236)


In [None]:
# Convert text to integer codes (columns that hold only two distinct values).
from sklearn.preprocessing import LabelEncoder

from_two = ['kaoku_um', 'yheki_umu', 'yheki_yohi', 'kborjs', 'chikukeikaku', 'keikakuroad', 'kaihatsukyoka', 't53kyoka', 'hokakyoka', 'fi4m_yohi', 'fi3m_yohi', 'bus_yohi', 'sho_conv', 'sho_super', 'sho_shoten', 'sho_market', 'shu_jutaku', 'shu_park', 'shu_shop', 'shu_factory', 'shu_hvline', 'shu_tower', 'shu_bochi', 'shu_sogi', 'shu_zoki', 'shu_kokyo', 'shu_highway', 'shu_kaido', 'shu_line_ari', 'shu_line_nashi', 'shu_soon', 'rs_e_kdate2', 'rs_e_kdate3', 'rs_e_parking', 'rs_e_zoki', 'rs_e_m_ari', 'rs_e_m_nashi', 'rs_e_tahata', 'rs_w_kdate2', 'rs_w_kdate3', 'rs_w_parking', 'rs_w_zoki', 'rs_w_m_ari', 'rs_w_m_nashi', 'rs_w_tahata', 'rs_s_kdate2', 'rs_s_kdate3', 'rs_s_parking', 'rs_s_zoki', 'rs_s_m_ari', 'rs_s_m_nashi', 'rs_s_tahata', 'rs_n_kdate2', 'rs_n_kdate3', 'rs_n_parking', 'rs_n_zoki', 'rs_n_m_ari', 'rs_n_m_nashi', 'rs_n_tahata']

for col in from_two:
    lbl = LabelEncoder()
    # Fit once on the values of BOTH frames, then only transform.  Re-fitting
    # on the evaluation column could assign a different integer to the same
    # value whenever the two columns do not contain the same set of values.
    lbl.fit(list(df[col].values) + list(dftest[col].values))
    df[col] = lbl.transform(list(df[col].values))
    dftest[col] = lbl.transform(list(dftest[col].values))

print(df.shape)
print(dftest.shape)

(6461, 236)
(4273, 236)


In [None]:
# One-hot encoding of every object-dtype column of the training frame.
# (The loop variable is no longer called `type`, which shadowed the builtin.)
onehot = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == 'object']

# Name the columns to encode explicitly so both frames are treated alike.
# Passing the list as `prefix` made get_dummies pick the columns by dtype in
# each frame separately and raise when the evaluation frame's set of
# encodable columns had a different length.  The generated names are
# unchanged: the default prefix is the source column name.
df = pd.get_dummies(df, columns=onehot)
dftest = pd.get_dummies(dftest, columns=onehot)

print(df.shape)
print(dftest.shape)

(6461, 8455)
(4273, 5992)


In [None]:
# Convert any remaining object-dtype columns of the training frame to integer codes.
from sklearn.preprocessing import LabelEncoder

cat_features = []
onehot = []

# Both lists receive the same names; `onehot` is read again by a later cell.
for dtype, name in zip(df.dtypes, df.columns):
    if dtype == 'object':
        cat_features.append(name)
        onehot.append(name)

for col in cat_features:
    lbl = LabelEncoder()
    # Fit once on the values of BOTH frames, then only transform, so a given
    # value gets the same integer code in the training and evaluation data.
    lbl.fit(list(df[col].values) + list(dftest[col].values))
    df[col] = lbl.transform(list(df[col].values))
    dftest[col] = lbl.transform(list(dftest[col].values))

print(df.shape)
print(dftest.shape)

(6461, 8455)
(4273, 5992)


In [None]:
# Fill missing values with the per-column mean of the TRAINING data.
# The means are computed once and reused for both frames, so the evaluation
# data is imputed with exactly the statistics used for training and the
# full-frame mean is not recomputed.  Columns that exist only in dftest have
# no training mean and are left untouched here.
train_means = df.mean()
df = df.fillna(train_means)
dftest = dftest.fillna(train_means)

print(df.shape)
print(dftest.shape)

(6461, 8455)
(4273, 5992)


In [None]:
# Drop every column that still contains a missing value in either frame.
# The two frames do not share the same columns at this point, so the columns
# are selected with boolean masks: a name that exists in only one frame is
# simply absent from the other instead of raising KeyError.
null_columns = set(df.columns[df.isnull().any().values])
null_columns |= set(dftest.columns[dftest.isnull().any().values])

train_mask = df.columns.isin(null_columns)
test_mask = dftest.columns.isin(null_columns)

# Keep the removed columns around for inspection.
df_dropped = df.loc[:, train_mask]
dftest_dropped = dftest.loc[:, test_mask]

df = df.loc[:, ~train_mask]
dftest = dftest.loc[:, ~test_mask]

print(df.shape)
print(dftest.shape)

(6461, 8455)
(4273, 5992)


In [None]:
# Keep only the columns present in both df and dftest.
# Index.intersection is the explicit set operation; `df.columns & dftest.columns`
# was deprecated as a set operation and no longer means intersection in
# pandas 2.x.
common_col = df.columns.intersection(dftest.columns)
print(len(common_col))
df = df[common_col]
dftest = dftest[common_col]

print(df.shape)
print(dftest.shape)

1280
(6461, 1280)
(4273, 1280)


In [None]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing, linear_model, svm
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Model: LightGBM regressor tuned by an exhaustive grid search with 5-fold CV.
# Alternatives tried earlier and left disabled: xgb.XGBRegressor(),
# RandomForestRegressor(), lgb.LGBMRegressor() with defaults, and a hand-set
# lgb.LGBMRegressor(learning_rate=0.01, max_depth=17, n_estimators=50000).
mod = lgb.LGBMRegressor()

parameters = {'max_depth': [15, 16, 17, 18, 19, 20],
              'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
              'n_estimators': [100, 1000, 5000, 10000, 50000],}

model = GridSearchCV(mod,
                     parameters,
                     cv=5)

# Explanatory variables and target as NumPy arrays.
# DataFrame.as_matrix() / Series.as_matrix() were removed in pandas 1.0;
# `.values` returns the same array and exists in every pandas version.
X = df.values
Y = df_keiyaku_pr.values

# Standardisation (StandardScaler) was tried here and is disabled.

# Fit every parameter combination (150 candidates x 5 folds).
model.fit(X, Y)

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)



In [None]:
# Predict on the evaluation data.
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` is equivalent.
XX = dftest.values

YY = model.predict(XX)

# Convert the predictions to integers for submission.  Round FIRST, then
# cast: casting first truncated every value toward zero and turned the
# following round() into a no-op.
df_pre = pd.DataFrame(YY).round().astype(int)

# Attach the ids saved from the evaluation frame.
df_pre = pd.concat(objs=[dftest_id, df_pre], axis=1)
df_pre.head()

In [None]:
# Write the predictions to a tab-separated file, without the index and
# without a header row.
df_pre.to_csv('test_pre_1.tsv', sep='\t', index=False, header=False)

In [None]:
# Second submission variant: cut each prediction down to a multiple of
# 100,000 (divide, cast to int, multiply back), then re-attach the ids.
truncated_pre = (df_pre.drop('id', axis=1) / 100000).astype(int) * 100000
df_pre = pd.concat(objs=[dftest_id, truncated_pre], axis=1)
df_pre.head()

In [None]:
# Write the current df_pre to a second tab-separated file, without the index
# and without a header row.
df_pre.to_csv('test_pre_2.tsv', sep='\t', index=False, header=False)

In [None]:
# Print model.best_estimator_.
# NOTE(review): this assumes `model` is still the fitted GridSearchCV object;
# a later cell rebinds `model` to a plain LGBMRegressor - confirm cell order.
print(model.best_estimator_)

In [None]:
# 終了時に音
import numpy as np
import IPython as IP
fs = 48000        # sample rate passed to the audio player
duration = 3.0    # upper bound of the time axis


def play_sine(f):
    """Display an autoplaying audio widget holding a sine wave of frequency `f`.

    The waveform is sampled at int(fs * duration) evenly spaced points between
    0 and `duration` (both module-level settings, read at call time).
    """
    sample_count = int(fs * duration)
    timeline = np.linspace(0., duration, sample_count)
    waveform = np.sin(f * (2. * np.pi) * timeline)
    player = IP.display.Audio(waveform, rate=fs, autoplay=True)
    IP.display.display(player)


play_sine(440)

In [None]:
# 標準化
#from sklearn.preprocessing import StandardScaler

#num_features = df.columns

#for col in num_features:
#  scaler = StandardScaler()
#  df[col] = scaler.fit_transform(np.array(df[col].values).reshape(-1, 1))
#  dftest[col] = scaler.fit_transform(np.array(dftest[col].values).reshape(-1, 1))
  
#print(df.shape)
#print(dftest.shape)

# 標準化
#sc = StandardScaler()
#sc.fit(X_train)
#X_train_std = sc.transform(X_train)
#X_test_std = sc.transform(X_test)

In [None]:
# ステップワイズ法
#import statsmodels.api as sm

#model = sm.OLS(df_keiyaku_pr, sm.add_constant(df))
#result = model.fit()

#for name, num in zip(df.columns, result.pvalues):
#  if num > 0.05:
#    df = df.drop(name, axis=1)
#    dftest = dftest.drop(name, axis=1)
    
#print(df.shape)
#print(dftest.shape)

In [None]:
# Dummy-encode the columns listed in `onehot`, plus 'jukyo', wherever they are
# still present.  Each frame is checked separately: earlier cells already
# drop or encode these columns, and an unconditional df['jukyo'] lookup
# raised KeyError on a fresh top-to-bottom run.
for name in list(onehot) + ['jukyo']:
    if name in df.columns:
        df_dummy = pd.get_dummies(df[name])
        df = pd.concat(objs=[df.drop(name, axis=1), df_dummy], axis=1)
    if name in dftest.columns:
        dftest_dummy = pd.get_dummies(dftest[name])
        dftest = pd.concat(objs=[dftest.drop(name, axis=1), dftest_dummy], axis=1)

df.head()

In [None]:
# 学習用回数,プラン(df_levelplan)データの前処理
#df_levelplan = pd.DataFrame(df['levelplan'])

#df_levelplan_result = pd.DataFrame()

#for elem in ['土地売り','1F/1LDK','1F/1LDK+S','1F/1LDK+2S','1F/2LDK','1F/2LDK+S','1F/2LDK+2S','1F/3DK','1F/3LDK','1F/3LDK+S','1F/3LDK+2S','1F/4DK','1F/4LDK','1F/4LDK+S','1F/4LDK+2S','1F/5DK','1F/5LDK','1F/5LDK+S','1F/5LDK+2S','2F/1LDK+S','2F/1LDK+2S','2F/2LDK','2F/2LDK+S','2F/2LDK+2S','2F/3DK','2F/3LDK','2F/3LDK+S','2F/3LDK+2S','2F/4DK','2F/4LDK','2F/4LDK+S','2F/4LDK+2S','2F/5DK','2F/5LDK','2F/5LDK+S','2F/5LDK+2S','3F/1LDK+S','3F/1LDK+2S','3F/2LDK','3F/2LDK+S','3F/2LDK+2S','3F/3DK','3F/3LDK','3F/3LDK+S','3F/3LDK+2S','3F/4DK','3F/4LDK','3F/4LDK+S','3F/4LDK+2S','3F/5DK','3F/5LDK','3F/5LDK+S','3F/5LDK+2S']:
#    df_levelplan_result[elem] = df_levelplan.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)
  
#df = pd.concat(objs=[df, df_levelplan_result], axis=1)

# 学習用回数,プラン(dftest_levelplan)データの前処理
#dftest_levelplan = pd.DataFrame(dftest['levelplan'])

#dftest_levelplan_result = pd.DataFrame()

#for elem in ['土地売り','1F/1LDK','1F/1LDK+S','1F/1LDK+2S','1F/2LDK','1F/2LDK+S','1F/2LDK+2S','1F/3DK','1F/3LDK','1F/3LDK+S','1F/3LDK+2S','1F/4DK','1F/4LDK','1F/4LDK+S','1F/4LDK+2S','1F/5DK','1F/5LDK','1F/5LDK+S','1F/5LDK+2S','2F/1LDK+S','2F/1LDK+2S','2F/2LDK','2F/2LDK+S','2F/2LDK+2S','2F/3DK','2F/3LDK','2F/3LDK+S','2F/3LDK+2S','2F/4DK','2F/4LDK','2F/4LDK+S','2F/4LDK+2S','2F/5DK','2F/5LDK','2F/5LDK+S','2F/5LDK+2S','3F/1LDK+S','3F/1LDK+2S','3F/2LDK','3F/2LDK+S','3F/2LDK+2S','3F/3DK','3F/3LDK','3F/3LDK+S','3F/3LDK+2S','3F/4DK','3F/4LDK','3F/4LDK+S','3F/4LDK+2S','3F/5DK','3F/5LDK','3F/5LDK+S','3F/5LDK+2S']:
#    dftest_levelplan_result[elem] = dftest_levelplan.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

#dftest = pd.concat(objs=[dftest, dftest_levelplan_result], axis=1)

# 学習用データのその他規制(hokakisei1, hokakisei2, hokakisei3, hokakisei4)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
import xgboost as xgb
import lightgbm as lgb

# Features and target as NumPy arrays.
# DataFrame.as_matrix() / Series.as_matrix() were removed in pandas 1.0;
# `.values` returns the same array and exists in every pandas version.
X = df.values
y = df_keiyaku_pr.values

# Hold out half of the training rows to estimate generalisation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Fixed LightGBM configuration.  The grid search over max_depth /
# learning_rate / n_estimators and the Lasso, RandomForestRegressor,
# LinearRegression and XGBRegressor alternatives are disabled in this cell.
model = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=17,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=7500, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

model.fit(X_train, y_train)

# model.score on the fitted and on the held-out half; the labels read
# "coefficient of determination".
print('決定係数(train):{:.3f}'.format(model.score(X_train, y_train)))
print('決定係数(test):{:.3f}'.format(model.score(X_test, y_test)))

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


決定係数(train):0.997
決定係数(test):0.813
