In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import re

In [0]:
# 学習用データの読み込み
df1 = pd.read_csv('train_genba.tsv', sep='\t')
df2 = pd.read_csv('train_goto.tsv', sep='\t')
df = pd.merge(df1, df2, on='pj_no', how='outer')

del df1
del df2

# 評価用データの読み込み
dftest1 = pd.read_csv('test_genba.tsv', sep='\t')
dftest2 = pd.read_csv('test_goto.tsv', sep='\t')
dftest = pd.merge(dftest1, dftest2, on='pj_no', how='outer')

del dftest1
del dftest2

print(df.shape)
print(dftest.shape)

(6461, 158)
(4273, 157)


In [0]:
# 提出用id
dftest_id = pd.DataFrame()
dftest_id = dftest['id'] 

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)

(6461, 158)
(4273, 157)
(4273,)


In [0]:
# 契約金額(keiyaku_pr)
df_keiyaku_pr = pd.DataFrame()
df_keiyaku_pr = df['keiyaku_pr']

df = df.drop('keiyaku_pr', axis=1)

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)

(6461, 157)
(4273, 157)
(4273,)
(6461,)


In [0]:
# 学習用データの個別要因(kobetsu1, kobetsu2, kobetsu3, kobetsu4)
df_kobetsu = pd.DataFrame()

df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu1']], axis=1)
df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu2']], axis=1)
df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu3']], axis=1)
df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu4']], axis=1)
df = df.drop('kobetsu1', axis=1)
df = df.drop('kobetsu2', axis=1)
df = df.drop('kobetsu3', axis=1)
df = df.drop('kobetsu4', axis=1)

df_kobetsu = df_kobetsu.fillna(value=0)

# 評価用データの個別要因(kobetsu1, kobetsu2, kobetsu3, kobetsu4)
dftest_kobetsu = pd.DataFrame()

dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu1']], axis=1)
dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu2']], axis=1)
dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu3']], axis=1)
dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu4']], axis=1)
dftest = dftest.drop('kobetsu1', axis=1)
dftest = dftest.drop('kobetsu2', axis=1)
dftest = dftest.drop('kobetsu3', axis=1)
dftest = dftest.drop('kobetsu4', axis=1)

dftest_kobetsu = dftest_kobetsu.fillna(value=0)


# カラムの順番、個数は df と dftest の間に相違なし

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)
print(df_kobetsu.shape)
print(dftest_kobetsu.shape)

(6461, 153)
(4273, 153)
(4273,)
(6461,)
(6461, 4)
(4273, 4)


In [0]:
# 学習用個別データ(df_kobetsu)の前処理
df_kobetsu_result = pd.DataFrame()

for elem in ['高圧線下','信号近い','信号前','横断歩道前','踏切付近','ごみ置き場前','心理的瑕疵あり','計画道路','地役権有','敷延2ｍ絞りあり','宅内高低差あり','嫌悪施設隣接','アパート南隣','街道沿い','交通量多い','裏道','行き止まり','行き止まり途中','車進入困難','前面道が坂途中','眺望良','床暖房付','エネファーム付','角地','二方路','三方路']:
    df_kobetsu_result[elem] = df_kobetsu.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

# 評価用個別データ(dftest_kobetsu)の前処理
dftest_kobetsu_result = pd.DataFrame()

for elem in ['高圧線下','信号近い','信号前','横断歩道前','踏切付近','ごみ置き場前','心理的瑕疵あり','計画道路','地役権有','敷延2ｍ絞りあり','宅内高低差あり','嫌悪施設隣接','アパート南隣','街道沿い','交通量多い','裏道','行き止まり','行き止まり途中','車進入困難','前面道が坂途中','眺望良','床暖房付','エネファーム付','角地','二方路','三方路']:
    dftest_kobetsu_result[elem] = dftest_kobetsu.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

df = pd.concat(objs=[df, df_kobetsu_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_kobetsu_result], axis=1)

del df_kobetsu
del dftest_kobetsu
del df_kobetsu_result
del dftest_kobetsu_result

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)

(6461, 179)
(4273, 179)
(4273,)
(6461,)


In [0]:
# 学習用データのその他規制(hokakisei1, hokakisei2, hokakisei3, hokakisei4)
df_hokakisei = pd.DataFrame()

df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei1']], axis=1)
df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei2']], axis=1)
df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei3']], axis=1)
df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei4']], axis=1)
df = df.drop('hokakisei1', axis=1)
df = df.drop('hokakisei2', axis=1)
df = df.drop('hokakisei3', axis=1)
df = df.drop('hokakisei4', axis=1)

df_hokakisei = df_hokakisei.fillna(value=0)

# 評価用データのその他規制(hokakisei1, hokakisei2, hokakisei3, hokakisei4)
dftest_hokakisei = pd.DataFrame()

dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei1']], axis=1)
dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei2']], axis=1)
dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei3']], axis=1)
dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei4']], axis=1)
dftest = dftest.drop('hokakisei1', axis=1)
dftest = dftest.drop('hokakisei2', axis=1)
dftest = dftest.drop('hokakisei3', axis=1)
dftest = dftest.drop('hokakisei4', axis=1)

dftest_hokakisei = dftest_hokakisei.fillna(value=0)

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)
print(df_hokakisei.shape)
print(dftest_hokakisei.shape)

(6461, 175)
(4273, 175)
(4273,)
(6461,)
(6461, 4)
(4273, 4)


In [0]:
# その他規制の要素種類
df_all_hokakisei = pd.DataFrame()

df_all_hokakisei = pd.concat(objs=[df_hokakisei, dftest_hokakisei], axis=1)

hokakisei_list = []

for hokakisei_name in set(df_all_hokakisei.values.flatten()):
  if isinstance(hokakisei_name, int) != True and isinstance(hokakisei_name, float) != True:
    hokakisei_list.append(hokakisei_name)

# 確認用  
#for hokakisei_name in hokakisei_list:
#print(hokakisei_name)

# 学習用その他規制データ(df_hokakisei)の前処理
df_hokakisei_result = pd.DataFrame()

for elem in hokakisei_list:
  df_hokakisei_result[elem] = df_hokakisei.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

# 評価用その他規制データ(dftest_hokakisei)の前処理
dftest_hokakisei_result = pd.DataFrame()

for elem in hokakisei_list:
  dftest_hokakisei_result[elem] = dftest_hokakisei.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

df = pd.concat(objs=[df, df_hokakisei_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_hokakisei_result], axis=1)

del hokakisei_list
del df_hokakisei
del dftest_hokakisei
del df_hokakisei_result
del dftest_hokakisei_result

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)

(6461, 209)
(4273, 209)
(4273,)
(6461,)


In [11]:
# 学習用データの読み込み
df1 = pd.read_csv('drive/My Drive/signate/ichd/dataset/train_genba.tsv', sep='\t')
df2 = pd.read_csv('drive/My Drive/signate/ichd/dataset/train_goto.tsv', sep='\t')
df = pd.merge(df1, df2, on='pj_no', how='outer')

del df1
del df2

# 評価用データの読み込み
dftest1 = pd.read_csv('drive/My Drive/signate/ichd/dataset/test_genba.tsv', sep='\t')
dftest2 = pd.read_csv('drive/My Drive/signate/ichd/dataset/test_goto.tsv', sep='\t')
dftest = pd.merge(dftest1, dftest2, on='pj_no', how='outer')

del dftest1
del dftest2

# 提出用id
dftest_id = pd.DataFrame()
dftest_id = dftest['id'] 

# 契約金額(keiyaku_pr)
df_keiyaku_pr = pd.DataFrame()
df_keiyaku_pr = df['keiyaku_pr']

df = df.drop('keiyaku_pr', axis=1)

# 住居表示(jukyo)
# 学習用住居表示(jukyo)の前処理
df_jukyo = df['jukyo']
a = []
for place in df_jukyo:
    m = re.match( r'(.*[都|道|府|県])(.*?[市|区|町|村])(.*)', place)
    if m and len(m.groups()) >= 3:
        a.append(m.groups())

df_new = pd.DataFrame(a)
df_new.columns = ['ken', 'place_1', 'place_2']

df = df.drop('jukyo', axis=1)
df = pd.concat(objs=[df, df_new], axis=1)

del a

# 評価用住居表示(jukyo)の前処理
dftest_jukyo = dftest['jukyo']
a = []
for place in dftest_jukyo:
    m = re.match( r'(.*[都|道|府|県])(.*?[市|区|町|村])(.*)', place)
    if m and len(m.groups()) >= 3:
        a.append(m.groups())

dftest_new = pd.DataFrame(a)
dftest_new.columns = ['ken', 'place_1', 'place_2']

dftest = dftest.drop('jukyo', axis=1)
dftest = pd.concat(objs=[dftest, dftest_new], axis=1)

# 用途地域
# 学習用データの用途地域(yoto1, yoto2)
df_yoto = pd.DataFrame()

df_yoto = pd.concat(objs=[df_yoto, df['yoto1']], axis=1)
df_yoto = pd.concat(objs=[df_yoto, df['yoto2']], axis=1)
df = df.drop('yoto1', axis=1)
df = df.drop('yoto2', axis=1)

df_yoto = df_yoto.fillna(value=0)

# 評価用データの用途地域(yoto1, yoto2)
dftest_yoto = pd.DataFrame()

dftest_yoto = pd.concat(objs=[dftest_yoto, dftest['yoto1']], axis=1)
dftest_yoto = pd.concat(objs=[dftest_yoto, dftest['yoto2']], axis=1)
dftest = dftest.drop('yoto1', axis=1)
dftest = dftest.drop('yoto2', axis=1)

dftest_yoto = dftest_yoto.fillna(value=0)

# 学習用用途地域(df_yoto)の前処理
df_yoto_result = pd.DataFrame()

for elem in ['第一種低層住居専用地域','第二種低層住居専用地域','第一種中高層住居専用地域','第二種中高層住居専用地域','第一種住居地域','第二種住居地域','準住居地域','田園住居地域','近隣商業地域','商業地域','準工業地域','工業地域','工業専用地域','指定のない区域']:
    df_yoto_result[elem] = df_yoto.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

# 評価用用途地域(dftest_yoto)の前処理
dftest_yoto_result = pd.DataFrame()

for elem in ['第一種低層住居専用地域','第二種低層住居専用地域','第一種中高層住居専用地域','第二種中高層住居専用地域','第一種住居地域','第二種住居地域','準住居地域','田園住居地域','近隣商業地域','商業地域','準工業地域','工業地域','工業専用地域','指定のない区域']:
    dftest_yoto_result[elem] = dftest_yoto.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

df = pd.concat(objs=[df, df_yoto_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_yoto_result], axis=1)

del df_yoto
del dftest_yoto
del df_yoto_result
del dftest_yoto_result

# 都市計画区域別(toshikuiki)
# 学習用データの都市計画区域別(toshikuiki1, toshikuiki2)
df_toshikuiki = pd.DataFrame()

df_toshikuiki = pd.concat(objs=[df_toshikuiki, df['toshikuiki1']], axis=1)
df_toshikuiki = pd.concat(objs=[df_toshikuiki, df['toshikuiki2']], axis=1)
df = df.drop('toshikuiki1', axis=1)
df = df.drop('toshikuiki2', axis=1)

df_toshikuiki = df_toshikuiki.fillna(value=0)

# 評価用データの都市計画区域別(toshikuiki1, toshikuiki2)
dftest_toshikuiki = pd.DataFrame()

dftest_toshikuiki = pd.concat(objs=[dftest_toshikuiki, dftest['toshikuiki1']], axis=1)
dftest_toshikuiki = pd.concat(objs=[dftest_toshikuiki, dftest['toshikuiki2']], axis=1)
dftest = dftest.drop('toshikuiki1', axis=1)
dftest = dftest.drop('toshikuiki2', axis=1)

dftest_toshikuiki = dftest_toshikuiki.fillna(value=0)

# 学習用都市計画区域別(toshikuiki)の前処理
df_toshikuiki_result = pd.DataFrame()

for elem in ['市街化区域','市街化調整区域','非線引き区域','準都市計画区域','都市計画区域外']:
    df_toshikuiki_result[elem] = df_toshikuiki.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

# 評価用都市計画区域別(toshikuiki)の前処理
dftest_toshikuiki_result = pd.DataFrame()

for elem in ['市街化区域','市街化調整区域','非線引き区域','準都市計画区域','都市計画区域外']:
    dftest_toshikuiki_result[elem] = dftest_toshikuiki.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

df = pd.concat(objs=[df, df_toshikuiki_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_toshikuiki_result], axis=1)

del df_toshikuiki
del dftest_toshikuiki
del df_toshikuiki_result
del dftest_toshikuiki_result

# その他規制(hokakisei1, hokakisei2, hokakisei3, hokakisei4)
# 学習用データのその他規制(hokakisei1, hokakisei2, hokakisei3, hokakisei4)
df_hokakisei = pd.DataFrame()

df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei1']], axis=1)
df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei2']], axis=1)
df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei3']], axis=1)
df_hokakisei = pd.concat(objs=[df_hokakisei, df['hokakisei4']], axis=1)
df = df.drop('hokakisei1', axis=1)
df = df.drop('hokakisei2', axis=1)
df = df.drop('hokakisei3', axis=1)
df = df.drop('hokakisei4', axis=1)

df_hokakisei = df_hokakisei.fillna(value=0)

# 評価用データのその他規制(hokakisei1, hokakisei2, hokakisei3, hokakisei4)
dftest_hokakisei = pd.DataFrame()

dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei1']], axis=1)
dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei2']], axis=1)
dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei3']], axis=1)
dftest_hokakisei = pd.concat(objs=[dftest_hokakisei, dftest['hokakisei4']], axis=1)
dftest = dftest.drop('hokakisei1', axis=1)
dftest = dftest.drop('hokakisei2', axis=1)
dftest = dftest.drop('hokakisei3', axis=1)
dftest = dftest.drop('hokakisei4', axis=1)

dftest_hokakisei = dftest_hokakisei.fillna(value=0)

# その他規制の要素種類
df_all_hokakisei = pd.DataFrame()

df_all_hokakisei = pd.concat(objs=[df_hokakisei, dftest_hokakisei], axis=1)

hokakisei_list = []

for hokakisei_name in set(df_all_hokakisei.values.flatten()):
  if isinstance(hokakisei_name, int) != True and isinstance(hokakisei_name, float) != True:
    hokakisei_list.append(hokakisei_name)

# 学習用その他規制データ(df_hokakisei)の前処理
df_hokakisei_result = pd.DataFrame()

for elem in hokakisei_list:
  df_hokakisei_result[elem] = df_hokakisei.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

# 評価用その他規制データ(dftest_hokakisei)の前処理
dftest_hokakisei_result = pd.DataFrame()

for elem in hokakisei_list:
  dftest_hokakisei_result[elem] = dftest_hokakisei.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

df = pd.concat(objs=[df, df_hokakisei_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_hokakisei_result], axis=1)

# 階数・プラン(levelplan)
# 学習用階数・プラン(levelplan)の前処理
df['1F'] = df['levelplan'].str.contains('1F')
df['1F'] *= 1
df['2F'] = df['levelplan'].str.contains('2F')
df['2F'] *= 1
df['3F'] = df['levelplan'].str.contains('3F')
df['3F'] *= 1
df['1LDK'] = df['levelplan'].str.contains('1LDK')
df['1LDK'] *= 1
df['2LDK'] = df['levelplan'].str.contains('2LDK')
df['2LDK'] *= 1
df['3LDK'] = df['levelplan'].str.contains('3LDK')
df['3LDK'] *= 1
df['4LDK'] = df['levelplan'].str.contains('4LDK')
df['4LDK'] *= 1
df['5LDK'] = df['levelplan'].str.contains('5LDK')
df['5LDK'] *= 1
df['4DK'] = df['levelplan'].str.contains('4DK')
df['4DK'] *= 1
df['S'] = df['levelplan'].str.contains('\+S')
df['S'] *= 1
df['2S'] = df['levelplan'].str.contains('2S')
df['2S'] *= 1

# 評価用階数・プラン(levelplan)の前処理（カラム増加）
dftest['1F'] = dftest['levelplan'].str.contains('1F')
dftest['1F'] *= 1
dftest['2F'] = dftest['levelplan'].str.contains('2F')
dftest['2F'] *= 1
dftest['3F'] = dftest['levelplan'].str.contains('3F')
dftest['3F'] *= 1
dftest['1LDK'] = dftest['levelplan'].str.contains('1LDK')
dftest['1LDK'] *= 1
dftest['2LDK'] = dftest['levelplan'].str.contains('2LDK')
dftest['2LDK'] *= 1
dftest['3LDK'] = dftest['levelplan'].str.contains('3LDK')
dftest['3LDK'] *= 1
dftest['4LDK'] = dftest['levelplan'].str.contains('4LDK')
dftest['4LDK'] *= 1
dftest['5LDK'] = dftest['levelplan'].str.contains('5LDK')
dftest['5LDK'] *= 1
dftest['4DK'] = dftest['levelplan'].str.contains('4DK')
dftest['4DK'] *= 1
dftest['S'] = dftest['levelplan'].str.contains('\+S')
dftest['S'] *= 1
dftest['2S'] = dftest['levelplan'].str.contains('2S')
dftest['2S'] *= 1

df = df.drop('levelplan', axis=1)
dftest = dftest.drop('levelplan', axis=1)

# 個別要因(kobetsu1, kobetsu2, kobetsu3, kobetsu4)
# 学習用データの個別要因(kobetsu1, kobetsu2, kobetsu3, kobetsu4)
df_kobetsu = pd.DataFrame()

df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu1']], axis=1)
df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu2']], axis=1)
df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu3']], axis=1)
df_kobetsu = pd.concat(objs=[df_kobetsu, df['kobetsu4']], axis=1)
df = df.drop('kobetsu1', axis=1)
df = df.drop('kobetsu2', axis=1)
df = df.drop('kobetsu3', axis=1)
df = df.drop('kobetsu4', axis=1)

df_kobetsu = df_kobetsu.fillna(value=0)

# 評価用データの個別要因(kobetsu1, kobetsu2, kobetsu3, kobetsu4)
dftest_kobetsu = pd.DataFrame()

dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu1']], axis=1)
dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu2']], axis=1)
dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu3']], axis=1)
dftest_kobetsu = pd.concat(objs=[dftest_kobetsu, dftest['kobetsu4']], axis=1)
dftest = dftest.drop('kobetsu1', axis=1)
dftest = dftest.drop('kobetsu2', axis=1)
dftest = dftest.drop('kobetsu3', axis=1)
dftest = dftest.drop('kobetsu4', axis=1)

dftest_kobetsu = dftest_kobetsu.fillna(value=0)

# 学習用個別データ(df_kobetsu)の前処理
df_kobetsu_result = pd.DataFrame()

for elem in ['高圧線下','信号近い','信号前','横断歩道前','踏切付近','ごみ置き場前','心理的瑕疵あり','計画道路','地役権有','敷延2ｍ絞りあり','宅内高低差あり','嫌悪施設隣接','アパート南隣','街道沿い','交通量多い','裏道','行き止まり','行き止まり途中','車進入困難','前面道が坂途中','眺望良','床暖房付','エネファーム付','角地','二方路','三方路']:
    df_kobetsu_result[elem] = df_kobetsu.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

# 評価用個別データ(dftest_kobetsu)の前処理
dftest_kobetsu_result = pd.DataFrame()

for elem in ['高圧線下','信号近い','信号前','横断歩道前','踏切付近','ごみ置き場前','心理的瑕疵あり','計画道路','地役権有','敷延2ｍ絞りあり','宅内高低差あり','嫌悪施設隣接','アパート南隣','街道沿い','交通量多い','裏道','行き止まり','行き止まり途中','車進入困難','前面道が坂途中','眺望良','床暖房付','エネファーム付','角地','二方路','三方路']:
    dftest_kobetsu_result[elem] = dftest_kobetsu.apply(lambda xs: 1 if elem in xs.values else 0, axis=1)

df = pd.concat(objs=[df, df_kobetsu_result], axis=1)
dftest = pd.concat(objs=[dftest, dftest_kobetsu_result], axis=1)

del df_kobetsu
del dftest_kobetsu
del df_kobetsu_result
del dftest_kobetsu_result



print(df.shape)
print(dftest.shape)

(6461, 236)
(4273, 236)


In [0]:
#for flag, name, type in zip(df.isnull().any(), df.columns, df.dtypes):
#  if flag == True:
#    print(name)
#    print(type)

In [13]:
# dfとdftestの欠損値が含まれる列を予測用に確保
# 回帰で欠損値を埋める用と分類で欠損値を埋める用
df_preds_cla = pd.DataFrame()
df_preds_reg = pd.DataFrame()
dftest_preds_cla = pd.DataFrame()
dftest_preds_reg = pd.DataFrame()

for i, j, k in zip(df.isnull().any(), df.columns, df.dtypes):  
  if i == True or j == 'road1_sb' or j == 'gk_yoc_tm':
    if k == 'object':
      df_preds_cla = pd.DataFrame(pd.concat(objs=[df_preds_cla, df[j]], axis=1))
      df = df.drop(j, axis=1)
    else:
      df_preds_reg = pd.DataFrame(pd.concat(objs=[df_preds_reg, df[j]], axis=1))
      df = df.drop(j, axis=1)    
      
for i, j, k in zip(dftest.isnull().any(), dftest.columns, dftest.dtypes):
  if i == True or j == 'chiseki_kb_hb' or j == 'tt_mseki_avg_hb':
    if k == 'object':
      dftest_preds_cla = pd.concat(objs=[dftest_preds_cla, dftest[j]], axis=1)
      dftest = dftest.drop(j, axis=1)
    else:
      dftest_preds_reg = pd.concat(objs=[dftest_preds_reg, dftest[j]], axis=1)
      dftest = dftest.drop(j, axis=1)

# 学習用データのobject型である列をint, float型の列に変換
# object型の要素とint, float型の要素は一対一に対応する
#factorize = []

#for i, j in zip(df.columns, df.dtypes):
#    if j == 'object':
#        factorize.append(i)
    
#for i in factorize:
#    id_1, id_2 = df[i].factorize()
#    id_3 = pd.DataFrame({i: id_1})
#    df[i] = id_3

# 評価用データのobject型である列をint, float型の列に変換
# object型の要素とint, float型の要素は一対一に対応する
#factorize = []

#for i, j in zip(dftest.columns, dftest.dtypes):
#    if j == 'object':
#      factorize.append(i)
    
#for i in factorize:
#    id_1, id_2 = dftest[i].factorize()
#    id_3 = pd.DataFrame({i: id_1})
#    dftest[i] = id_3

# onehot encoding
#onehot = []
#for name, type in zip(df.columns, df.dtypes):
#  if type == 'object':
#    onehot.append(name)

#df = pd.get_dummies(df, prefix=onehot)
#dftest = pd.get_dummies(dftest, prefix=onehot)    

# dfとdftestで異なるカラムを削除
common_col = (df.columns & dftest.columns)
# df を共通項でフィルタリング
df = df[common_col]
# dftest を共通項でフィルタリング
dftest = dftest[common_col]

# 文字から数字に変換する方法
from sklearn.preprocessing import LabelEncoder

cat_features = []

for i, j in zip(df.dtypes, df.columns):
  if i == 'object':
    cat_features.append(j)

df = df.fillna(df.mean())
dftest = df.fillna(dftest.mean())
    
for col in cat_features:
  lbl = LabelEncoder()
  df[col] = lbl.fit_transform(list(df[col].values))
  dftest[col] = lbl.fit_transform(list(dftest[col].values))

# 標準化
from sklearn.preprocessing import StandardScaler

num_features = df.columns

for col in num_features:
  scaler = StandardScaler()
  df[col] = scaler.fit_transform(np.array(df[col].values).reshape(-1, 1))
  dftest[col] = scaler.fit_transform(np.array(dftest[col].values).reshape(-1, 1))
  
print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)
print(df_preds_cla.shape)
print(df_preds_reg.shape)
print(dftest_preds_cla.shape)
print(dftest_preds_reg.shape)

(6461, 129)
(6461, 129)
(4273,)
(6461,)
(6461, 78)
(6461, 29)
(4273, 75)
(4273, 29)


In [14]:
# 予測用に列名を確保
preds_name_cla_list = []
preds_name_reg_list = []

for column in df_preds_cla.columns:
    preds_name_cla_list.append(column)
    
for column in df_preds_reg.columns:
    preds_name_reg_list.append(column)
    
df = pd.concat(objs=[df, df_preds_cla], axis=1)
df = pd.concat(objs=[df, df_preds_reg], axis=1)
dftest = pd.concat(objs=[dftest, dftest_preds_cla], axis=1)
dftest = pd.concat(objs=[dftest, dftest_preds_reg], axis=1)

del df_preds_cla
del df_preds_reg
del dftest_preds_cla
del dftest_preds_reg

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)
print(len(preds_name_cla_list))
print(len(preds_name_reg_list))

(6461, 236)
(6461, 233)
(4273,)
(6461,)
78
29


In [15]:
# 欠損値が存在する列を埋める(分類)
for preds_name in preds_name_cla_list:
    #print(preds_name)
    # 欠損値がある行を配列に確保していく
    df_train = pd.DataFrame(df)
    dftest_train = pd.DataFrame(dftest)
    
    df_test_num = []
    dftest_test_num = []

    for line_num, judg in enumerate(df[preds_name].isnull()):
        if judg == True: # 欠損値あり
            df_train = df_train.drop(line_num, axis=0)
            df_test_num.append(line_num)

    for line_num, judg in enumerate(dftest[preds_name].isnull()):
        if judg == True: # 欠損値あり
            dftest_train = dftest_train.drop(line_num, axis=0)
            dftest_test_num.append(line_num)
            
    df_train = df_train.reset_index(drop=True)
    dftest_train = dftest_train.reset_index(drop=True)
    
    if len(df_train.loc[:, preds_name]) != 0:
        # df_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = df_train[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            df_train[i] = id_3
        del factorize

    if len(dftest_train.loc[:, preds_name]) != 0:
        # dftest_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = dftest_train[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            dftest_train[i] = id_3


    # 予測用に df と dftestの値も変えていく        
    # string を数値に一対一に対応する
    if len(df.loc[:, preds_name]) != 0:
        # df_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = df[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            df[i] = id_3
        del factorize

    if len(dftest.loc[:, preds_name]) != 0:
        # dftest_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = dftest[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            dftest[i] = id_3
        
    from sklearn.ensemble import RandomForestClassifier
    import xgboost as xgb
    import lightgbm as lgb
    
    #clf = lgb.LGBMClassifier()
    clf = RandomForestClassifier()
    #clf = xgb.XGBClassifier()
    
    X = pd.DataFrame(pd.concat(objs=[df_train.iloc[:, :df_train.columns.get_loc(preds_name)], dftest_train.iloc[:, :dftest_train.columns.get_loc(preds_name)]], axis=0))
    T = pd.DataFrame(pd.concat(objs=[df_train[preds_name], dftest_train[preds_name]], axis=0))
    
    clf.fit(X, T)
    
    if len(df_test_num) != 0:
      #for num in df_test_num:
      #  df.loc[num, preds_name] = clf.predict(pd.DataFrame([df.iloc[num, :df.columns.get_loc(preds_name)]]))[0]
      df.loc[df_test_num, preds_name] = clf.predict(df.iloc[df_test_num, :df.columns.get_loc(preds_name)])
      
    if len(dftest_test_num) != 0:
      #for num in dftest_test_num:
      #  dftest.loc[num, preds_name] = clf.predict(pd.DataFrame([dftest.iloc[num, :dftest.columns.get_loc(preds_name)]]))[0]
      dftest.loc[dftest_test_num, preds_name] = clf.predict(dftest.iloc[dftest_test_num, :dftest.columns.get_loc(preds_name)])  
    
print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)



KeyError: ignored

In [0]:
# 欠損値が存在する列を埋める(回帰)
for preds_name in preds_name_reg_list:
    # 欠損値がある行を配列に確保していく
    df_train = pd.DataFrame(df)
    dftest_train = pd.DataFrame(dftest)
    
    df_test_num = []
    dftest_test_num = []

    for line_num, judg in enumerate(df[preds_name].isnull()):
        if judg == True: # 欠損値あり
            df_train = df_train.drop(line_num, axis=0)
            df_test_num.append(line_num)

    for line_num, judg in enumerate(dftest[preds_name].isnull()):
        if judg == True: # 欠損値あり
            dftest_train = dftest_train.drop(line_num, axis=0)
            dftest_test_num.append(line_num)
            
    df_train = df_train.reset_index(drop=True)
    dftest_train = dftest_train.reset_index(drop=True)
    
    if len(df_train.loc[:, preds_name]) != 0:
        # df_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = df_train[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            df_train[i] = id_3
        del factorize

    if len(dftest_train.loc[:, preds_name]) != 0:
        # dftest_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = dftest_train[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            dftest_train[i] = id_3



    # 予測用に df と dftestの値も変えていく        
    # string を数値に一対一に対応する
    if len(df.loc[:, preds_name]) != 0:
        # df_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = df[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            df[i] = id_3
        del factorize

    if len(dftest.loc[:, preds_name]) != 0:
        # dftest_train用
        # object型の要素をint, float型に変換する
        # object型の要素を探す
        factorize = []
        factorize.append(preds_name)
        # 0~要素数の種類の数を割り当てる
        for i in factorize:
            id_1, id_2 = dftest[i].factorize()
            id_3 = pd.DataFrame({i: id_1})
            dftest[i] = id_3
     
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.ensemble import RandomForestRegressor
    import xgboost as xgb
    import lightgbm as lgb
    
    #clf = lgb.LGBMRegressor()
    clf = RandomForestRegressor()
    #clf = xgb.XGBRegressor()
    #clf = GradientBoostingRegressor()
    
    X = pd.DataFrame(pd.concat(objs=[df_train.iloc[:, :df_train.columns.get_loc(preds_name)], dftest_train.iloc[:, :dftest_train.columns.get_loc(preds_name)]], axis=0))
    T = pd.DataFrame(pd.concat(objs=[df_train[preds_name], dftest_train[preds_name]], axis=0))
    clf.fit(X, T)

    if len(df_test_num) != 0:
        df.loc[df_test_num, preds_name] = clf.predict(df.iloc[df_test_num, :df.columns.get_loc(preds_name)])

    if len(dftest_test_num) != 0:
        dftest.loc[dftest_test_num, preds_name] = clf.predict(dftest.iloc[dftest_test_num, :dftest.columns.get_loc(preds_name)])
    
print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)

In [0]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

#df = df.reset_index(drop=True)
#dftest = dftest.reset_index(drop=True)
#df_keiyaku_pr = df_keiyaku_pr.reset_index(drop=True)
#dftest_id = dftest_id.reset_index(drop=True)

# 次元削減
#x = df.as_matrix()
#y = df_keiyaku_pr.as_matrix()

#lda = LDA(n_components=2)
#lda.fit(x, y)
#x = lda.transform(x)

# dfとdftestで異なるカラムを削除
common_col = (df.columns & dftest.columns)
# df を共通項でフィルタリング
df = df[common_col]
# dftest を共通項でフィルタリング
dftest = dftest[common_col]
    
print(df.shape)
print(dftest.shape)

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
import xgboost as xgb
import lightgbm as lgb

X = df.as_matrix()
y = df_keiyaku_pr.as_matrix()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

#mod = lgb.LGBMRegressor()

#parameters = {'max_depth': list(range(15, 20)),
#              'criterion': ['gini', 'entropy'],}

#model = GridSearchCV(mod,
#                    parameters,
#                    cv=5)
#model = Lasso()
#model = RandomForestRegressor()
#model = LinearRegression()
#model = xgb.XGBRegressor()
#model = lgb.LGBMRegressor()

model = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=17,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=7500, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

model.fit(X_train, y_train)

print('決定係数(train):{:.3f}'.format(model.score(X_train, y_train)))
print('決定係数(test):{:.3f}'.format(model.score(X_test, y_test)))

In [0]:
print(model.best_estimator_)

In [0]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
    
#xgb1 = lgb.LGBMRegressor()
# モデルの作成
#xgb1 = xgb.XGBRegressor(eta=0.01, max_depth=10, subsumple=5)

#clf = RandomForestRegressor()
#parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#              'objective':['reg:linear'],
#              'learning_rate': [.03, 0.05, .07], #so called `eta` value
#              'max_depth': [5, 6, 7],
#              'min_child_weight': [4],
#              'silent': [1],
#              'subsample': [0.7],
#             'colsample_bytree': [0.7],
#              'n_estimators': [500]}

#clf = GridSearchCV(xgb1,
#                  parameters,
#                   cv = 2,
#                   n_jobs = 5,
#                   verbose=True)



# 説明変数の設定
#X = df.as_matrix()
#X = x

# 目的変数の設定
#Y = df_keiyaku_pr.as_matrix()

# 学習
#clf.fit(X_4, y)

#print(df.shape)
#print(dftest.shape)
#print(dftest_id.shape)
#print(df_keiyaku_pr.shape)

In [0]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing, linear_model, svm
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# モデルの作成
#clf = xgb.XGBRegressor()
#clf = RandomForestRegressor()
#clf = lgb.LGBMRegressor()

clf = lgb.LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              criterion='gini', importance_type='split', learning_rate=0.1,
              max_depth=17, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

# 説明変数の設定
X = df.as_matrix()

# 目的変数の設定
Y = df_keiyaku_pr.as_matrix()

# 学習
clf.fit(X, Y)

print(df.shape)
print(dftest.shape)
print(dftest_id.shape)
print(df_keiyaku_pr.shape)

In [0]:
#予測
XX = dftest.as_matrix()

YY = clf.predict(XX)

# 予測データ(df_pre)
df_pre = pd.DataFrame(YY)
df_pre.head()

In [0]:
# 提出用に変換
df_pre = df_pre.astype(int)
df_pre = df_pre.round()
df_pre = pd.concat(objs=[dftest_id, df_pre], axis=1)
df_pre.head()

In [0]:
# tsvファイル作成
df_pre.to_csv('test_pre_1.tsv', sep='\t', index=False, header=False)

In [0]:
jj#df.to_csv('train.tsv', sep='\t', index=False, header=False)
#dftest.to_csv('test.tsv', sep='\t', index=False, header=False)
#dftest_id.to_csv('id.tsv', sep='\t', index=False, header=False)
#df_keiyaku_pr.to_csv('keiyaku_pr.tsv', sep='\t', index=False, header=False)

In [0]:
df_pre = df_pre.drop('id', axis=1)
df_pre /= 100000
df_pre = df_pre.astype(int)
df_pre *= 100000
df_pre = pd.concat(objs=[dftest_id, df_pre], axis=1)
df_pre.to_csv('test_pre_3.tsv', sep='\t', index=False, header=False)
df_pre.head()

Unnamed: 0,id,0
0,test_0000,37000000
1,test_0001,23100000
2,test_0002,22000000
3,test_0003,33200000
4,test_0004,35200000


In [0]:
# よー分からん
#from sklearn.decomposition import PCA

#pca = PCA()
#X_pca = pca.fit_transform(df.loc[:, :])
#df_pcs = pd.DataFrame(X_pca, columns=[i for i in range(209)])
#df_pcs

In [0]:
# 寄与率
pca.explained_variance_ratio_

In [0]:
X_2 = df.as_matrix()
y = df_keiyaku_pr.as_matrix()
XX_2 = dftest.as_matrix()

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
est = RandomForestClassifier()
fs  = SelectFromModel(est)
fs.fit(X_2, y)
X_4  = fs.transform(X_2)
XX_4  = fs.transform(XX_2)

In [0]:
# 以下はNNを採用した場合！！！

In [0]:
x_vals = X_4#df.as_matrix()
y_vals= df_keiyaku_pr.as_matrix()
XX = XX_4#dftest.as_matrix()

In [0]:
from tensorflow.python.framework import ops
import tensorflow as tf

# reset the graph for new run
ops.reset_default_graph()

# Create graph session 
sess = tf.Session()

# set batch size for training
batch_size = 50

# make results reproducible
seed = 3
np.random.seed(seed)
tf.set_random_seed(seed)

# Split data into train/test = 80%/20%
#train_indices = np.random.choice(len(x_vals), round(len(x_vals)*0.8), replace=False)
#test_indices = np.array(list(set(range(len(x_vals))) - set(train_indices)))
train_indices = np.random.choice(len(x_vals), round(len(x_vals)), replace=False)
x_vals_train = x_vals[train_indices]
#x_vals_test = x_vals[test_indices]
y_vals_train = y_vals[train_indices]
#y_vals_test = y_vals[test_indices]

In [0]:
# Record training column max and min for scaling of non-training data
train_max = np.max(x_vals_train, axis=0)
train_min = np.min(x_vals_train, axis=0)

# Normalize by column (min-max norm to be between 0 and 1)
def normalize_cols(mat, max_vals, min_vals):
    return (mat - min_vals) / (max_vals - min_vals)

x_vals_train = np.nan_to_num(normalize_cols(x_vals_train, train_max, train_min))
#x_vals_test = np.nan_to_num(normalize_cols(x_vals_test, train_max, train_min))

XX = np.nan_to_num(normalize_cols(XX, train_max, train_min))

x_vals_train.shape

In [0]:
# Define Variable Functions (weights and bias)
def init_weight(shape, st_dev):
    weight = tf.Variable(tf.random_normal(shape, stddev=st_dev))
    return(weight)
    

def init_bias(shape, st_dev):
    bias = tf.Variable(tf.random_normal(shape, stddev=st_dev))
    return(bias)
    
    
# Create Placeholders
x_data = tf.placeholder(shape=[None, 138], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

In [0]:
# Create a fully connected layer:
def fully_connected(input_layer, weights, biases):
    layer = tf.add(tf.matmul(input_layer, weights), biases)
    return(tf.nn.relu(layer))

def final_connected(input_layer, weights, biases):
    layer = tf.add(tf.matmul(input_layer, weights), biases)
    return(layer)

#--------Create the first layer (50 hidden nodes)--------
weight_1 = init_weight(shape=[138, 50], st_dev=10.0)
bias_1 = init_bias(shape=[50], st_dev=10.0)
layer_1 = fully_connected(x_data, weight_1, bias_1)

#--------Create second layer (25 hidden nodes)--------
weight_2 = init_weight(shape=[50, 5], st_dev=10.0)
bias_2 = init_bias(shape=[5], st_dev=10.0)
layer_2 = fully_connected(layer_1, weight_2, bias_2)


#--------Create third layer (5 hidden nodes)--------
weight_3 = init_weight(shape=[5, 3], st_dev=10.0)
bias_3 = init_bias(shape=[3], st_dev=10.0)
layer_3 = fully_connected(layer_2, weight_3, bias_3)


#--------Create output layer (1 output value)--------
weight_4 = init_weight(shape=[3, 1], st_dev=10.0)
bias_4 = init_bias(shape=[1], st_dev=10.0)
final_output = final_connected(layer_3, weight_4, bias_4)

# Declare loss function (L1)
loss = tf.reduce_mean(tf.abs(y_target - final_output))

# Declare optimizer
my_opt = tf.train.AdamOptimizer(0.1)
train_step = my_opt.minimize(loss)

In [0]:
# Initialize Variables
init = tf.global_variables_initializer()
sess.run(init)

# Training loop
loss_vec = []
test_loss = []
for i in range(50000):
    rand_index = np.random.choice(len(x_vals_train), size=batch_size)
    rand_x = x_vals_train[rand_index]
    rand_y = np.transpose([y_vals_train[rand_index]])
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})

    temp_loss = sess.run(loss, feed_dict={x_data: rand_x, y_target: rand_y})
    loss_vec.append(temp_loss)
    
    #test_temp_loss = sess.run(loss, feed_dict={x_data: x_vals_test, y_target: np.transpose([y_vals_test])})
    #test_loss.append(test_temp_loss)
    if (i+1) % 10000 == 0:
        print('Generation: ' + str(i+1) + '. Loss = ' + str(temp_loss)) 
              #+ '. TEST_LOSS0 ' + str(test_temp_loss))

In [0]:
import matplotlib.pyplot as plt

%matplotlib inline
# Plot loss (MSE) over time
plt.plot(loss_vec, 'k-', label='Train Loss')
#plt.plot(test_loss, 'r--', label='Test Loss')
plt.title('Loss (MSE) per Generation')
plt.legend(loc='upper right')
plt.xlabel('Generation')
plt.ylabel('Loss')
plt.show()

In [0]:
YY = sess.run(final_output, feed_dict={x_data: XX})
df_pre = pd.DataFrame(YY)
df_pre.head()

In [0]:

#df_pre = df_pre.astype(int)
df_pre = df_pre.round()
df_pre = pd.concat(objs=[dftest_id, df_pre], axis=1)
df_pre.head()

In [0]:
df_pre.to_csv('test_pre_2.tsv', sep='\t', index=False, header=False)