In [3]:
import os
import sys
import re
import logging

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
import category_encoders as ce
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

sys.path.append("..")
from utils import update_tracking, log_evaluation, preprocess_df, TabularDataset, EarlyStopping







####################
## Load data
####################
# English column names; they replace the header row of each CSV on load.
train_cols_eng = [
    "id", "rent", "location", "access", "layout", "age", "direction", "area", "floor",
    "bath_toilet", "kitchen", "broad_com", "facility", "parking", "environment", "structure",
    "contract_period",
]
# The test file has the same columns in the same order, minus the target "rent".
test_cols_eng = [name for name in train_cols_eng if name != "rent"]

train = pd.read_csv("../data/train.csv", header=0, names=train_cols_eng)
test = pd.read_csv("../data/test.csv", header=0, names=test_cols_eng)

use_cols = []

####################
## Preprocess data
####################

# preprocess_df is imported from the project-local utils module; the columns
# used below (age_year, age_in_months, building_floor, ...) come from it.
train_processed = preprocess_df(train)
test_processed = preprocess_df(test)

# handle outliers
# Rows are addressed by index label, so these fixes must run before the
# reset_index call further down.
train_processed.drop(20427, axis=0, inplace=True) # building age of 1019 years; unclear how to correct it, so the row is dropped
train_processed.loc[20231, "age_year"] = 52
train_processed.loc[20231, "age_in_months"] = 52 * 12 + 5 # building age of 520 years; assumed to be a typo for 52 years (the "+ 5" presumably keeps the listing's month part - confirm)

train_processed.loc[5775, "rent"] = 120350 # rent is implausibly high for this listing; probably the wrong number of zeros
train_processed.loc[20926, "area"] = 43.01 # area is implausibly large for this listing; probably the wrong number of zeros


# Ward name: the shortest text that follows "都" and ends with the first "区".
# .group() raises AttributeError when a location does not match the pattern.
train_processed["ku"] = train_processed["location"].apply(lambda x: re.search("(?<=都).*?区", x).group())
# Grouping key: ward + building floors + age in months + area, concatenated as strings.
train_processed["group"] = train_processed["ku"] + train_processed["building_floor"].astype(str) \
                    + train_processed["age_in_months"].astype(str) + train_processed["area"].astype(str)

# Mean rent per grouping key, computed after the outlier fixes above.
rent_dic = train_processed.groupby("group")["rent"].mean()


test_processed["ku"] = test_processed["location"].apply(lambda x: re.search("(?<=都).*?区", x).group())
# Same grouping key for the test rows, kept as a standalone Series.
test_group = test_processed["ku"] + test_processed["building_floor"].astype(str) \
                    + test_processed["age_in_months"].astype(str) + test_processed["area"].astype(str)

# Renumber the training rows 0..n-1 after the drop, then split off the target.
train_processed.reset_index(drop=True, inplace=True)
target = train_processed["rent"]
target_log = np.log1p(target)  # log(1 + rent)
train_processed.drop(["id", "rent"], axis=1, inplace=True)
test_processed.drop("id", axis=1, inplace=True)


####################
## get feature
####################
# Frames that hold the model inputs. They are kept separate from the
# *_processed frames because the category encoders are fitted on / applied to
# these frames as a whole.

### location ###
train_use = train_processed[["district"]].copy()
test_use = test_processed[["district"]].copy()

# Fit the ordinal encoding on the training rows, then reuse it for the test rows.
ce_ordinal = ce.OrdinalEncoder(cols=["district"], handle_missing="value")
train_use = ce_ordinal.fit_transform(train_use)
test_use = ce_ordinal.transform(test_use)

# Latitude / longitude
geo_dir = "../data/geo/"
geo_csvs = [name for name in os.listdir(geo_dir) if "csv" in name]
loc_dic = {}

# Each file contributes {town/chome name: "latitude,longitude"}, stored under
# the first municipality name found in that file.
for name in geo_csvs:
    geo = pd.read_csv(geo_dir + name, encoding="shift-jis")
    geo["loc"] = geo["緯度"].astype(str) + "," + geo["経度"].astype(str)
    municipality = geo["市区町村名"].unique()[0]
    loc_dic[municipality] = dict(zip(geo["大字町丁目名"].values, geo["loc"].values))


def _match_or_nan(pattern, text):
    """Return the first match of `pattern` in `text`, or NaN when there is none."""
    found = re.search(pattern, text)
    return found.group() if found else np.nan


for frame in (train_processed, test_processed):
    # Ward: text after "都" up to the first "区" (raises if the pattern is absent).
    frame["ku"] = frame["location"].apply(lambda x: re.search("(?<=都).*?区", x).group())
    # Town + chome: text after "区" up to the first "丁目"; NaN if there is no "丁目".
    frame["tyou"] = frame["location"].apply(lambda x: _match_or_nan("(?<=区).*?丁目", x))

# Full-width digits -> kanji numerals, so the chome text taken from the listing
# address can be looked up under the kanji spelling.
num_map = {"１":"一", "２":"二", "３":"三", "４":"四", "５":"五", "６":"六", "７":"七", "８":"八", "９":"九"}
_NUM_TABLE = str.maketrans(num_map)

def convert_number(tyou):
    """Replace every full-width digit in a town/chome string with its kanji numeral.

    Parameters
    ----------
    tyou : str or null
        Town + chome text, e.g. "西新宿１丁目".

    Returns
    -------
    str or float
        The converted string, or NaN for null input. A string without any
        full-width digit is returned unchanged (previously the function fell
        off the end and returned None, which discarded chome names that were
        already written in kanji).

    NOTE(review): digits are translated one by one, so a two-digit chome such
    as "１２" becomes "一二", not "十二", and "０" is left as is - confirm
    whether such addresses occur in the data.
    """
    if pd.isnull(tyou):
        return np.nan

    return tyou.translate(_NUM_TABLE)
        
# Normalise the chome digits, then build the "ku,chome" key used for the
# latitude/longitude lookup.
for frame in (train_processed, test_processed):
    frame["tyou"] = frame["tyou"].apply(convert_number)
    frame["loc_processed"] = frame["ku"] + "," + frame["tyou"]

def get_long_lati(loc_processed, geo_lookup=None):
    """Look up the coordinate string stored for a "ku,chome" key.

    Parameters
    ----------
    loc_processed : str or null
        Key of the form "<ku>,<chome>".
    geo_lookup : dict, optional
        Nested mapping {ku: {chome: value}}. Defaults to the module-level
        `loc_dic`, so existing one-argument calls behave as before.

    Returns
    -------
    The value stored for (ku, chome), or NaN when the key is null, the ku has
    no entry in the lookup, or the chome is not listed for that ku. An unknown
    ku previously raised KeyError.

    Raises
    ------
    ValueError
        If the key does not split into exactly two parts on ",".
    """
    if pd.isnull(loc_processed):
        return np.nan
    if geo_lookup is None:
        geo_lookup = loc_dic
    ku, chou = loc_processed.split(",")
    # Two-level .get so that a missing ku and a missing chome both yield NaN.
    return geo_lookup.get(ku, {}).get(chou, np.nan)
    
# Most NaNs here come from addresses that carry no chome information; a few
# come from chome names that are missing from the downloaded CSVs.
def _coord_component(lati_long, position):
    """Return part `position` of an "a,b" string as a float, or NaN for null input."""
    return float(lati_long.split(",")[position]) if not pd.isnull(lati_long) else np.nan


for processed, use in ((train_processed, train_use), (test_processed, test_use)):
    processed["lati_long"] = processed["loc_processed"].apply(get_long_lati)
    use["lati"] = processed["lati_long"].apply(lambda x: _coord_component(x, 0))
    use["long"] = processed["lati_long"].apply(lambda x: _coord_component(x, 1))

### access ###
for processed, use in ((train_processed, train_use), (test_processed, test_use)):
    # Smallest value in access_min; NaN when the entry is empty / falsy.
    use["min_to_nearest_sta"] = processed["access_min"].apply(lambda x: min(x) if x else np.nan)
    # Number of entries in access_sta.
    use["num_sta"] = processed["access_sta"].apply(len)


# Railway lines: columns whose name contains one of the keywords and whose
# non-null values in the training data sum to more than 300.
line_keywords = ("線", "ライン", "ライナー", "エクスプレス")
line_cols = [col for col in train_processed.columns.values
             if any(keyword in col for keyword in line_keywords)
             and train_processed[col].dropna().sum() > 300]
train_use[line_cols] = train_processed[line_cols]
test_use[line_cols] = test_processed[line_cols]


# Stations: same rule for columns whose name contains "駅".
sta_cols = [col for col in train_processed.columns.values
            if "駅" in col and train_processed[col].dropna().sum() > 300]
train_use[sta_cols] = train_processed[sta_cols]
test_use[sta_cols] = test_processed[sta_cols]



### layout ###
# Ordinal-encode the raw layout string: the encoder is fitted on the training
# frame and the same mapping is applied to the test frame.
ce_ordinal = ce.OrdinalEncoder(cols=["layout"], handle_missing="value")
train_use["layout"] = train_processed["layout"]
test_use["layout"] = test_processed["layout"]
train_use = ce_ordinal.fit_transform(train_use)
test_use = ce_ordinal.transform(test_use)

# Layout columns copied as they are from the preprocessed frames.
layout_cols = ["is_K", "is_R", "is_L", "is_D", "is_S", "num_room"]

train_use[layout_cols] = train_processed[layout_cols]
test_use[layout_cols] = test_processed[layout_cols]



### age ###
age_cols = ["age_year", "age_month", "age_in_months"]
train_use[age_cols] = train_processed[age_cols]
test_use[age_cols] = test_processed[age_cols]

### direction ###
# Same pattern as layout: ordinal code for the raw string plus the has_* columns.
ce_ordinal = ce.OrdinalEncoder(cols=["direction"], handle_missing="value")
train_use["direction"] = train_processed["direction"]
test_use["direction"] = test_processed["direction"]
train_use = ce_ordinal.fit_transform(train_use)
test_use = ce_ordinal.transform(test_use)

direction_cols = ["has_N", "has_S", "has_E", "has_W"]
train_use[direction_cols] = train_processed[direction_cols]
test_use[direction_cols] = test_processed[direction_cols]


### area ###
train_use["area"] = train_processed["area"]
test_use["area"] = test_processed["area"]

# NOTE(review): a num_room of 0 would make this ratio inf (or NaN for 0/0) -
# confirm whether preprocess_df can produce 0 rooms.
train_use["area_per_room"] = train_use["area"] / train_use["num_room"]
test_use["area_per_room"] = test_use["area"] / test_use["num_room"]

### floor ###
# Room floor divided by the building's floor count; written to the *_processed
# frames first and then copied together with the other floor columns.
train_processed["floor_ratio"] = train_processed["room_floor"] / train_processed["building_floor"]
test_processed["floor_ratio"] = test_processed["room_floor"] / test_processed["building_floor"]

floor_cols = ["has_underground", "room_floor", "building_floor", "floor_ratio"]
train_use[floor_cols] = train_processed[floor_cols]
test_use[floor_cols] = test_processed[floor_cols]

### bath_toilet ###
bath_toilet_cols = [
    "シャワー", "バスなし", "バス・トイレ別", "共同トイレ", "共同バス",
    "専用トイレ", "専用バス", "洗面台独立", "浴室乾燥機", "温水洗浄便座", "脱衣所", "追焚機能",
]

### kitchen ###
kitchen_cols = [
    "IHコンロ", "L字キッチン", "カウンターキッチン", "ガスコンロ", "コンロ1口", "コンロ2口", "コンロ3口",
    "コンロ4口以上", "コンロ設置可（コンロ1口）", "コンロ設置可（コンロ2口）", "コンロ設置可（コンロ3口）",
    "コンロ設置可（コンロ4口以上）", "コンロ設置可（口数不明）", "システムキッチン", "冷蔵庫あり", "独立キッチン",
    "給湯", "電気コンロ",
]

### broad_com ###
broad_com_cols = [
    "BSアンテナ", "CATV", "CSアンテナ", "インターネット使用料無料",
    "インターネット対応", "光ファイバー", "有線放送", "高速インターネット",
]

### facility ###
facility_cols = [
    "24時間換気システム", "2面採光",
    "3面採光", "ウォークインクローゼット", "エアコン付", "エレベーター", "オール電化", "ガスその他",
    "ガス暖房", "クッションフロア", "シューズボックス", "タイル張り", "トランクルーム", "バリアフリー",
    "バルコニー", "フローリング", "プロパンガス", "ペアガラス", "ルーフバルコニー", "ロフト付き", "下水",
    "二世帯住宅", "二重サッシ", "井戸", "公営水道", "冷房", "出窓", "地下室", "室内洗濯機置場",
    "室外洗濯機置場", "専用庭", "床下収納", "床暖房", "排水その他", "敷地内ごみ置き場", "水道その他",
    "汲み取り", "洗濯機置場なし", "浄化槽", "石油暖房", "都市ガス", "防音室",
]

### parking ###
parking_cols = ["bicycle_parking", "car_parking", "bike_parking"]

### environment ###
env_cols = [
    "デパート", "公園",
    "郵便局", "コインパーキング", "学校", "図書館", "飲食店", "月極駐車場", "銀行", "小学校",
    "ドラッグストア", "レンタルビデオ", "病院", "総合病院", "コンビニ", "大学", "幼稚園・保育園",
    "スーパー", "クリーニング",
]

# Copy each group into the model frames. The groups are appended in this exact
# order, so the resulting column order of train_use / test_use is unchanged.
for group_cols in (bath_toilet_cols, kitchen_cols, broad_com_cols,
                   facility_cols, parking_cols, env_cols):
    train_use[group_cols] = train_processed[group_cols]
    test_use[group_cols] = test_processed[group_cols]


### structure ###
# Ordinal-encode the raw structure string: fitted on the training frame, then
# applied to the test frame with the same mapping.
ce_ordinal = ce.OrdinalEncoder(cols=["structure"], handle_missing="value")
train_use["structure"] = train_processed["structure"]
test_use["structure"] = test_processed["structure"]
train_use = ce_ordinal.fit_transform(train_use)
test_use = ce_ordinal.transform(test_use)

### contract_period ###
# Contract-period columns copied as they are from the preprocessed frames.
period_cols = ["fixed_term", "contract_period_year", "contract_period_month", "contract_period_in_months"]
train_use[period_cols] = train_processed[period_cols]
test_use[period_cols] = test_processed[period_cols]

# nan handling
# Replace every missing value in both model frames with the sentinel -1.
# The fill is done on the whole frame and the name is rebound: calling
# fillna(inplace=True) on a column selection (train_use[col]) is chained
# assignment, which recent pandas warns about and which, with Copy-on-Write,
# no longer writes back into the frame.
# test_use carries the same columns as train_use at this point, so filling the
# whole frame covers the same columns the per-column loop did.
train_use = train_use.fillna(-1)
test_use = test_use.fillna(-1)




In [4]:
# scaling
# The four ordinal-encoded columns are treated as categorical; every other
# column of train_use is treated as continuous.
categorical_cols = ["district", "layout", "direction", "structure"]
con_cols = [col for col in train_use.columns if col not in categorical_cols]

# 80/20 train/validation split with a fixed seed; the target is log1p(rent).
X_train, X_val, y_train, y_val = train_test_split(train_use, target_log, test_size=0.2, random_state=42)

In [5]:
# Sanity check: number of missing values per column in the training split.
X_train.isnull().sum()

district                     0
lati                         0
long                         0
min_to_nearest_sta           0
num_sta                      0
つくばエクスプレス                    0
中央線                          0
丸ノ内線                         0
京急本線                         0
京急空港線                        0
京成押上線                        0
京成本線                         0
京成金町線                        0
京浜東北線                        0
京王井の頭線                       0
京王新線                         0
京王線                          0
京葉線                          0
副都心線                         0
千代田線                         0
半蔵門線                         0
南北線                          0
埼京線                          0
小田急小田原線                      0
山手線                          0
常磐線                          0
日暮里舎人ライナー                    0
日比谷線                         0
有楽町線                         0
東急世田谷線                       0
                            ..
石油暖房                         0
都市ガス    

In [6]:
# Scaler that the following cells fit on the continuous training columns.
sc = StandardScaler()

In [7]:
# scaling
# Same column grouping as in the split cell: the four ordinal-encoded columns
# are categorical, all remaining columns are continuous.
categorical_cols = ["district", "layout", "direction", "structure"]
con_cols = [col for col in train_use.columns if col not in categorical_cols]

In [8]:
# Sanity check: the scaled continuous columns contain no missing values.
# The frame is built without an index argument, so it gets a default 0..n-1 index.
pd.DataFrame(sc.fit_transform(X_train[con_cols]), columns=con_cols).isnull().sum()

lati                         0
long                         0
min_to_nearest_sta           0
num_sta                      0
つくばエクスプレス                    0
中央線                          0
丸ノ内線                         0
京急本線                         0
京急空港線                        0
京成押上線                        0
京成本線                         0
京成金町線                        0
京浜東北線                        0
京王井の頭線                       0
京王新線                         0
京王線                          0
京葉線                          0
副都心線                         0
千代田線                         0
半蔵門線                         0
南北線                          0
埼京線                          0
小田急小田原線                      0
山手線                          0
常磐線                          0
日暮里舎人ライナー                    0
日比谷線                         0
有楽町線                         0
東急世田谷線                       0
東急多摩川線                       0
                            ..
浄化槽                          0
石油暖房    

In [15]:
# Last rows of the scaled frame; note its index is a fresh positional range,
# not the original row labels of X_train.
pd.DataFrame(sc.fit_transform(X_train[con_cols]), columns=con_cols).tail()

Unnamed: 0,lati,long,min_to_nearest_sta,num_sta,つくばエクスプレス,中央線,丸ノ内線,京急本線,京急空港線,京成押上線,...,総合病院,コンビニ,大学,幼稚園・保育園,スーパー,クリーニング,fixed_term,contract_period_year,contract_period_month,contract_period_in_months
25170,0.274553,0.274257,1.666795,0.415198,-0.125912,-0.174459,-0.250487,-0.176404,-0.04611,9.412272,...,-0.187109,0.427924,-0.146754,-0.152445,-0.453077,-0.049802,0.344211,0.487561,0.321157,0.453893
25171,0.266209,0.270504,-1.30601,0.415198,-0.125912,-0.174459,-0.250487,-0.176404,-0.04611,-0.105097,...,-0.187109,0.151364,-0.146754,-0.152445,-0.29465,-0.049802,0.344211,0.487561,0.321157,0.453893
25172,0.271155,0.272298,-0.66898,-1.091811,-0.125912,-0.174459,-0.250487,-0.176404,-0.04611,-0.105097,...,-0.191096,-0.738933,-0.150666,-0.158696,-0.852192,-0.070456,-1.657986,-1.782665,-1.288463,-1.717653
25173,0.266991,0.27194,-0.244294,0.415198,-0.125912,-0.174459,-0.250487,-0.176404,-0.04611,-0.105097,...,-0.191096,-0.738933,-0.150666,-0.158696,-0.852192,-0.070456,-1.657986,-1.782665,-1.288463,-1.717653
25174,0.281075,0.273847,2.728512,-2.59882,-0.125912,-0.174459,-0.250487,-0.176404,-0.04611,-0.105097,...,-0.191096,-0.738933,-0.150666,-0.158696,-0.852192,-0.070456,0.344211,0.487561,0.321157,0.453893


In [17]:
# reset_index(drop=False) exposes the original row labels of X_train as an
# "index" column, next to the new positional index.
X_train[categorical_cols].reset_index(drop=False).tail()

Unnamed: 0,index,district,layout,direction,structure
25170,29802,5,2,1,3
25171,5390,3,1,3,1
25172,860,2,6,3,1
25173,15795,2,3,4,1
25174,23654,19,1,3,3


In [12]:
# NOTE(review): diagnostic cell. The scaled frame is built without an index,
# so it gets a default positional index, while X_train[categorical_cols] keeps
# its original row labels. pd.concat(axis=1) aligns on index labels, which is
# consistent with the NaN-filled rows in the recorded tmp.tail() output.
# Passing index=X_train.index to pd.DataFrame would keep the rows aligned.
tmp = pd.concat([X_train[categorical_cols],
                    pd.DataFrame(sc.fit_transform(X_train[con_cols]), columns=con_cols)], axis=1)

In [14]:
# Inspect the last rows of the concatenated frame built in the previous cell.
tmp.tail()

Unnamed: 0,district,layout,direction,structure,lati,long,min_to_nearest_sta,num_sta,つくばエクスプレス,中央線,...,総合病院,コンビニ,大学,幼稚園・保育園,スーパー,クリーニング,fixed_term,contract_period_year,contract_period_month,contract_period_in_months
31463,22.0,1.0,6.0,3.0,,,,,,,...,,,,,,,,,,
31464,10.0,4.0,3.0,2.0,,,,,,,...,,,,,,,,,,
31465,20.0,2.0,6.0,1.0,,,,,,,...,,,,,,,,,,
31467,16.0,5.0,3.0,3.0,,,,,,,...,,,,,,,,,,
31468,17.0,1.0,1.0,2.0,,,,,,,...,,,,,,,,,,


In [None]:
# scaling
# The four ordinal-encoded columns are kept as categorical codes; every other
# column is standardised.
categorical_cols = ["district", "layout", "direction", "structure"]
con_cols = [col for col in train_use.columns if col not in categorical_cols]

# 80/20 train/validation split with a fixed seed; the target is log1p(rent).
X_train, X_val, y_train, y_val = train_test_split(train_use, target_log, test_size=0.2, random_state=42)

sc = StandardScaler()


def _with_scaled_con_cols(frame, scale):
    """Return `frame`'s categorical columns next to its scaled continuous columns.

    `scale` is the scaler method to apply (fit_transform or transform). The
    scaled block is given `frame`'s own index: pd.concat(axis=1) aligns on
    index labels, and the split keeps the original, shuffled row labels, so a
    default 0..n-1 index would misalign the rows and fill them with NaN.
    """
    scaled = pd.DataFrame(scale(frame[con_cols]), columns=con_cols, index=frame.index)
    return pd.concat([frame[categorical_cols], scaled], axis=1)


# Fit the scaler on the training split only, then reuse it for validation and test.
# NOTE(review): this cell rebinds train_use / test_use, so the feature cell has
# to be re-run before this cell can be executed a second time.
# NOTE(review): missing values were replaced by -1 before scaling, so that
# sentinel takes part in the mean/std of columns such as lati/long - confirm
# this is intended.
train_use = _with_scaled_con_cols(X_train, sc.fit_transform)
val_use = _with_scaled_con_cols(X_val, sc.transform)
test_use = _with_scaled_con_cols(test_use, sc.transform)
