## 賃料についてのドメイン知識
#### たいがのコードをちょい改変
- 路線は出現頻度の上位30路線までを残す
- 最寄り駅以外も残す
- 駅によるかさ増し（行方向へ展開はしない）は行わない
- 駅の正規表現などを少し追加



- https://lobotomy-project.org/marketpricerent/
- https://ameblo.jp/enjoytokyolovelivelife/entry-11084920720.html


In [2]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from scipy.stats import norm, skew
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plotting theme.
sns.set()
import matplotlib
# Select the 'Yu Mincho' font family for matplotlib text
# (presumably so Japanese labels render; assumes the font is installed - TODO confirm).
font = {'family': 'Yu Mincho'}
matplotlib.rc('font', **font)

# Show up to 1000 columns / rows when a DataFrame is displayed.
# The fully-qualified option names are used on purpose: set_option matches
# its argument as a pattern, and the bare 'max_columns' / 'max_rows' match
# more than one option in recent pandas, which raises OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

import re
import geocoder
from geopy.distance import great_circle, vincenty
from tqdm import tqdm
import jaconv
import os
import gc
# print(os.listdir("././input"))
# print(os.listdir("././submit"))

# Random seed; not referenced in the visible part of this notebook
# (presumably consumed by later modelling code - TODO confirm).
SEED=1234
# Not referenced in the visible part of this notebook
# (presumably the number of cross-validation folds - TODO confirm).
n_splits=10

### 所在地

In [3]:
def get_location_lonlat(df):
    '''
    Attach latitude / longitude to every row by looking up its address.

    Reads './input/location.txt' (comma separated, columns 'location',
    ' lat', ' lon' - note the leading blank in the last two names),
    normalises the address text on both sides in the same way and maps the
    per-address maximum latitude / longitude onto `df`.

    The '所在地' column of `df` is overwritten with the normalised text and
    the columns 'loc_lat' / 'loc_lon' are added. `df` is modified in place
    and also returned.
    '''
    def _normalize(address):
        # Unify the minus sign with the full-width hyphen, run jaconv.z2h
        # with digit=True / ascii=True, then drop half-width parentheses.
        converted = jaconv.z2h(address.replace('−','－'), digit=True, ascii=True)
        return converted.replace('(','').replace(')','')

    loc = pd.read_table('./input/location.txt', delimiter=',').rename(columns={'location':'所在地'})

    df['所在地'] = df['所在地'].apply(_normalize)
    loc['所在地'] = loc['所在地'].apply(_normalize)

    # Several rows of the lookup table may share an address: keep the maximum.
    by_address = loc.groupby('所在地')
    df['loc_lat'] = df['所在地'].map(by_address[' lat'].max())
    df['loc_lon'] = df['所在地'].map(by_address[' lon'].max())

    return df

def get_center_dis(df):
    '''
    Add 'center_dis': the distance in metres from each listing to the
    National Diet Building.

    The listing coordinates come from get_location_lonlat (which also adds
    'loc_lat' / 'loc_lon'); the distance is computed with geopy's vincenty.
    Returns the augmented DataFrame.
    '''
    # Fixed reference point used as the city centre, as (lat, lon).
    diet_building = (35.6759323, 139.7450316)

    def _meters_from_center(row):
        listing = (row['loc_lat'], row['loc_lon'])
        return vincenty(listing, diet_building).meters

    df = get_location_lonlat(df)
    df['center_dis'] = df.apply(_meters_from_center, axis=1)
    return df

In [4]:
def get_city(df):
    '''
    Split the address column '所在地' into municipality and district name.

    Adds two columns to `df` (modified in place and returned):
      city  - the first entry of the Tokyo municipality list (in list order)
              that occurs in the address; NaN when none occurs.
      city2 - the address with '東京都' and every municipality name removed,
              cut at the first run of digits.
    '''
    ## Tokyo wards, cities, towns and villages
    siku = ["千代田区","中央区","港区","新宿区","文京区","台東区","墨田区","江東区","品川区","目黒区","大田区","世田谷区","渋谷区","中野区",
            "杉並区","豊島区","北区","荒川区","板橋区","練馬区","足立区","葛飾区","江戸川区","八王子市","立川市","武蔵野市","三鷹市","青梅市",
            "府中市","昭島市","調布市","町田市","小金井市","小平市","日野市","東村山市","国分寺市","国立市","福生市","狛江市","東大和市",
            "清瀬市","東久留米市","武蔵村山市","多摩市","稲城市","羽村市","あきる野市","西東京市","瑞穂町","日の出町","檜原村","奥多摩町",
            "大島町","利島村","新島村","神津島村","三宅村","御蔵島村","八丈町","青ヶ島村","小笠原村",]

    def _find_city(address):
        # First list entry contained in the address; NaN instead of an
        # IndexError when the address names no listed municipality.
        for name in siku:
            if name in address:
                return name
        return np.nan

    def _strip_cities(address):
        # Remove the names one after another, in list order.
        for name in siku:
            address = address.replace(name, '')
        return address

    without_pref = df['所在地'].apply(lambda x: x.replace('東京都',''))

    df['city'] = without_pref.apply(_find_city)

    ## Drop the municipality, then keep the text before the first number
    df['city2'] = without_pref.apply(_strip_cities).apply(lambda x: re.split(r'\d+', x)[0])

    return df

### 間取り

In [5]:
def get_floor_plan(df):
    '''
    Break the floor-plan string '間取り' into separate features.

    Adds '納戸' (1 when the original text contains '納戸'), rewrites '間取り'
    without '(納戸)' / '+S' and with 'LK' replaced by 'LDK', then adds
    '部屋数' (all digits of the cleaned text read as one integer) and the
    0/1 substring flags 'L', 'D', 'K', 'LDK', 'DK'.
    `df` is modified in place and returned.
    '''
    def _contains(token):
        # Build a 0/1 substring test for `token`.
        return lambda plan: int(token in plan)

    # The storeroom flag must be taken before the marker is stripped.
    df['納戸'] = df['間取り'].apply(_contains('納戸'))

    stripped = df['間取り'].apply(lambda plan: plan.replace('(納戸)','').replace('+S',''))
    df['間取り'] = stripped.str.replace('LK', 'LDK')

    df['部屋数'] = df['間取り'].apply(lambda plan: int(re.sub('\\D', '', plan)))

    for token in ('L', 'D', 'K', 'LDK', 'DK'):
        df[token] = df['間取り'].apply(_contains(token))

    return df

### 築年数

In [6]:
def get_age(df):
    '''
    Turn the building-age text '築年数' into a new-build flag and months.

    Adds '新築' (1 when the text is exactly '新築') and replaces '築年数'
    by years * 12 + months, where '新築' counts as 0 years 0 months.
    `df` is modified in place and returned.
    '''
    def _to_months(age_text):
        # Digits before the first '年' are the years, digits of the part
        # after it are the months.
        year_part, month_part = age_text.split('年')[0], age_text.split('年')[1]
        return int(re.sub('\\D', '', year_part))*12 + int(re.sub('\\D', '', month_part))

    raw_age = df['築年数']

    ## New-build flag
    df['新築'] = raw_age.apply(lambda age: int(age == '新築'))

    ## Age in months ('新築' is treated as 0 years 0 months)
    df['築年数'] = raw_age.apply(lambda age: _to_months('0年0ヶ月' if age == '新築' else age))

    return df

### 面積

In [7]:
def get_area(df):
    '''
    Convert the floor-area text '面積' to a float and derive a mat count.

    The last two characters of each value (the unit suffix) are dropped
    before the conversion. '畳' is the area divided by 1.45, truncated to
    an integer. `df` is modified in place and returned.
    '''
    def _drop_unit(text):
        return text[:-2]

    df['面積'] = df['面積'].map(_drop_unit).astype(float)

    ## Mat count: 1.45 is the divisor chosen by the author
    ## (presumably square metres per mat - TODO confirm).
    df['畳'] = df['面積'].map(lambda sqm: int(sqm/1.45))

    return df

### 所在階

In [8]:
# 所在階の処理
def get_floor(data):
    '''
    Split the floor text '所在階' into numeric features.

    Adds, on the DataFrame that is passed in (modified in place and
    returned):
      戸建て   - 1 when the text has no '／' separator, else 0.
      総階数   - number in front of '階建'; NaN when absent.
      地下階数 - number inside '地下N階'; 0 when absent.
    and replaces '所在階' by the number in front of a '階' that is not
    followed by '建'; NaN when there is none.
    '''
    # Work on the argument. The previous version ignored `data` and used a
    # module-level `df`, which only worked when both were the same object.
    df = data

    def _first_number(pattern, text, start, stop, default):
        # Integer cut out of the first regex hit, or `default` without a hit.
        hits = re.findall(pattern, text)
        return int(hits[0][start:stop]) if hits else default

    # Snapshot of the raw text: '所在階' itself is overwritten at the end.
    floor_text = df['所在階'].astype(str)

    df['戸建て'] = df['所在階'].apply(lambda x: 0 if '／' in str(x) else 1)

    # Total floors, basement floors and the floor the unit is on
    df['総階数'] = [_first_number(r'(\d+階建)', t, None, -2, np.nan) for t in floor_text]
    df['地下階数'] = [_first_number(r'(地下\d+階)', t, 2, -1, 0) for t in floor_text]
    df['所在階'] = [_first_number(r'(\d+階[^建][^)])', t, None, -3, np.nan) for t in floor_text]

    return df

### 契約期間

In [9]:
def _contract_months(term, base_year=2019, base_month=10):
    '''
    Convert one contract-period token to a number of months.

    Handles 'N年間', 'N年Mヶ月間', 'Mヶ月間' and 'YYYY年M月まで'. An end date
    is converted to the number of months counted from base_year /
    base_month (October 2019, the reference already hard-coded in the
    original formula). Anything else is returned unchanged.
    '''
    if '年間' in term:
        return 12*int(term.split('年間')[0])
    if '月間' in term and '年' in term:
        return 12*int(term.split('年')[0]) + int(re.sub(r'\D', '', term.split('年')[1]))
    if '月間' in term:
        return int(re.sub(r'\D', '', term))
    if 'まで' in term:
        year = int(re.sub(r'\D', '', term.split('年')[0]))
        month = int(re.sub(r'\D', '', term.split('年')[1]))
        # One formula for every month. The previous special case for months
        # before October added one month too many (Dec 2019 -> 2, Jan 2020 -> 4).
        return 12*(year - base_year) + (month - base_month)
    return term


def get_construct(df):
    '''
    Derive contract features from the text column '契約期間'.

    Adds '借家' (1 when the text contains '定期借家') and '契約期間(まで)'
    (1 when it contains 'まで'), then replaces '契約期間' by the contract
    length in months. Missing values become 0. Only the first
    whitespace-separated token of the text is used for the length.
    `df` is modified in place and returned.
    '''
    ## Fill missing values
    df['契約期間'] = df['契約期間'].fillna('-')

    ## Fixed-term lease flag
    df['借家'] = df['契約期間'].apply(lambda x: 1 if '定期借家' in x else 0)

    ## "until <date>" flag
    df['契約期間(まで)'] = df['契約期間'].apply(lambda x: 1 if 'まで' in x else 0)

    ## Contract length in months
    first_token = df['契約期間'].apply(lambda x: x.split()[0])
    months = first_token.apply(_contract_months)
    df['契約期間'] = months.apply(lambda x: int(x) if x != '-' else 0)

    return df

### バストイレ

In [10]:
def get_BathToilet(df):
    '''
    One-hot encode the bath / toilet equipment listed in 'バス・トイレ'.

    Missing values are replaced by '-'. Every distinct item (the text is
    split on '／' and tabs are removed) becomes a 0/1 column that is 1 when
    the item occurs as a substring of the row's text. The '-' placeholder
    also gets a column, which therefore flags rows without information.
    `df` is modified in place and returned.
    '''
    ## Fill missing values. Assigning the result back (instead of calling
    ## fillna(inplace=True) on the selected column) also works when pandas
    ## hands out a copy of the column.
    df['バス・トイレ'] = df['バス・トイレ'].fillna('-')

    ## Collect the distinct items
    b_t = set()
    for cell in df['バス・トイレ']:
        for item in str(cell).split('／'):
            b_t.add(item.replace('\t',''))

    ## One flag per item; sorted so that the column order does not depend
    ## on set iteration order.
    for bt in sorted(b_t):
        df[bt] = df['バス・トイレ'].apply(lambda x, item=bt: 1 if item in x else 0)

    return df

### キッチン

In [11]:
def get_kitchen(df):
    '''
    One-hot encode the kitchen equipment listed in 'キッチン'.

    Missing values are replaced by '-'. Every distinct item except '-'
    (the text is split on '／' and tabs are removed) becomes a 0/1 column
    that is 1 when the item occurs as a substring of the row's text.
    `df` is modified in place and returned.
    '''
    ## Fill missing values. Assigning the result back (instead of calling
    ## fillna(inplace=True) on the selected column) also works when pandas
    ## hands out a copy of the column.
    df['キッチン'] = df['キッチン'].fillna('-')

    ## Collect the distinct items
    kit = set()
    for cell in df['キッチン']:
        for item in str(cell).split('／'):
            kit.add(item.replace('\t',''))

    ## One flag per item; sorted so that the column order does not depend
    ## on set iteration order.
    for k in sorted(kit):
        if k == '-':
            continue
        df[k] = df['キッチン'].apply(lambda x, item=k: 1 if item in x else 0)

    return df

### 放送・通信

In [12]:
def get_broadcast(df):
    '''
    One-hot encode the broadcast / network services listed in '放送・通信'.

    Missing values are replaced by '-'. Every distinct item except '-'
    (the text is split on '／' and tabs are removed) becomes a 0/1 column
    that is 1 when the item occurs as a substring of the row's text.
    `df` is modified in place and returned.
    '''
    ## Fill missing values. Assigning the result back (instead of calling
    ## fillna(inplace=True) on the selected column) also works when pandas
    ## hands out a copy of the column.
    df['放送・通信'] = df['放送・通信'].fillna('-')

    ## Collect the distinct items
    bro = set()
    for cell in df['放送・通信']:
        for item in str(cell).split('／'):
            bro.add(item.replace('\t',''))

    ## One flag per item; sorted so that the column order does not depend
    ## on set iteration order.
    for br in sorted(bro):
        if br == '-':
            continue
        df[br] = df['放送・通信'].apply(lambda x, item=br: 1 if item in x else 0)

    return df

### 周辺環境

In [13]:
def get_env(df):
    '''
    Distance to and number of nearby facilities listed in '周辺環境'.

    Missing values are replaced by '-'. Facility categories are the
    '【...】' labels found in the tab-separated entries. For every category
    two columns are added:
      <label>_dis - distance in metres of the FIRST entry of that category
                    in the row (0 when the row has none).
      <label>_num - how many entries of that category the row has.
    `df` is modified in place and returned.
    '''
    ## Fill missing values. Assigning the result back (instead of calling
    ## fillna(inplace=True) on the selected column) also works when pandas
    ## hands out a copy of the column.
    df['周辺環境'] = df['周辺環境'].fillna('-')

    ## Collect the category labels
    en = set()
    for cell in df['周辺環境']:
        for entry in str(cell).split('\t'):
            if entry == '-':
                continue
            labels = re.findall('【.*】', entry)
            # Entries without a label are skipped instead of raising.
            if labels:
                en.add(labels[0])

    # Split every row on whitespace once; a label is followed by its
    # distance token such as '495m'.
    tokens = df['周辺環境'].apply(lambda x: str(x).split())

    for e in sorted(en):
        ## Distance of the first entry of this category.
        ## NOTE(review): this is the first listed entry, which is the
        ## nearest one only if the source lists entries by distance.
        df[str(e)+'_dis'] = tokens.apply(
            lambda t, label=e: int(t[t.index(label)+1].replace('m','')) if label in t else 0)
        ## Number of entries of this category
        df[str(e)+'_num'] = tokens.apply(lambda t, label=e: t.count(label))

    return df

### 室内設備

In [14]:
def get_facilities(df):
    '''
    One-hot encode the room equipment listed in '室内設備'.

    Missing values are replaced by '-'. The text is split on whitespace
    and '／' is removed from every item; each distinct item except '-' and
    the empty string becomes a 0/1 column that is 1 when the item occurs
    as a substring of the row's text.
    `df` is modified in place and returned.
    '''
    ## Fill missing values. Assigning the result back (instead of calling
    ## fillna(inplace=True) on the selected column) also works when pandas
    ## hands out a copy of the column.
    df['室内設備'] = df['室内設備'].fillna('-')

    ## Collect the distinct items
    fac = set()
    for cell in df['室内設備']:
        for item in str(cell).split():
            fac.add(item.replace('／',''))

    ## One flag per item; sorted so that the column order does not depend
    ## on set iteration order.
    for f in sorted(fac):
        if f == '-' or f == '':
            continue
        df[f] = df['室内設備'].apply(lambda x, item=f: 1 if item in x else 0)

    return df

### 駐車場

In [15]:
def get_park(df):
    '''
    Extract the status of bicycle, car and motorbike parking from '駐車場'.

    Missing values are replaced by '-'. For each of '駐輪場', '駐車場' and
    'バイク置き場' a column of that name is written, holding the token that
    follows the label in the whitespace-split text (cut at the first '('),
    or '-' when the label is absent. Note that the '駐車場' column is thereby
    replaced by the extracted car-parking status.
    `df` is modified in place and returned.
    '''
    ## Fill missing values (assigned back rather than inplace on a selection)
    df['駐車場'] = df['駐車場'].fillna('-')

    # Keep the raw text: the loop overwrites df['駐車場'], and reading from
    # the overwritten column made every later label come out as '-'.
    raw_text = df['駐車場'].copy()

    def _status(text, label):
        parts = text.split()
        if label not in parts:
            return '-'
        nxt = parts.index(label) + 1
        # A label at the very end has no status token after it.
        if nxt >= len(parts):
            return '-'
        return parts[nxt].split('(')[0]

    park = ['駐輪場','駐車場','バイク置き場']

    for p in park:
        df[p] = raw_text.apply(_status, label=p)

    return df

### アクセス

In [16]:
import mojimoji
def get_access(data,in_col='アクセス',out_cols=['線名','駅名']):
    '''
    Parse the access text into railway-line and station features.

    Works on a copy of `data` and returns the copy:
      * column `in_col` is replaced by the list of tokens of its text,
      * out_cols[0] receives the list of line names found in the tokens,
        restricted to the 30 names in `main_line`,
      * out_cols[1] receives the list of cleaned station names,
      * one 0/1 column per name in `main_line` says whether the row's line
        list contains that name.

    `out_cols` is a list default but is only indexed, never modified.
    '''
    data2=data.copy()
    # The 30 line names that are kept as features.
    main_line= ['山手線', '中央線', '大江戸線', '丸ノ内線', '三田線', '新宿線', '総武線', '東西線', '有楽町線',
   '日比谷線', '西武池袋線', '京王線', '西武新宿線', '東武東上線', '浅草線', '京浜東北線', '千代田線', '本線',
   '南北線', '小田急小田原線', '田園都市線', '京王井の頭線', '池上線', '東横線', '京成本線', '大井町線',
   '半蔵門線', '常磐線', '世田谷線', '目黒線']
    def separate_all(x):
        # Tokenise on tab, ideographic space, '・' and '：'.
        list_ = re.split('\t|\u3000|・|：', x)
        # Remove parentheses (half- and full-width) together with their content.
        list_ = [re.sub('\(.*?\)|\（.*?\）', '', x) for x in list_]
        # Unify the spelling of the Marunouchi line.
        list_ =[re.sub('丸の内', '丸ノ内', x) for x in list_]
        # Drop empty tokens and convert the rest with mojimoji.zen_to_han (kana=False).
        list_ = [mojimoji.zen_to_han(x, kana=False) for x in list_ if x]
        return list_

    def extract_rails(List, word='線'):

        # Keep tokens containing `word`, cut after its first occurrence.
        New_list=[x.split(word)[0] + word for x in List if re.search(word, x)]
        # Strip operator / prefecture prefixes.
        # NOTE(review): the trailing '|' adds an empty alternative to the pattern; it looks unintended but removes nothing.
        New_list=[re.sub('東京都|JR|東急|都営|京急|東急メトロ|東京メトロ|小田急電鉄|','',x) for x in New_list]
        # Only the lines listed in main_line survive.
        New_list=[x for x in New_list if x in main_line]

        return New_list

    def extract_stations(List, word='駅'):
        # Keep tokens containing `word`, cut after its first occurrence.
        List = [x.split(word)[0] + word for x in List if re.search(word, x)]
        New_List = []
        for x in List:
            # When a line name precedes the station, keep the part after the first '線'.
            if re.search('線', x):
                x = x.split('線')[1]
            # NOTE(review): the split pattern has a capturing group, so element [1] of the
            # result is the matched separator itself ('＼' or 'バス'), not the text after it;
            # a 'バス' result is emptied by the replace below and then filtered out - confirm this is intended.
            if re.search('(＼|バス)', x):
                x = re.split('(＼|バス)', x)[1]
            # Strip operator names, bus route codes and brackets, and apply a few spelling fixes.
            x=x.replace('バス','').replace('東京臨海高速鉄道','').replace('』','').replace('『','')\
                          .replace('ゆりかもめ','').replace('001/','').replace('JR','').replace('「','').replace('」','')\
                          .replace('JR','').replace('つくばエクスプレス','').replace('日暮里舎人ライナー','').replace('見01/','')\
                          .replace('見01/','').replace('ステーション駅','').replace('錦22/','').replace('.','')\
                          .replace('ヶ','ケ').replace('京急','').replace('桜上上水駅 ','桜上水駅 ').replace('興沢駅','奥沢駅')\
                          .replace('東京モノレール整備場駅 ','').replace('京浜','').replace('都電','').replace('亀29/','').replace('東京都','')
            New_List.append(x)
        # Drop names that became empty and convert the rest with mojimoji.zen_to_han (kana=False).
        New_List = [mojimoji.zen_to_han(x, kana=False) for x in New_List if x]
        return New_List
    def check_list(List, word):
        # 1 when `word` is one of the list entries, else 0.
        return 1 if word in set(List) else 0
    data2[in_col] = data2[in_col].apply(separate_all)
    data2[out_cols[0]] = data2[in_col].apply(extract_rails, word='線')
    data2[out_cols[1]] = data2[in_col].apply(extract_stations, word='駅')
    # One indicator column per main line.
    for spot in main_line:
        data2[spot] = data2[out_cols[0]].apply(check_list,word=spot)
    return data2

#### 駅の緯度経度追加

In [17]:
def get_station_latlon(df):
    '''
    Attach the coordinates of the station named in '駅名'.

    Reads './input/station2.txt' (comma separated, columns 'station',
    'lat', 'lon') and maps the per-station maximum latitude / longitude
    onto `df` as 'sta_lat' / 'sta_lon'. Stations missing from the file get
    NaN. `df` is modified in place and returned.
    '''
    ## Station coordinates
    sta = pd.read_table('./input/station2.txt', delimiter=',').rename(columns={'station':'駅名'})

    ## A station may appear on several rows: keep the maximum of each axis
    best = sta.groupby('駅名')[['lat', 'lon']].max()

    for axis_name in ('lat', 'lon'):
        df['sta_' + axis_name] = df['駅名'].map(best[axis_name])

    return df

In [18]:
def get_loc_type(data,out_col='loc_type'):
    '''
    Assign every row to a cell of a regular latitude / longitude grid.

    The grid starts at lon 139.569 / lat 35.54 and has 12 x 9 cells of
    0.03 degrees. A row strictly inside cell (i, j) - i counting
    longitude steps, j latitude steps - gets the id i*12+j in `out_col`.
    Rows outside the grid, on a cell border or without coordinates keep
    the default 0 (which is also the id of cell (0, 0)).

    Works on a copy; `data` itself is left untouched.
    '''
    lon0, lat0, step = 139.569, 35.54, 0.03
    n_lon, n_lat = 12, 9

    data2=data.copy()
    # Initialise the copy (the previous version wrote the default into the
    # caller's frame instead).
    data2[out_col]=0

    lon = data2['loc_lon']
    lat = data2['loc_lat']
    for i in range(n_lon):
        in_band = (lon > lon0 + i*step) & (lon < lon0 + (i+1)*step)
        if not in_band.any():
            continue
        for j in range(n_lat):
            # The upper latitude bound is one step above the lower one
            # (the previous version used (j+i) steps, which left the first
            # longitude band empty and made cells overlap).
            in_cell = in_band & (lat > lat0 + j*step) & (lat < lat0 + (j+1)*step)
            data2.loc[in_cell, out_col] = i*12+j
    return data2


In [19]:
def get_manshon(df, in_col='駅名'):
    '''
    Attach per-station condominium price statistics.

    Reads './input/manshon.csv' and maps, via the station name in
    `in_col`, the per-station minimum of '2012年平均単価', '2017年平均単価'
    and '上昇率' onto `df`. For each of the three it then adds the mean,
    max and min per 'id' as '<prefix>_mean' / '_max' / '_min', where the
    prefix is '2012平均単価', '2017平均単価' or '上昇率'.
    `df` is modified in place and returned.
    '''
    ## Average unit price of condominiums per station
    manshon = pd.read_csv('./input/manshon.csv')
    per_station = manshon.groupby('駅名')

    # (source column, prefix of the derived columns)
    targets = [('2012年平均単価', '2012平均単価'),
               ('2017年平均単価', '2017平均単価'),
               ('上昇率', '上昇率')]

    for source, _ in targets:
        df[source] = df[in_col].map(per_station[source].min())

    # Compute every per-id aggregate first, then attach them.
    per_id = df.groupby('id')
    aggregates = {}
    for source, prefix in targets:
        for stat in ('mean', 'max', 'min'):
            aggregates[prefix + '_' + stat] = getattr(per_id[source], stat)()

    for column, values in aggregates.items():
        df[column] = df.id.map(values)

    return df


### データ整形

In [25]:
# Load both data sets and stack them (train rows first) under a fresh
# 0..n-1 index so that they can be preprocessed together.
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
id_max = train.id.max()
df = pd.concat([train, test], ignore_index=True)

print('訓練データ : ', train.shape)
print('テストデータ : ', test.shape)

訓練データ :  (31470, 17)
テストデータ :  (31262, 16)


In [26]:
### Taiga's hand-made address conversion
# NOTE(review): unpickling executes code stored in the file; only load
# pickles that were produced by the team itself.
import pickle
with open('./team_data/df_loc.pkl', mode='rb') as pkl_file:
    location = pickle.load(pkl_file)

# Replace the address column with the converted one.
df['所在地'] = location['所在地']

In [27]:
# Keep an untouched snapshot so that preprocessing can be re-run from scratch.
raw_df = df.copy(deep=True)

In [28]:
%%time
## Preprocessing: run every feature builder, in this order, on a fresh copy
## of the raw data.
df = raw_df.copy()
for preprocess in (
    get_BathToilet,
    get_access,
    get_age,
    get_area,
    get_broadcast,
    get_center_dis,
    get_city,
    get_construct,
    get_env,
    get_facilities,
    get_floor,
    get_floor_plan,
    get_kitchen,
):
    # Rebind the module-level `df` after every step so that it always refers
    # to the current frame.
    df = preprocess(df)


CPU times: user 11 s, sys: 69.9 ms, total: 11.1 s
Wall time: 11 s


In [29]:
from pathlib import Path
# Build './team_data/modify_test_target.csv' once: for every test listing with
# a full address (one containing '-'), look for training listings that agree
# on all columns in `match_cols` and, when they all carry one single rent,
# record that rent for the test listing.
if not Path('./team_data/modify_test_target.csv').is_file():
    # Flag the test rows: they are the rows from position len(train) onwards.
    # Written through .loc because assigning to a column of the slice
    # df[n:] may only change a temporary copy.
    df['Flag']=0
    df.loc[df.index[train.shape[0]:], 'Flag'] = 1

    df['loc_flag'] = df['所在地'].apply(lambda x: 1 if '-' in x else 0)
    df_c = df[df['loc_flag']==1]
    df_e = df[df['loc_flag']==0]

    # Columns that must all be equal for two listings to count as the same unit.
    match_cols = ['所在地','築年数','所在階','契約期間','方角','面積','2面採光','シャワー',
                  'CATV','BSアンテナ','専用トイレ','敷地内ごみ置き場','脱衣所','専用バス',
                  'エアコン付','ウォークインクローゼット','光ファイバー','コンロ2口','タイル張り',
                  'インターネット対応','独立キッチン','【スーパー】_dis']

    df_c_tr = df_c.loc[df_c['Flag']==0, ['id','賃料'] + match_cols]
    df_c_te = df_c.loc[df_c['Flag']==1, ['id'] + match_cols].copy()
    # Create the target column up front so that the export below also works
    # when no listing could be matched.
    df_c_te['賃料'] = np.nan

    count = 0
    for idx in tqdm(df_c_te.index,position=0):
        id_ = df_c_te.loc[idx,'id']

        # NaN never compares equal, so rows with a missing key never match.
        same_listing = pd.Series(True, index=df_c_tr.index)
        for c in match_cols:
            same_listing &= (df_c_tr[c] == df_c_te.loc[idx, c])

        target = df_c_tr.loc[same_listing, '賃料'].unique()

        # Only an unambiguous match is used. Several different rents were
        # skipped before as well, through a failing assignment hidden by a
        # bare `except` (assumes ids are unique - TODO confirm).
        if len(target) == 1:
            df_c_te.loc[df_c_te['id']==id_,'賃料'] = target[0]
            count += 1
    print('変更した数',count)
    df_c_te[['id','賃料']].dropna().to_csv('./team_data/modify_test_target.csv')
    df.drop(columns='Flag',inplace=True)

In [30]:
import jaconv
# Convert the addresses with jaconv.z2h (digit=True, ascii=True) and drop parentheses.
df['所在地'] = df['所在地'].apply(lambda x: jaconv.z2h(x,digit=True,ascii=True).replace('(','').replace(')',''))

# loc_flag = 1: the address contains '-'; 0: it does not.
df['loc_flag'] = df['所在地'].apply(lambda x: 1 if '-' in x else 0)
mini_df = df[['id','築年数','総階数','面積','所在地','loc_flag']]
df_c = mini_df[mini_df['loc_flag']==1]
df_e = mini_df[mini_df['loc_flag']==0]

# For every address without '-', look for addresses with '-' that contain it
# and belong to a listing with the same age and total floors (and, if that is
# still ambiguous, the same area). A unique candidate replaces the address.
for idx in tqdm(df_e.index,position=0):
    id_, loc_, age_, floor_, area_ = (
        df_e.loc[idx, c] for c in ('id', '所在地', '築年数', '総階数', '面積')
    )

    df_c['丁目_flag'] = df_c['所在地'].apply(lambda x: 1 if loc_ in x else 0)
    ddd = df_c[df_c['丁目_flag']==1]

    same_building = (ddd['築年数']==age_) & (ddd['総階数']==floor_)
    same_loc = ddd[same_building]['所在地'].unique()
    if len(same_loc)>1:
        same_loc = ddd[same_building & (ddd['面積']==area_)]['所在地'].unique()
    if len(same_loc)==1:
        df.loc[df['id']==id_,'所在地'] = same_loc

100%|██████████| 23977/23977 [03:56<00:00, 101.54it/s]


### 駅に関する前処理

In [31]:
# Coordinates by address, parking status and grid cell, applied in this order.
df = (
    df.pipe(get_location_lonlat)
      .pipe(get_park)
      .pipe(get_loc_type)
)

In [32]:
# Build dictionaries that map a station name to its latitude / longitude.
sta = (
    pd.read_table('./input/station2.txt', delimiter=',')
      .rename(columns={'station':'駅名'})
      .set_index('駅名', drop=True)
)

sta_dict = sta.to_dict()
def convert_sta_lat(_list):
    '''Return one (lat, lon) tuple per station name; NaN for unknown names.'''
    lat_of, lon_of = sta_dict['lat'], sta_dict['lon']
    return [(lat_of.get(name, np.nan), lon_of.get(name, np.nan)) for name in _list]
# Convert the column of station-name lists into lists of coordinates.
df['lat_sta'] = df['駅名'].apply(convert_sta_lat)

In [33]:
def get_geo_features(lat,lon,station,lat_sta):
    '''
    Distance statistics between one listing and its stations.

    Written for a row-wise DataFrame.apply, hence the pd.Series return.

    Parameters:
      lat, lon - coordinates of the listing (floats, NaN when unknown)
      station  - list of station names of the listing
      lat_sta  - list of (lat, lon) tuples, one per entry of `station`

    Returns a Series of seven values: the array of distances in metres
    (stations without coordinates are left out), their mean, median and
    maximum, and the names of the nearest, second and third nearest
    station. Values that cannot be computed are NaN.
    '''
    # Keep the name next to every distance. The previous version indexed the
    # full `station` list with positions taken from the filtered distance
    # array, which picked the wrong name once a station had been skipped.
    located = []
    measured = []
    if not np.isnan(lat):
        for name, coord in zip(station, lat_sta):
            if not np.isnan(coord[0]):
                located.append(name)
                measured.append(vincenty((lat,lon),coord).meters)
    distances = np.array(measured)

    if len(distances) > 0:
        _mean = np.mean(distances)
        _med = np.median(distances)
        _max = np.max(distances)
        nearest_station = located[np.argmin(distances)]
    else:
        _mean = _med = _max = nearest_station = np.nan

    ranking = distances.argsort()
    second_station = located[ranking[1]] if len(distances) > 1 else np.nan
    thrid_station = located[ranking[2]] if len(distances) > 2 else np.nan

    return pd.Series([distances,_mean,_med,_max,nearest_station,second_station,thrid_station])

# NOTE(review): the third value returned by get_geo_features is the median,
# but it is stored under the name 'sta_min'; the name is kept as it is.
df[['distances','sta_mean','sta_min','sta_max','nearest_sta','second_sta','third_sta']] = df.apply(
    lambda row: get_geo_features(row['loc_lat'], row['loc_lon'], row['駅名'], row['lat_sta']),
    axis=1,
)

In [34]:
import jaconv
df['所在地'] = df['所在地'].apply(lambda x: jaconv.z2h(x,digit=True, ascii=True))

# Land-price table; get_city adds 'city' / 'city2', and '丁目' is the address
# up to and including the first '目' (NaN when there is none).
tika = pd.read_csv('team_data/2019_TAKUCHI_k_13_after.csv',encoding='cp932')
tika = get_city(tika)
tika['丁目'] = tika['所在地'].apply(lambda x: x.split('目')[0]+'目' if '目' in x else np.nan)


for feature in ['鑑定評価額','1m2当たりの価格']:
    print(feature)
    for key_feature in ['所在地','丁目','city2']:
        tika_map = tika.groupby(key_feature)[feature].mean()
        # The city2 averages are looked up by df['city2']; the other two
        # are looked up by the full address in df['所在地'].
        lookup_col = 'city2' if key_feature == 'city2' else '所在地'
        df[feature+'_'+key_feature] = df[lookup_col].map(tika_map)

    # Combine: start from the address-level value and fill the gaps from
    # the coarser levels. The final '所在地' step cannot fill anything new;
    # it is kept to mirror the original sequence.
    df[feature] = df[feature+'_所在地']
    for fallback in ('丁目', 'city2', '所在地'):
        df[feature] = df[feature].fillna(df[feature+'_'+fallback])

鑑定評価額
1m2当たりの価格


In [35]:
def get_landprice(df, k=50):
    '''
    Attach the land prices of the k nearest land-price points.

    NOTE: a later cell defines get_landprice again with k=3; that later
    definition is the one in effect when get_landprice(df) is called below.

    Reads five 'L01-18_*.geojson' and five 'L02-18_*.geojson' files, keeps
    field L01_006 / L02_006 (as int) as '地価(単位面積)' together with the
    point coordinates, and looks up the k nearest points of every row of
    `df` by ('loc_lat', 'loc_lon'). For each neighbour i it adds
    '地価(単位面積)_neighbor_i' and '地価_neighbor_i' (the former multiplied
    by the row's '面積'), plus the mean and std over the neighbours of both.
    Returns a new DataFrame: `df` with the new columns appended.
    '''
    import geopandas as gpd
    from sklearn.neighbors import KNeighborsRegressor # NearestNeighbors would also do; kept as is for convenience

    # Load the land-price data and keep only coordinates and price
    fnames = ["../hutagami/data/L01-18_%s.geojson"%i for i in ['08', '11', '12', '13', '14']]
    geo_df = pd.DataFrame(columns=['地価(単位面積)', 'loc_lat', 'loc_lon'])
    for fname in fnames:
        geo = gpd.read_file(fname)
        #sample = pd.DataFrame((geo['L01_006'].astype(int) * geo['L01_024'].astype(int)), columns=['地価'])
        sample = pd.DataFrame(geo['L01_006'].astype(int).values, columns=['地価(単位面積)'])
        sample['geometry'] = geo['geometry']
        # Parse the text form of the point, 'POINT (<lon> <lat>)', into [lon, lat].
        # NOTE(review): str.replace('(', '') depends on the pandas regex default - consider regex=False.
        sample['lonlat'] = sample['geometry'].astype(str).str.replace('POINT ', '').str.replace('(', '').str.replace(')', '').str.split(' ')
        sample['loc_lat'] = sample['lonlat'].map(lambda x: float(x[1]))
        sample['loc_lon'] = sample['lonlat'].map(lambda x: float(x[0]))
        sample = sample[['地価(単位面積)', 'loc_lat', 'loc_lon']]
        geo_df = pd.concat([geo_df, sample], axis=0).reset_index(drop=True)

    # Same procedure for the L02 files; their rows are appended to geo_df.
    fnames = ["../hutagami/data/L02-18_%s.geojson"%i for i in ['08', '11', '12', '13', '14']]
    #geo_df = pd.DataFrame(columns=['地価(単位面積)', 'loc_lat', 'loc_lon'])
    for fname in fnames:
        geo = gpd.read_file(fname)
        #sample = pd.DataFrame((geo['L01_006'].astype(int) * geo['L01_024'].astype(int)), columns=['地価'])
        sample = pd.DataFrame(geo['L02_006'].astype(int).values, columns=['地価(単位面積)'])
        sample['geometry'] = geo['geometry']
        sample['lonlat'] = sample['geometry'].astype(str).str.replace('POINT ', '').str.replace('(', '').str.replace(')', '').str.split(' ')
        sample['loc_lat'] = sample['lonlat'].map(lambda x: float(x[1]))
        sample['loc_lon'] = sample['lonlat'].map(lambda x: float(x[0]))
        sample = sample[['地価(単位面積)', 'loc_lat', 'loc_lon']]
        geo_df = pd.concat([geo_df, sample], axis=0).reset_index(drop=True)


    n_neighbors = k # default=5
    # Whether to weight by distance
    #weights = 'uniform' # no
    weights = 'distance' # yes

    # The regressor is fitted but only its neighbour search (kneighbors) is used below.
    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    knn.fit(geo_df.drop(['地価(単位面積)'], axis=1), geo_df['地価(単位面積)'])

    # Row labels of the k nearest land-price points for every row of df ...
    res = knn.kneighbors(df[['loc_lat', 'loc_lon']], n_neighbors, return_distance=False)
    # ... turned into a (rows x k) table of their unit prices.
    res = geo_df['地価(単位面積)'].loc[res.flatten()].values.flatten().reshape(res.shape).astype(int)
    res = pd.DataFrame(res, columns=['地価(単位面積)_neighbor_%s'%(i+1) for i in range(n_neighbors)], index=df.index).astype(float)
    # Unit price times the listing's floor area.
    for i in range(n_neighbors):
        res['地価_neighbor_%s'%(i+1)] = (res['地価(単位面積)_neighbor_%s'%(i+1)] * df['面積']).astype(float)

    res['地価_neighbor_mean'] = res[['地価_neighbor_%s'%(i+1) for i in range(n_neighbors)]].mean(axis=1).astype(float)
    res['地価_neighbor_std'] = res[['地価_neighbor_%s'%(i+1) for i in range(n_neighbors)]].std(axis=1).astype(float)
    res['地価(単位面積)_neighbor_mean'] = res[['地価(単位面積)_neighbor_%s'%(i+1) for i in range(n_neighbors)]].mean(axis=1).astype(float)
    res['地価(単位面積)_neighbor_std'] = res[['地価(単位面積)_neighbor_%s'%(i+1) for i in range(n_neighbors)]].std(axis=1).astype(float)

    # Use the line below when the per-neighbour columns are not needed
    #res = res[['地価_neighbor_mean', '地価_neighbor_std', '地価(単位面積)_neighbor_mean', '地価(単位面積)_neighbor_std']]

    df = pd.concat([df, res], axis=1)

    return df

In [36]:
# Land-price table (not used further in the visible part of this notebook).
takuchi_df = pd.read_csv('./team_data/2019_TAKUCHI_k_13_after.csv', encoding='cp932')

# Station table: expose the station name as 'nearest_sta' with a trailing
# '駅', the form used by df['nearest_sta'].
station_df = (
    pd.read_csv('./team_data/tokyo_station.csv', index_col=0)
      .rename(columns={'N02_005':'nearest_sta'})
)
station_df['nearest_sta'] = station_df['nearest_sta'].map(lambda name: name+'駅')

In [37]:
# Attach '駅のサイズ(路線数)' and '乗降人数' of the nearest station, then keep
# the first row per listing id and renumber the index.
df = (
    df.merge(station_df[['nearest_sta','駅のサイズ(路線数)','乗降人数']], how='left', on='nearest_sta')
      .drop_duplicates(subset='id')
      .reset_index(drop=True)
)

In [38]:
# Remove the raw text columns and the intermediate list columns.
df = df.drop(columns=[
    'アクセス', 'キッチン', 'バス・トイレ', '周辺環境', '室内設備', '所在地', '放送・通信',
    '駅名', 'lat_sta', 'distances', '線名',
])



In [39]:
%%time
col = 'nearest_sta'
# Manual corrections of the nearest station: station name -> row labels of df.
nearest_sta_fixes = {
    '六町駅': [3538, 3657],
    '仙川駅': [3575, 19394, 19927, 32832, 39702, 42060, 54136, 58727],
    '西新井大師西駅': [8979, 9661],
    '八潮駅': [12681, 18340, 32907, 38358, 40151, 44711, 49879, 59459],
    '新小岩駅': [23470],
    '志村三丁目駅': [25446],
    '千川駅': [35918],
    '大泉学園駅': [36331],
    '千歳烏山駅': [41485],
    '王子駅': [48243],
    '浮間舟渡駅': [58322],
    '谷塚駅': [61075],
}
for station_name, row_labels in nearest_sta_fixes.items():
    for row_label in row_labels:
        df.loc[row_label, col] = station_name
## ------ end of the nearest-station corrections ------
df = get_manshon(df,'nearest_sta')

CPU times: user 159 ms, sys: 18 µs, total: 159 ms
Wall time: 190 ms


In [48]:
def get_tokyo_station(df, load_mode=True):
    '''
    Count the stations and lines within reach of every listing.

    Reads the station table and the projected listing coordinates from
    '../hutagami/data/', adds 'x' / 'y' to `df` (looked up by 'id') and
    then, per listing, considers the station rows closer than 1000 (in the
    unit of the x / y columns - presumably metres, TODO confirm). Adds:
      利用可能駅数, 利用可能路線数       - distinct station / line names in reach
      最短駅m                            - distance to the closest one (NaN if none)
      最短駅路線数, 最短駅乗降人数       - size / passengers of the closest one (0 if none)
      利用可能駅最大路線数, 利用可能駅最大乗降人数 - maxima over those in reach
    `load_mode` is not used. `df` is modified in place and returned.
    '''
    # Coordinate conversion helpers run in parallel
    from joblib import Parallel, delayed
    # Station positions
    tokyo_station = pd.read_csv('../hutagami/data/tokyo_station.csv', index_col='Unnamed: 0')
    xy = pd.read_csv('../hutagami/data/location_xy.csv', index_col='index')
    # Re-label the coordinate rows 1..n so that they can be looked up by id.
    xy.index = range(1, len(xy)+1)
    listing_ids = df['id'].values
    df['x'] = xy['x'].loc[listing_ids].values
    df['y'] = xy['y'].loc[listing_ids].values

    # Stations and lines in reach of one listing
    def in_distance(df_i, com_df=tokyo_station, distance=1000):
        y, x = df_i[0], df_i[1]
        # Planar distance from the listing to every station row
        cond = np.sqrt((com_df['x'] - x)**2 + (com_df['y'] - y)**2)
        in_reach = cond < distance
        com_ = com_df[in_reach]
        cond_ = cond[in_reach]
        min_distance = cond_.min()
        min_index = cond_[cond_==min_distance].index

        avail_station = len(com_['N02_005'].unique())
        avail_root = len(com_['N02_003'].unique())
        # The appended 0 is the fallback when nothing is in reach.
        nearest_station_size = (list(com_df.loc[min_index, '駅のサイズ(路線数)'])+[0])[0]
        max_station_size = com_['駅のサイズ(路線数)'].max()
        nearest_station_people = (list(com_df.loc[min_index, '乗降人数'])+[0])[0]
        max_station_people = com_['乗降人数'].max()

        return (avail_station, avail_root, min_distance, nearest_station_size,
                max_station_size, nearest_station_people, max_station_people)

    res = Parallel(n_jobs=6)([delayed(in_distance)(pair) for pair in zip(df['y'], df['x'])])

    # One output column per element of the tuples returned by in_distance.
    res_table = np.array(res)
    out_names = ['利用可能駅数', '利用可能路線数', '最短駅m', '最短駅路線数',
                 '利用可能駅最大路線数', '最短駅乗降人数', '利用可能駅最大乗降人数']
    for position, out_name in enumerate(out_names):
        df[out_name] = res_table[:, position]

    return df

In [49]:
def get_landprice(df, k=3):
    '''
    Attach the land prices of the k nearest land-price points.

    Reads five 'L01-18_*.geojson' and five 'L02-18_*.geojson' files, keeps
    field L01_006 / L02_006 (as int) as '地価(単位面積)' together with the
    point coordinates, prints the number of points, and looks up the k
    nearest points of every row of `df` by ('loc_lat', 'loc_lon'). For
    each neighbour i it adds '地価(単位面積)_neighbor_i' and
    '地価_neighbor_i' (the former multiplied by the row's '面積'), plus the
    mean and std over the neighbours of both.
    Returns a new DataFrame: `df` with the new columns appended.
    '''
    import geopandas as gpd
    from sklearn.neighbors import KNeighborsRegressor

    price_col = '地価(単位面積)'
    area_codes = ['08', '11', '12', '13', '14']

    def _load_points(fname, value_field):
        # One file -> DataFrame with unit price, latitude and longitude.
        geo = gpd.read_file(fname)
        sample = pd.DataFrame(geo[value_field].astype(int).values, columns=[price_col])
        sample['geometry'] = geo['geometry']
        # Parse the text form of the point, 'POINT (<lon> <lat>)'.
        # regex=False: '(' and ')' must be taken literally whatever the
        # pandas default for `regex` is.
        sample['lonlat'] = (sample['geometry'].astype(str)
                            .str.replace('POINT ', '', regex=False)
                            .str.replace('(', '', regex=False)
                            .str.replace(')', '', regex=False)
                            .str.split(' '))
        sample['loc_lat'] = sample['lonlat'].map(lambda p: float(p[1]))
        sample['loc_lon'] = sample['lonlat'].map(lambda p: float(p[0]))
        return sample[[price_col, 'loc_lat', 'loc_lon']]

    # Load every file (L01 first, then L02) and concatenate once.
    frames = [_load_points("../hutagami/data/L01-18_%s.geojson"%code, 'L01_006') for code in area_codes]
    frames += [_load_points("../hutagami/data/L02-18_%s.geojson"%code, 'L02_006') for code in area_codes]
    geo_df = pd.concat(frames, axis=0, ignore_index=True)

    print(len(geo_df))

    n_neighbors = k
    # Distance weighting only matters for predict(), which is not called;
    # the model is used for its neighbour search.
    weights = 'distance'

    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    knn.fit(geo_df[['loc_lat', 'loc_lon']], geo_df[price_col])

    # Positions of the k nearest points for every row, then their prices.
    neighbor_pos = knn.kneighbors(df[['loc_lat', 'loc_lon']], n_neighbors, return_distance=False)
    unit_prices = geo_df[price_col].values[neighbor_pos].astype(int)

    unit_cols = ['地価(単位面積)_neighbor_%s'%(i+1) for i in range(n_neighbors)]
    total_cols = ['地価_neighbor_%s'%(i+1) for i in range(n_neighbors)]

    res = pd.DataFrame(unit_prices, columns=unit_cols, index=df.index).astype(float)
    # Unit price times the listing's floor area.
    for unit_col, total_col in zip(unit_cols, total_cols):
        res[total_col] = (res[unit_col] * df['面積']).astype(float)

    res['地価_neighbor_mean'] = res[total_cols].mean(axis=1).astype(float)
    res['地価_neighbor_std'] = res[total_cols].std(axis=1).astype(float)
    res['地価(単位面積)_neighbor_mean'] = res[unit_cols].mean(axis=1).astype(float)
    res['地価(単位面積)_neighbor_std'] = res[unit_cols].std(axis=1).astype(float)

    df = pd.concat([df, res], axis=1)

    return df

In [50]:
%%time
# Nearest land prices first, then the station reachability features.
df = df.pipe(get_landprice).pipe(get_tokyo_station)

12043
CPU times: user 12.8 s, sys: 221 ms, total: 13 s
Wall time: 40.2 s


In [53]:
# Rows with a rent are training data; rows without one are the test set.
rent_missing = df['賃料'].isnull()
train = df[~rent_missing]
test = df[rent_missing]

In [54]:
# Manual corrections of two training rows (presumably data-entry errors - TODO confirm).
train.loc[train['id'] == 7492, '面積'] = 51.83
train.loc[train['id'] == 5776, '賃料'] = train.loc[train['id'] == 5776, '賃料'] / 10
# Keep only rows with an age below 6000 months and an area below 400.
train = train[(train['築年数'] < 6000) & (train['面積'] < 400)]

In [55]:
# Write the preprocessed tables without the index column.
for frame, out_path in ((train, './input/prep_train1107.csv'),
                        (test, './input/prep_test1107.csv')):
    frame.to_csv(out_path, index=None)