In [1]:
import pandas as pd
import numpy as np
import datetime

In [15]:
def convert_price(str_price):
    """Convert a Japanese price label into an amount in yen.

    Args:
        str_price: Label such as '3,480万円', '1億2,800万円' or '12,000円'.
            Non-string values (NaN, already-converted numbers) are returned
            unchanged.

    Returns:
        float: 0.0 when the label contains '－' or 'なし'; otherwise the
        amount in yen, where '億' multiplies by 100,000,000 and '万' by 10,000.

    Raises:
        ValueError: if the text left after removing separators and units is
            not numeric.
    """
    if not isinstance(str_price, str):
        return str_price
    if '－' in str_price or 'なし' in str_price:
        return 0.0

    # Drop thousands separators and the trailing yen sign; only the
    # magnitude units ('億', '万') remain to be interpreted.
    rest = str_price.replace(',', '').replace('円', '').strip()
    total = 0.0
    found_unit = False
    for unit, factor in (('億', 100000000), ('万', 10000)):
        if unit in rest:
            head, rest = rest.split(unit, 1)
            total += float(head) * factor
            found_unit = True

    rest = rest.strip()
    # With no unit at all the whole text must be a number (and an empty
    # string still raises, as before); after a unit, a remainder is optional.
    if rest or not found_unit:
        total += float(rest)
    return total
    
    
def convert_area(str_area):
    """Convert an area label into a float.

    Non-string values are passed through untouched. Labels containing '－'
    or 'なし' become 0.0, labels containing a comma are treated as abnormal
    and become NaN, and everything else is parsed after removing the 'm²'
    unit and the '（壁芯）' / '（内法）' annotations.
    """
    if type(str_area) is not str:
        return str_area

    if any(marker in str_area for marker in ('－', 'なし')):
        return 0.0

    if ',' in str_area:
        # A comma marks an abnormal value, so report it as NaN.
        return np.nan

    cleaned = str_area
    for token in ('m²', '（壁芯）', '（内法）'):
        cleaned = cleaned.replace(token, '')
    return float(cleaned)
    
def convert_age(str_age, today=None):
    """Convert a construction date label into an age in months.

    Args:
        str_age: Text of the form '<year>年<month>月', e.g. '1998年3月'.
            Non-string values are returned unchanged.
        today: Optional ``datetime.date`` used as the reference date.
            Defaults to ``datetime.date.today()``; pass a fixed date to get
            reproducible results.

    Returns:
        int: months elapsed between the given year/month and ``today``, or
        NaN when the label is the missing-value marker '－'.

    Raises:
        ValueError: if '年' or '月' is missing or the year/month text is not
            an integer.
    """
    if not isinstance(str_age, str):
        return str_age
    if str_age.strip() == '－':
        # Missing value: report NaN like the other converters do.
        return np.nan

    pos_y = str_age.find('年')
    pos_m = str_age.find('月', pos_y + 1)
    if pos_y < 0 or pos_m < 0:
        # find() returns -1 on a miss, which would otherwise slice the
        # wrong part of the string instead of failing clearly.
        raise ValueError('unrecognized year/month text: %r' % (str_age,))

    year = int(str_age[:pos_y])
    month = int(str_age[pos_y + 1:pos_m])

    # Read the clock once so year and month come from the same instant.
    if today is None:
        today = datetime.date.today()
    return (today.year - year) * 12 + today.month - month

def convert_tosu(str_tosu):
    """Convert a unit-count label such as '1,234戸' into an int.

    Non-string values are passed through untouched; the label '－' is a
    missing value and is reported as NaN for the time being.
    """
    if type(str_tosu) is not str:
        return str_tosu

    if str_tosu == '－':
        # Missing value, so return NaN for now.
        return np.nan

    digits = str_tosu
    for token in ('戸', ','):
        digits = digits.replace(token, '')
    return int(digits)

def convert_floor(str_floor):
    """Convert a floor label such as '5階' or '地下1階' into an int.

    Args:
        str_floor: Floor text. Surrounding whitespace is ignored. Non-string
            values are returned unchanged.

    Returns:
        int: the floor number, negative for basement floors ('地下'), or NaN
        when the label is the missing-value marker '－'.

    Raises:
        ValueError: if the text before '階' is not an integer once
            'メゾネット' is removed and '地下' is replaced by a minus sign.
    """
    if not isinstance(str_floor, str):
        return str_floor

    label = str_floor.strip()
    if label == '－':
        # Missing value.
        return np.nan

    # Keep the text before the first '階'. partition() leaves the label whole
    # when '階' is absent, whereas slicing with find() == -1 would silently
    # drop the last character.
    number = label.partition('階')[0]
    # Basement floors become negative values.
    return int(number.replace('メゾネット', '').replace('地下', '-'))
    

def convert_total_floor(str_floor):
    """Convert a building-height label such as '地上10階建' into an int.

    Args:
        str_floor: Total-floors text. Surrounding whitespace is ignored.
            Non-string values are returned unchanged.

    Returns:
        int: the number before the first '階' with any '地上' prefix removed,
        or NaN when the label is the missing-value marker '－'.

    Raises:
        ValueError: if the text before '階' is not an integer.
    """
    if not isinstance(str_floor, str):
        return str_floor

    label = str_floor.strip()
    if label == '－':
        # Missing value.
        return np.nan

    # partition() leaves the label whole when '階' is absent; slicing with
    # find() == -1 would silently drop the last character instead.
    number = label.partition('階')[0]
    return int(number.replace('地上', ''))

def convert_madori(str_madori):
    """Return the floor-plan code that precedes the first '('.

    Args:
        str_madori: Floor-plan text, e.g. '3LDK(洋6・洋5)'. Non-string values
            (such as NaN) are returned unchanged, matching the other
            converters.

    Returns:
        str: the text before the first '(' (the whole text when there is no
        '('), or NaN when the label is the missing-value marker '－'.
    """
    if not isinstance(str_madori, str):
        return str_madori
    if str_madori == '－':
        return np.nan
    # partition() keeps the full label when '(' is absent; slicing with
    # find() == -1 would cut off the last character ('3LDK' -> '3LD').
    return str_madori.partition('(')[0]

def convert_eki(str_koutu):
    """Return the text after the first '/' with all ASCII spaces removed.

    Non-string values are passed through untouched. When the text has no
    '/', the whole text (minus spaces) is returned.
    """
    if type(str_koutu) is not str:
        return str_koutu

    # split(..., 1)[-1] is the part after the first '/', or the full text
    # when no '/' is present.
    after_slash = str_koutu.split('/', 1)[-1]
    return after_slash.replace(' ', '')
    
def convert_toho(str_koutu):
    """Extract the walking time in minutes from text such as ' 徒歩5分'.

    Args:
        str_koutu: Transit text. Non-string values are returned unchanged.

    Returns:
        NaN when the text has no '徒歩'; otherwise the integer between '徒歩'
        and the following '分' (or the end of the text if no '分' follows).
        If that text is not an integer, the input is printed for inspection
        and 0 is returned as a best-effort fallback.
    """
    if not isinstance(str_koutu, str):
        return str_koutu

    pos_toho = str_koutu.find('徒歩')
    if pos_toho < 0:
        return np.nan

    start = pos_toho + len('徒歩')
    # Search for '分' only after '徒歩': an earlier '分' (for example bus
    # minutes) would otherwise produce an empty slice and a bogus 0.
    pos_hun = str_koutu.find('分', start)
    if pos_hun < 0:
        # find() returns -1 on a miss; read to the end of the text rather
        # than letting the slice drop the last character.
        pos_hun = len(str_koutu)

    str_hun = str_koutu[start:pos_hun].replace(' ', '')
    try:
        return int(str_hun)
    except ValueError:
        # Only a failed integer parse is expected here; show the offending
        # text and keep the existing fallback value.
        print(str_koutu)
        return 0

In [20]:
# Read the raw listing CSV into a DataFrame; later cells derive df_kari from it.
df_data_ori = pd.read_csv(filepath_or_buffer='./data/bunjou_2018-5-29.csv')

In [21]:
# Labels of the columns that are removed from the raw frame before conversion.
delete_column = [
    'タイトル',
    'その他交通',
    '所在地',
    '借地期間・地代（月額）',
    '権利金',
    '敷金 / 保証金',
    '維持費等',
    'その他一時金',
    '建物名・部屋番号',
    '建物状況調査　/　瑕疵保険・保証',
    '備考',
    'リフォーム履歴',
    'リノベーション履歴',
    'バイク置き場',
    '駐輪場',
    'ペット',
    '敷地面積',
    '管理形態・方式',
    '国土法届出',
    '条件等',
    '現況',
    '引渡し',
    '物件番号',
    '情報公開日',
    '次回更新予定日',
]

In [22]:
# Show the raw frame's column labels (rendered as the cell output).
df_data_ori.columns

Index(['タイトル', '交通', 'その他交通', '所在地', '物件種目', '価格', '平米単価', '管理費等', '修繕積立金',
       '借地期間・地代（月額）', '権利金', '敷金 / 保証金', '維持費等', 'その他一時金', ' ', '建物名・部屋番号',
       '設備', '建物状況調査　/　瑕疵保険・保証', '備考', '間取り', '専有面積', 'バルコニー', '階建 / 階',
       '建物構造', '築年月', '総戸数', 'リフォーム履歴', 'リノベーション履歴', '駐車場', 'バイク置き場', '駐輪場',
       'ペット', '土地権利', '敷地面積', '管理形態・方式', '国土法届出', '条件等', '現況', '引渡し', '物件番号',
       '情報公開日', '次回更新予定日'],
      dtype='object')

In [23]:
# Working frame without the columns listed in delete_column. drop() returns a
# new frame, so df_data_ori itself keeps every column.
df_kari = df_data_ori.drop(columns=delete_column)

In [24]:
# Columns holding price-like and area-like labels.
price_column = ['価格', '平米単価', '管理費等', '修繕積立金']
area_column = ['専有面積', 'バルコニー']

# Apply the element-wise converter to each group of columns.
for converter, keys in ((convert_price, price_column), (convert_area, area_column)):
    for key in keys:
        df_kari[key] = df_kari[key].map(converter)

# Single-column conversions: construction date, unit count, floor plan.
for key, converter in (
    ('築年月', convert_age),
    ('総戸数', convert_tosu),
    ('間取り', convert_madori),
):
    df_kari[key] = df_kari[key].map(converter)

# Split '階建 / 階' on ' /' into total floors ('総階数') and floor ('階'),
# convert both, and append them in place of the combined column.
df_floor = df_kari['階建 / 階'].str.split(' /', expand=True)
df_floor.columns = ['総階数', '階']
for key, converter in (('総階数', convert_total_floor), ('階', convert_floor)):
    df_floor[key] = df_floor[key].map(converter)
df_kari = pd.concat([df_kari.drop(columns='階建 / 階'), df_floor], axis=1)

# Split '交通' on '駅' into nearest station ('最寄り駅') and the text that
# follows it ('駅から'), convert both, and append them in place of '交通'.
df_transit = df_kari['交通'].str.split('駅', expand=True)
df_transit.columns = ['最寄り駅', '駅から']
for key, converter in (('最寄り駅', convert_eki), ('駅から', convert_toho)):
    df_transit[key] = df_transit[key].map(converter)
df_kari = pd.concat([df_kari.drop(columns='交通'), df_transit], axis=1)

In [25]:
# Labels of the rows that contain at least one missing value, each listed
# once, printed for inspection before those rows are removed.
rows_with_null = df_kari.isnull().any(axis=1)
abnormal_data = df_kari.index[rows_with_null].tolist()
print(abnormal_data)

# Drop every row with a missing value and renumber the remaining rows from 0.
# drop must be the boolean True: a string such as 'true' (or 'false') only
# works by being truthy.
df_kari = df_kari.dropna(how='any').reset_index(drop=True)

[15, 19, 27, 29, 34, 42, 53, 54, 55, 58, 60, 64, 67, 73, 86, 90, 103, 112, 123, 128, 135, 140, 168, 177, 178, 179, 180, 183, 185, 193, 195, 199, 203, 205, 212, 222, 228, 238, 239, 240, 245, 246, 258, 263, 265, 271, 276, 281, 288, 297, 298, 309, 329, 340, 342, 347, 350, 354, 355, 365, 368, 375, 385, 405, 408, 415, 421, 430, 445, 458, 470, 481, 488, 492, 493, 507, 508, 520, 521, 524, 546, 552, 560, 563, 570, 579, 580, 582, 585, 598, 602, 605, 624, 647, 649, 653, 657, 660, 673, 678, 694, 695, 698, 699, 706, 712, 717, 720, 722, 736, 754, 781, 800, 810, 814, 824, 837, 838, 840, 844, 845, 860, 872, 876, 879, 899, 901, 903, 905, 929, 931, 967, 973, 980, 988, 998, 1010, 1025, 1053, 1058, 1063, 1069, 1088, 1107, 1122, 1130, 1139, 1165, 1166, 1172, 1179, 1180, 1196, 1203, 1206, 1232, 1235, 1245, 1270, 1290, 1291, 1299, 1306, 1333, 1343, 1365, 1366, 1392, 1396, 1397, 1413, 1419, 1422, 1427, 1428, 1444, 1450, 1453, 1458, 1462, 1496, 1512, 1513, 1519, 1521, 1525, 1542, 1544, 1574, 1581, 1607, 1610,

In [26]:
# Count how many times each equipment item appears across the '設備' column,
# whose values are '、'-separated strings.
opts = {}
for setubi_text in df_kari['設備']:
    setubi = setubi_text.split("、")
    for item in setubi:
        opts[item] = opts.get(item, 0) + 1

# One row per item, most frequent first (rendered as the cell output).
pd.DataFrame.from_dict(opts, orient='index').sort_values(by=0, ascending=False)

Unnamed: 0,0
エレベーター,2660
システムキッチン,2426
浴室乾燥機,2118
都市ガス,2045
温水洗浄便座,2020
追焚機能,2014
室内洗濯機置場,1889
オートロック,1796
給湯,1781
モニター付インターホン,1739


In [41]:
# Write only the '最寄り駅' column to list_eki.csv, without index or header row.
df_kari.to_csv(
    path_or_buf='list_eki.csv',
    columns=['最寄り駅'],
    header=False,
    index=False,
)