In [97]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import sys
import mojimoji
import re
from pyproj import Geod
from datetime import datetime
from sklearn.preprocessing import StandardScaler

#表示カラム数
pd.set_option("display.max_columns", 100)
sc = StandardScaler()

saitama_path = "../input/tabelog_store_data_saitama.csv"
shitamachi_path = "../input/tabelog_store_data_shitamachi.csv"
roppongi_path = "../input/tabelog_store_data_roppongi.csv"
train_path = "../input/tabelog_train_saitama.csv"
pred_path = "../input/tabelog_pred_roppongi.csv"
origin_saitama_path = "../input/tabelog_origin_saitama.csv"
origin_roppongi_path = "../input/tabelog_origin_roppongi.csv"
# eda_path = "../output/tabelog_origin_rate.csv"
# eda_path = "../output/tabelog_origin_rate_minato.csv"
eda_path = "../output/tabelog_eda.csv"
station_info = "../input/station_info.csv"

genre_list= [
    'izakaya',
    'ita_fre',
    'bal',
    'dinning',
    'meat',
    'sushi',
    'japan',
    'nabe',
    'china',
    'countries',
    'ramen',
    'light',
    'cook'
]

genre_dict = {
    'izakaya':[
    '居酒屋',
    '焼鳥',
    '串揚げ',
    '串かつ',
    '串焼き',
    '沖縄',
    '魚',
    '海鮮'
    ],
    'ita_fre':[
        'イタリアン',
        'パスタ',
        'ピザ',
        '西洋',
        'フレンチ'
    ],
    'bal':[
        'バル',
        'バール',
        'ビストロ',
        'スペイン',
        '地中海'
    ],
    'dinning':[
        'ダイニングバー',
        'ダイニング',
        'バー',
        'ビア',
        'アメリカ',
        'ドイツ',
        'ラウンジ'
    ],
    'meat':[
        '焼肉',
        'ホルモン',
        'ジンギスカン',
        '肉'
    ],
    'sushi':['寿司'],
    'japan':[
        '和食',
        '割烹',
        '懐石',
        '会席',
        '郷土',
        'うなぎ',
        'ふぐ',
        'かに',
        'ろばた',
        '天ぷら'
    ],
    'nabe':[
        '鍋',
        'もつ鍋',
        'しゃぶしゃぶ',
        'すきやき',
        'すき焼き',
        '水炊き'
    ],
    'china':[
        '中華',
        '餃子',
        '韓国',
        '四川',
        '広東',
        '台湾',
        '中国',
        '上海'
    ],
    'countries':[
        'カレー',
        'インド',
        '各国',
        'アジア',
        'エスニック',
        'メキシコ',
        'ベトナム',
        '南米',
        'シンガポール',
        'ネパール',
        'トルコ',
        'ブラジル',
        'パキスタン'
    ],
    'ramen':[
        'ラーメン',
        'つけ麺',
        '麺'
    ],
    'light':[
        'そば',
        'うどん',
        '洋食',
        '丼',
        'ステーキ',
        'お好み焼き',
        'ハンバーグ'
    ],
    'cook':[
        '創作料理',
        '鳥料理',
        '鉄板焼き',
        '牛料理'
    ]
}

float_list = [
    'couple_flg',
    'coupon_flg',
    'dinner_budget',
    'dinner_flg',
    'hideout_flg',
    'kodawari_flg',
    'latitude',
    'longitude',
    'lunch_budget',
    'lunch_flg',
    'net_reserve_flg',
    'night_view_flg',
    'nomiho_flg',
    'osya_flg',
    'private_flg',
    'rate',
    'relax_flg',
    'review',
    'sake_flg',
    'sommelier_flg',
    'tabeho_flg',
    'toll_flg',
    'vegetable_flg',
    'wine_flg'
#     'distance_station',
#     'longitude_x',
#     'latitude_y'
]

# 標準化するカラム（ここはNULLのないカラムのみ）
stan_columns = [
    'rate',
    'review',
    'genre_cnt',
#     'distance_origin_x',
#     'distance_origin_y'
    'distance_station'
]

stan_null_columns = [
    'dinner_budget',
    'lunch_budget',
    'seat',
    'open_date'
]

station_dict = {
    '新橋駅':'shinbashi',
    '六本木駅':'roppongi',
    '六本木一丁目駅':'roppongi-1',
    '麻布十番駅':'azabuzyuban',
    '三田駅':'mita',
    '池袋駅':'ikebukuro',
    '森下駅':'morishita',
    '錦糸町駅':'kinshityo',
    '浅草橋駅':'asakusabashi',
    '上野駅':'ueno',
    '志木駅':'shiki',
    '川越駅':'kawagoe',
    '朝霞台駅':'asakadai',
    'ふじみ野駅':'huzimino',
    '大宮駅':'oomiya',
    '浦和駅':'urawa',
    '所沢駅':'tokorozawa',
    '上福岡駅':'kamihukuoka',
    '川口駅':'kawaguchi',
    '熊谷駅':'kumagaya',
    '草加駅':'soka',
    '春日部駅':'kasukabe',
    'さいたま新都心駅':'shintoshin'
}

def spt(x): return x.split("、")


# 複数ジャンル表記を分割して一カラム一ジャンルに変換
def genre_trans(df): 
    df.genre = df.genre.astype('str')
    return df['genre'].apply(lambda x:pd.Series(spt(x) if len(spt(x))==3 
                                                else spt(x)+['0']   if len(spt(x))==2 
                                                else spt(x)+['0']*2 if len(spt(x))==1
                                                else spt(x)+['0']*3 if len(spt(x))==0 
                                                else ['0','0','0']))


# ジャンル作成時のみ使用
def extract_genre_list():

    data = pd.read_csv(input_path)
    print(data.head())

    genre_data = genre_trans(data)

    a2 = np.append(np.append(genre_data[0].values, genre_data[1].values), genre_data[2].values)
    u, c = np.unique(a2, return_counts=True)
    genre_dict = dict(zip(u, c))
    df = pd.Series(genre_dict).sort_values(ascending=False)
    print(df.head(10))

    df.to_csv(output_path, index=True, encoding="utf-8")
    
    
# 日本語表記のジャンルをラベルに変換    
def genre_label(x):
    for genre in genre_list:
        for check in genre_dict[genre]:
            if x.count(check):
                return genre
    return 'other'

# VIS用データ作成時のみ
def eda_genre(data):

    genre_data = genre_trans(data)
    df = pd.concat([data.name, genre_data], axis=1)
    
    df.drop([1,2], axis=1, inplace=True)
    label = df.apply(lambda x:pd.Series([x['name'], genre_label(x[1])]), axis=1)
    label.rename(columns={0:'name', 1:'genre'}, inplace=True)
    return label
    
    
def genre_dummies(data):
    
    genre_data = genre_trans(data)
    df = pd.concat([data.name, genre_data], axis=1)
    
    for i in range(3):
        if i == 0:tmp = df[['name', i]].rename(columns={i:'genre'})
        else :
            tmp2 = df[['name', i]].rename(columns={i:'genre'})    
            tmp  = pd.concat([tmp, tmp2[['name', 'genre']]], axis=0)
    
    tmp.fillna('0', inplace=True)
    
    label = tmp.apply(lambda x: genre_label(x[1]), axis=1)
    
    tmp_result = pd.concat([tmp.name, label], axis=1).drop_duplicates().reset_index(drop=True)
    
    # 各ジャンルの出現回数をカウント
    u, c = np.unique(np.array(label), return_counts=True)
    tmp_cnt   = dict(zip(u, c))
    tmp_cnt['other'] = 30 # otherは中の1ジャンルあたりの平均出現回数にする
    genre_cnt = pd.Series(tmp_cnt)
    genre_cnt = genre_cnt.reset_index().rename(columns={'index':'genre'})
    
    tmp_result.rename(columns={0:'genre'}, inplace=True)
    tmp_result = tmp_result.merge(genre_cnt, on='genre', how='inner')
    
    # ダミー変数化
    dummies = pd.get_dummies(tmp_result.genre)
    
    # 最終的なジャンルの説明変数を作成
    result= pd.concat([tmp_result, dummies], axis=1)
    columns = list(result.columns) # genre カラムを削除
    columns.pop(1)
    df_dummies = result[columns].groupby('name', as_index=False).sum()
    df_dummies.rename(columns={0:'genre_cnt'}, inplace=True)
    
    return df_dummies


def cleansing(x):
    x = x.replace(' ', '')
    x = x.replace(r'''["''', '')
    x = x.replace(r'''"]''', '')
    x = x.replace(r'''-2''', '0')
    return x


# 駅名をローマ字に変換
def station_romaji(x): return station_dict[x]

# 住所をダミー変数化。多いのでとりあえず座標verと比較
def street_dummies(df):
    
    # なぜか番地記入欄に区まで書いてる阿呆がいるので前処理
    regex_street = re.compile(r'''(.*?)区(.*?)''')
    df = df.apply(lambda x: pd.Series([x[0]] + x[1].split('区') if x[1].count('区') else [x[0], '港', x[1]]), axis=1)
    
    df = df[df[1]=='港'][[0,2]].rename(columns={0:'name', 2:'street'}).reset_index(drop=True)
    
    df.street = df.street.map(lambda x:cleansing(x))
    
    for i in  df.street.drop_duplicates():
        print(i)
    
    dummies = pd.get_dummies(df['street'])
    
    return pd.concat([df, dummies], axis=1).drop('street', axis=1)


# 緯度経度から簡単に距離と方角を求める（球面三角法）
def distance(longitude_point, latitude_point, longitude_target, latitude_target):
    
    r = 6378137.0 # 赤道半径
    
    x_point = np.radians(longitude_point)
    y_point = np.radians(latitude_point)
    x_target = np.radians(longitude_target)
    y_target = np.radians(latitude_target)

    x_avg = (x_target - x_point)/2
    y_avg = (y_target - y_point)/2
    
    val = np.sqrt(np.power(np.sin(x_avg), 2) + np.cos(x_point) * np.cos(x_target) * np.power(np.sin(y_avg), 2))
    
    return r * 2 * np.arcsin(val)
    

# オープン日と現在の日数差を求める    
def ym_trans(d):
    
    if str(type(d)).count('float') or str(type(d)).count('int'):return None
    elif d is None:return None
    
    d = mojimoji.zen_to_han(d)
    
    if d.count('日'):
        if len(d)==9: ym = d[:4] + '/' + d[5]
        elif len(d)==10:
            ym = d[:7].rstrip('/')[:4] + '/' + d[:7].rstrip('/')[5:].replace('月', '')
        elif len(d)==11:
            ym = d[:4] + '/' + d[5:7]
        else:
            print('open date error')
            print(d)
            sys.exit()
            
    elif d.count('月'):
        if len(d)==7:ym = d[:4] + '/' + d[5]
        elif len(d)==8:
            ym = d[:4] + '/' + d[5:7]
        else:
            print('open date error')
            print(d)
            sys.exit()
   
    elif d.count('年'):
        ym = d[:4] + '/' + '1'
        
    else:
        print('open date error')
        print(d)
        sys.exit()
        
    # オープンから何日経ったか計算
    dt = datetime.strptime(ym, '%Y/%m')
    now = datetime.now()
        
    return (now-dt).days


# 標準化 NULLを含むカラムはsclearn使えん
def scikit_stan(x):
    sc.fit(x)
    return sc.transform(x)


def standardization(x):
    x = x.dropna()
    mu = x.mean()
    se = x.std()
    return (x-mu)/se
   
    
def ml_data(df):
    for col in stan_null_columns:
        df[col] = standardization(df[col])
    
    for col in stan_columns:
        df[col] = scikit_stan(df.loc[:, col].reshape(-1,1))
        
    return df
    
    
def preprocessing(data, eda_flg):
    
    data = data[data.seat!='seat'] # 誤ってカラム名を途中に入れてしまった場合の対処
    data[float_list] = data[float_list].astype('float')
    data = data[data.rate!=0]
    data = data[data.rate!=0.00]
    data = data.drop_duplicates()
    
    # オープン日を年月に変換
    data.open_date = data.open_date.map(lambda x:ym_trans(x))
    
    # 駅の緯度経度情報を付与
    station_geo = pd.read_csv(station_info)
    data = data.merge(station_geo, on='station', how='inner')
    data.station = data.station.map(lambda x:station_romaji(x))
    
    
    # 緯度経度から駅までの距離を計算
    data['distance_station'] = data[['station_longitude', 'station_latitude', 'longitude', 'latitude']].apply(lambda x:distance(x[0], x[1], x[2], x[3]), axis=1)
    
    # 重複店舗は最寄り駅のデータのみ残す
    select_station = data[['name', 'distance_station']].groupby(['name'], as_index=False).min()
    data = data.merge(select_station, on=['name', 'distance_station'], how='inner')
    
    # 席情報のクレンジング
    data.seat = data.seat.astype('str')
    data.seat = data.seat.map(lambda x :mojimoji.zen_to_han(x) if x!='nan' else None)
    data.seat = data.seat.astype('float')
    
#     EDA用データ
    if eda_flg==1:
        eda = data.copy()
        genre = eda_genre(data)
        eda.drop('genre', axis=1, inplace=True)
        eda = eda.merge(genre, on='name', how='inner').drop_duplicates()
        return eda

    # ジャンルをダミー変数化
    df_dummies = genre_dummies(data)
    
    data = data.merge(df_dummies, on='name', how='inner').drop_duplicates()
    
    print(data.name.drop_duplicates().count())
    
    data = data.drop(['genre', 'pr_comment', 'local', 'street', 'station', 'longitude', 'latitude', 'station_longitude', 'station_latitude'], axis=1)
#     data = data.drop(['name', 'genre', 'local', 'street', 'station', 'longitude', 'latitude', 'station_longitude', 'station_latitude'], axis=1)
     
    train = ml_data(data)
    
    train.to_csv(train_path, index=False, encoding='utf-8')
#     train.to_csv(pred_path, index=False, encoding='utf-8')
    
    print("店舗数")
    print(train.count())
    print("学習データ")
    print(train.head())
    
    return train
    

def main():
    
#     data = pd.read_csv(saitama_path)
    saitama = pd.read_csv(saitama_path)
#     shitamachi = pd.read_csv(shitamachi_path)
    roppongi = pd.read_csv(roppongi_path)
#     data = pd.read_csv(roppongi_path)
    
    data = pd.concat([saitama, roppongi], axis=0)
    
#     result = preprocessing(data, 0)
    result = preprocessing(data, 1)
    
    print(result.head())
    print(result.count())
    
#     result.to_csv(train_path, index=False, encoding='utf-8')
#     result.to_csv(pred_path, index=False, encoding='utf-8')
#     result.to_csv(origin_saitama_path, index=False, encoding='utf-8')
#     result.to_csv(origin_roppongi_path, index=False, encoding='utf-8')
    result.to_csv(eda_path, index=False, encoding='utf-8')
    sys.exit()
    
    
if __name__ =='__main__':

    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


   couple_flg  coupon_flg  dinner_budget  dinner_flg  hideout_flg  \
0         0.0         1.0         2000.0         1.0          0.0   
1         0.0         1.0         3000.0         1.0          0.0   
2         0.0         0.0         2000.0         1.0          0.0   
3         0.0         1.0         4000.0         1.0          0.0   
4         0.0         1.0         3000.0         1.0          0.0   

   kodawari_flg    latitude local  longitude  lunch_budget  lunch_flg  \
0           1.0  139.573677   新座市  35.822356           NaN        0.0   
1           1.0  139.575336   新座市  35.823145           NaN        0.0   
2           1.0  139.575765   志木市  35.823659        1000.0        1.0   
3           1.0  139.573310   新座市  35.820906           NaN        0.0   
4           1.0  139.574071   新座市  35.820543        1000.0        1.0   

                           name  net_reserve_flg  night_view_flg  nomiho_flg  \
0           水炊き・焼き鳥 とりいちず 志木南口店              1.0             0.0  

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [22]:
from math import sin, cos, tan, atan2, acos, pi

def azimuth(x1, y1, x2, y2):
    # Radian角に修正
    _x1, _y1, _x2, _y2 = x1*pi/180, y1*pi/180, x2*pi/180, y2*pi/180
    Δx = _x2 - _x1
    _y = sin(Δx)
    _x = cos(_y1) * tan(_y2) - sin(_y1) * cos(Δx)

    psi = atan2(_y, _x) * 180 / pi
    if psi < 0:
        return 360 + atan2(_y, _x) * 180 / pi
    else:
        return atan2(_y, _x) * 180 / pi

def distance(x1, y1, x2, y2, r):
    _x1, _y1, _x2, _y2 = x1*pi/180, y1*pi/180, x2*pi/180, y2*pi/180
    Δx = _x2 - _x1
    val = sin(_y1) * sin(_y2) + cos(_y1) * cos(_y2) * cos(Δx)
    return r * acos(val)

x1 = 139.988909
y1 = 35.685828
x2 = 139.990339
y2 = 35.685879
r  = 6378.137e3

angle = azimuth(x1, y1, x2, y2)
dis   = distance(x1, y1, x2, y2, r) / 1e3 # kmに変換
print("方位角 : {0:.3f} 度".format(angle))
print("距離 : {0:.3f} km".format(dis))

# 結果
# 方位角 : 87.485 度
# 距離 : 0.129 km

2016年12


'2'