In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression as LR #線形回帰モデル
import seaborn as sns

In [None]:
# Raise pandas' display limits so wide / long frames are shown without truncation.
for option_name, limit in {'display.max_columns': 150, 'display.max_rows': 200}.items():
    pd.set_option(option_name, limit)

In [None]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Draw seaborn's default pairplot of the training frame and keep the returned grid in `pg`.
pg = sns.pairplot(train)

In [None]:
# Save the pairplot grid as a PNG file (relative path, i.e. the working directory).
pg.savefig('seaborn_pairplot_default.png')

In [None]:
# Training rows whose pm25_mid value is 300 or more.
train_high = train.loc[train['pm25_mid'] >= 300]

In [None]:
# Inspect the rows selected by the pm25_mid >= 300 filter.
train_high

In [None]:
# Distinct countries among the high-pm25 rows.
train_high['Country'].unique()

In [None]:
# Distinct cities among the high-pm25 rows.
train_high['City'].unique()

In [None]:
# High-pm25 rows whose Country is 'Hungary'.
train_high[train_high['Country']=='Hungary']

In [None]:
# All training rows whose Country is 'China'.
train[train['Country']=='China']

In [None]:
# Number of training rows per country.
train['Country'].value_counts()

In [None]:
# Summary statistics of pm25_mid, computed separately for each country.
train.groupby('Country')['pm25_mid'].describe()

In [None]:
# Number of test rows per country.
test['Country'].value_counts()

In [None]:
# Number of training rows per city.
train['City'].value_counts()

In [None]:
# Independent copy of the training rows for the city 'Nanning'.
city_data = train.loc[train['City'] == 'Nanning'].copy()

In [None]:
# Renumber the rows from 0. `drop=True` is not passed, so the previous index is kept
# as a new column.
# NOTE(review): this cell is not idempotent — re-running it adds another index column.
city_data = city_data.reset_index()

In [None]:
# Plot the pm25_mid series of the selected city, in row order.
city_data["pm25_mid"].plot(figsize=(15,4))

In [None]:
# Plot the co_mid series of the selected city, in row order.
city_data["co_mid"].plot(figsize=(15,4))

In [None]:
# Combined gas feature: co_mid + so2_mid + no2_mid, row by row.
city_data['cosono'] = (
    city_data['co_mid']
    .add(city_data['so2_mid'])
    .add(city_data['no2_mid'])
)

In [None]:
# Plot the combined co + so2 + no2 series of the selected city.
city_data["cosono"].plot(figsize=(15,4))

In [None]:
# `df_all` was not defined anywhere before this cell, so a fresh-kernel run failed with
# NameError. It is built here from the two loaded tables.
# NOTE(review): assumes df_all is meant to be train and test stacked together — confirm.
df_all = pd.concat([train, test], ignore_index=True, sort=False)

# Spread (max - min) of each measured quantity, stored as `<name>_range`.
for feature in ['co', 'o3', 'so2', 'no2', 'temperature',
                'humidity', 'pressure', 'ws', 'dew']:
    df_all[feature + '_range'] = df_all[feature + '_max'] - df_all[feature + '_min']

## train と test で Country が一致するかどうかのチェック

In [None]:
# Distinct country names in the training and test tables.
country_list_tr, country_list_ts = (
    frame['Country'].unique() for frame in (train, test)
)

In [None]:
# Distinct countries found in the training table.
country_list_tr

In [None]:
# Distinct countries found in the test table.
country_list_ts

In [None]:
# Countries that occur in both tables (set intersection).
co_tr, co_ts = set(country_list_tr), set(country_list_ts)
co_matched_list = list(co_tr.intersection(co_ts))

In [None]:
# Sizes of the train set, the test set and their overlap, one per line.
print(len(co_tr), len(co_ts), len(co_matched_list), sep='\n')

#### 完全一致！

## train と test で City が一致するかどうかのチェック

In [None]:
# Distinct city names in the training and test tables.
city_list_tr, city_list_ts = (
    frame['City'].unique() for frame in (train, test)
)

In [None]:
# Cities that occur in both tables (set intersection).
src_set, tag_set = set(city_list_tr), set(city_list_ts)
matched_list = list(src_set.intersection(tag_set))

In [None]:
# Number of distinct train cities, distinct test cities and shared cities, one per line.
print(len(city_list_tr), len(city_list_ts), len(matched_list), sep='\n')

#### 完全不一致！対策必要

## ヒュベニの公式（Hubeny の公式）による 2 地点間距離の計算

In [None]:
import math

pole_radius = 6356752.314245                  # polar radius (presumably metres, WGS84 semi-minor axis — confirm)
equator_radius = 6378137.0                    # equatorial radius (same unit as pole_radius)
def cal_distance(lat_1,lon_1,lat_2,lon_2):
    """Return the distance between two latitude/longitude points (Hubeny's formula).

    The ellipsoid is described by the module-level ``equator_radius`` and
    ``pole_radius``. The computed length is divided by 1000 before it is
    returned, i.e. the result is in kilometres when the radii are in metres.

    Args:
        lat_1, lon_1: latitude and longitude of the first point, in degrees.
        lat_2, lon_2: latitude and longitude of the second point, in degrees.

    Returns:
        float: distance between the two points.

    Note:
        The longitude difference is wrapped into [-pi, pi], so two points on
        either side of the 180-degree meridian are measured the short way
        round instead of almost all the way around the globe. Pairs whose
        longitudes differ by at most 180 degrees give exactly the same result
        as before.
    """
    # Convert degrees to radians.
    lat_1_rad = math.radians(lat_1)
    lon_1_rad = math.radians(lon_1)
    lat_2_rad = math.radians(lat_2)
    lon_2_rad = math.radians(lon_2)

    lat_difference = lat_1_rad - lat_2_rad       # latitude difference
    lon_difference = lon_1_rad - lon_2_rad       # longitude difference
    # Wrap across the antimeridian: e.g. 179 deg vs -179 deg is 2 deg apart, not 358 deg.
    if abs(lon_difference) > math.pi:
        lon_difference -= math.copysign(2 * math.pi, lon_difference)
    lat_average = (lat_1_rad + lat_2_rad) / 2    # mean latitude

    # First eccentricity squared of the ellipsoid.
    e2 = (math.pow(equator_radius, 2) - math.pow(pole_radius, 2)) \
            / math.pow(equator_radius, 2)

    w = math.sqrt(1- e2 * math.pow(math.sin(lat_average), 2))

    m = equator_radius * (1 - e2) / math.pow(w, 3) # meridian radius of curvature

    n = equator_radius / w                         # prime-vertical radius of curvature

    # Combine the north-south and east-west components.
    distance = math.sqrt(math.pow(m * lat_difference, 2) \
                   + math.pow(n * lon_difference * math.cos(lat_average), 2))
    return (distance/1000)

In [None]:
# Sanity check of cal_distance: Kamata station -> Yokosuka-chuo station.
lat_kamata, lon_kamata = 35.562479, 139.716073              # Kamata station (lat, lon)
lat_yokosukachuo, lon_yokosukachuo = 35.278699, 139.670040  # Yokosuka-chuo station (lat, lon)
print(cal_distance(lat_kamata, lon_kamata, lat_yokosukachuo, lon_yokosukachuo))

In [None]:
# Columns that identify a city and its location.
city_loc_cols = ['City', 'lat', 'lon']
# City / location columns of every training row.
city_tr = train.loc[:, city_loc_cols]

In [None]:
# Number of training rows per city.
city_tr['City'].value_counts()

In [None]:
# One row per distinct (City, lat, lon) combination; .copy() yields an independent frame.
city_tr_ex = city_tr.drop_duplicates().copy()

In [None]:
# Number of distinct train city locations.
len(city_tr_ex)

In [None]:
# City / location columns of every test row.
city_ts = test.loc[:, city_loc_cols]

In [None]:
# Number of test rows per city.
city_ts['City'].value_counts()

In [None]:
# One row per distinct (City, lat, lon) combination; .copy() yields an independent frame.
city_ts_ex = city_ts.drop_duplicates().copy()

In [None]:
# Number of distinct test city locations.
len(city_ts_ex)

## test dataの近隣都市検索（一番近い都市を探す）

In [None]:
# Distinct city names of the de-duplicated train / test location tables.
tr_city_list, ts_city_list = (
    city_tr_ex['City'].unique(),
    city_ts_ex['City'].unique(),
)

In [None]:
# Print both name arrays, train first, each starting on its own line.
print(tr_city_list, ts_city_list, sep='\n')

In [None]:
# Add one distance column per test city to the train-city table.
# The placeholder is a float NaN instead of the integer 0: the distances written into
# these columns are floats (assigning a float into an int column is deprecated in recent
# pandas), and a NaN is ignored by min(), whereas a leftover 0 would look like the
# nearest city.
for ts_city in ts_city_list:
    city_tr_ex[ts_city] = float('nan')


In [None]:
# Inspect the train-city table after the per-test-city columns were added.
city_tr_ex

In [None]:
# Fill the distance table: one column per test city, one row per train-city location.
# Each column is computed for every row of city_tr_ex (not only the first row of each
# city name) and assigned in one go as floats, so no placeholder value survives and no
# float is written element-wise into an integer column.
for ts_city in ts_city_list:
    # Coordinates of the test city, taken from its first row in city_ts_ex.
    ts_idx = city_ts_ex.index[city_ts_ex['City']==ts_city][0]
    lat_2 = city_ts_ex.loc[ts_idx,'lat']
    lon_2 = city_ts_ex.loc[ts_idx,'lon']
    # Distance from every train-city location to this test city.
    city_tr_ex[ts_city] = [
        cal_distance(lat_1, lon_1, lat_2, lon_2)
        for lat_1, lon_1 in zip(city_tr_ex['lat'], city_tr_ex['lon'])
    ]


In [None]:
# Inspect the train-city table with the distance columns filled in.
city_tr_ex

In [None]:
# Example lookup: the train city with the smallest value in the 'Adelaide' column.
city_tr_ex.loc[city_tr_ex['Adelaide'].idxmin(), 'City']

In [None]:
# Add the 'n_city' column, pre-filled with a single-space placeholder string.
city_ts_ex['n_city']=' '

In [None]:
# For each test city, find the train city with the smallest value in that test city's
# distance column (first one wins on ties).
nearest_train_city = {
    ts_city: city_tr_ex.loc[city_tr_ex[ts_city].idxmin(), 'City']
    for ts_city in ts_city_list
}
# Assign by city name so that every row of a test city receives its neighbour; writing
# only to the first row of each city would leave the ' ' placeholder in any other rows.
city_ts_ex['n_city'] = city_ts_ex['City'].map(nearest_train_city)

In [None]:
# Inspect the test-city table including the 'n_city' column.
city_ts_ex

In [None]:
# Keep only the test city name and its 'n_city' value.
city_ts_sel = city_ts_ex[['City','n_city']]

In [None]:
# Write the City -> n_city table to CSV without the index column.
city_ts_sel.to_csv('change_city.csv', index=False)

In [None]:
# Name and coordinates of each train-city row, without the distance columns.
city_tr_geo = city_tr_ex[['City','lat','lon']]

In [None]:
# Inspect the train-city coordinate table.
city_tr_geo

In [None]:
# Name and coordinates of each test-city row.
city_ts_geo = city_ts_ex[['City','lat','lon']]

In [None]:
# Write both coordinate tables to CSV without the index column.
city_tr_geo.to_csv('train_city.csv', index=False)
city_ts_geo.to_csv('test_city.csv', index=False)