In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.font_manager as fm
import seaborn as sns
import warnings
from scipy.special import boxcox1p, inv_boxcox1p

# Show floats with five decimal places in DataFrame/Series output.
pd.options.display.float_format = '{:.5f}'.format
# Absolute path to a NanumGothic font file on the author's machine; no cell visible here reads it.
fontpath = 'C:/Users/TaeSoo/AppData/Local/Microsoft/Windows/Fonts/NanumGothic.ttf'
%matplotlib inline
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Figure defaults: resolution and the font family used for text.
plt.rcParams.update({'figure.dpi': 140, 'font.family': 'NanumGothic'})

In [3]:
# Load the raw train/test tables; the paths are relative to the notebook's working directory.
train = pd.read_csv('../원본데이터/train.csv')
test = pd.read_csv('../원본데이터/test.csv')

In [4]:
# Work on copies so the frames as loaded stay untouched.
train_df = train.copy()
test_df = test.copy()

In [7]:
# Stack train and test row-wise. ignore_index is not set, so each frame keeps its own row labels.
dataset = pd.concat([train_df, test_df], axis = 0)

In [9]:
# Rows whose road_name is the literal '-' (presumably the placeholder for a missing road name).
na_df = dataset[dataset['road_name'] == '-']

In [11]:
# Count how many '-' rows share each (start, end) coordinate pair:
# one output row per distinct pair, with the number of occurrences in 'count'.
segment_keys = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']
na_road_name_df = na_df.groupby(segment_keys).size().reset_index(name='count')
na_road_name_df

Unnamed: 0,start_latitude,start_longitude,end_latitude,end_longitude,count
0,33.24343,126.42203,33.24488,126.41909,6221
1,33.24488,126.41909,33.24343,126.42203,6325
2,33.24488,126.41909,33.24586,126.41764,5963
3,33.24586,126.41764,33.24488,126.41909,6086
4,33.24586,126.41764,33.24615,126.41721,5963
...,...,...,...,...,...
119,33.47339,126.40913,33.47228,126.41844,5344
120,33.47545,126.38819,33.47116,126.39515,5330
121,33.48526,126.41090,33.47228,126.41844,5190
122,33.55159,126.76939,33.55608,126.75973,3107


In [12]:
from geopy.geocoders import Nominatim

In [13]:
def geocoding_reverse(lat_lng_str):
    """Reverse-geocode one coordinate string with Nominatim.

    Parameters
    ----------
    lat_lng_str : str
        Query in the form ``"<latitude>, <longitude>"``.

    Returns
    -------
    The value returned by geopy's ``Nominatim.reverse`` for the query
    (geopy returns ``None`` when the service finds nothing).

    Raises
    ------
    geopy.exc.GeopyError
        If the request still fails after the rate limiter's retries.
    """
    # Build the client once and keep it on the function object: the original
    # created a brand-new Nominatim instance for every single lookup.
    reverse = getattr(geocoding_reverse, '_rate_limited_reverse', None)
    if reverse is None:
        from geopy.extra.rate_limiter import RateLimiter

        # A finite timeout replaces timeout=None, which disabled the timeout
        # and let a stalled request hang forever.
        # NOTE(review): user_agent is kept as-is, but it should identify this
        # application rather than a country name.
        geolocator = Nominatim(user_agent = 'South Korea', timeout=10)
        # Throttle to at most one request per second and retry transient
        # service errors; swallow_exceptions=False keeps failures visible
        # instead of silently turning them into None.
        reverse = RateLimiter(
            geolocator.reverse,
            min_delay_seconds=1,
            swallow_exceptions=False,
        )
        geocoding_reverse._rate_limited_reverse = reverse

    return reverse(lat_lng_str)

In [14]:
def searching_suburb(df, latitude, longitude, geocode=None):
    """Reverse-geocode every row of ``df`` and return the results in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one coordinate pair per row. Any index is accepted;
        rows are visited by position.
    latitude, longitude : str
        Names of the columns that hold the latitude and the longitude.
    geocode : callable, optional
        Called once per row with the string ``"<lat>, <lng>"``. Defaults to
        ``geocoding_reverse``.

    Returns
    -------
    list
        One geocoder result per row of ``df``.
    """
    if geocode is None:
        geocode = geocoding_reverse

    # Iterate over the column values themselves instead of df.loc[0..n-1]:
    # label lookups with a positional counter fail (or pick the wrong rows)
    # as soon as the frame does not carry a default 0..n-1 index.
    latitudes = df[latitude].to_numpy()
    longitudes = df[longitude].to_numpy()
    return [
        geocode(str(lat) + ', ' + str(lng))
        for lat, lng in zip(latitudes, longitudes)
    ]

In [15]:
def _split_address_fields(address_text):
    """Split one ', '-separated address into (country, post_code, city, suburb, street_address)."""
    # Read the components from the most general one (country, written last)
    # towards the most specific one.
    parts = address_text.split(', ')[::-1]
    country, rest = parts[0], parts[1:]

    # An all-digit component directly before the country is the postal code.
    if rest and rest[0].isdigit():
        post_code, rest = rest[0], rest[1:]
    else:
        post_code = np.nan

    # The next component is the state/province; it is not part of the output.
    rest = rest[1:]

    # Whatever is left is city, suburb, street (in that order); pad with NaN
    # when the address is too short to contain all three.
    city, suburb, street_address = (rest + [np.nan] * 3)[:3]
    return country, post_code, city, suburb, street_address


def making_address_col(address):
    """Turn reverse-geocoding results into a table of address components.

    Parameters
    ----------
    address : sequence
        Geocoder results. Each entry is either ``None`` (no result) or an
        object whose element ``[0]`` is the full address as a single
        ``', '``-separated string ending with the country.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns ``country``, ``post_code``,
        ``city``, ``suburb`` and ``street_address``. Components that are not
        present in an address are NaN; a ``None`` entry yields an all-NaN row.
    """
    columns = ['country', 'post_code', 'city', 'suburb', 'street_address']

    rows = []
    for entry in address:
        if entry is None:
            # The geocoder found nothing for this point: keep the row so the
            # output stays aligned with the input, but leave it empty.
            rows.append((np.nan,) * len(columns))
        else:
            rows.append(_split_address_fields(entry[0]))

    return pd.DataFrame(rows, columns = columns)

In [16]:
# Reverse-geocode the start point and the end point of every row in na_road_name_df
# (one geocoder request per row and endpoint).
start_address = searching_suburb(na_road_name_df, 'start_latitude', 'start_longitude')
end_address = searching_suburb(na_road_name_df, 'end_latitude', 'end_longitude')

In [17]:
# Split the geocoder results into address-component columns, one frame per endpoint.
start_address_df = making_address_col(start_address)
end_address_df = making_address_col(end_address)

In [18]:
# Place the suburb/street components of both endpoints side by side, one row per segment.
component_cols = ['suburb', 'street_address']
endpoint_parts = [start_address_df[component_cols], end_address_df[component_cols]]
road_df = pd.concat(endpoint_parts, axis = 1)
road_df.columns = ['start_suburb', 'start_street', 'end_suburb', 'end_street']

In [19]:
# Segment coordinates next to the geocoded suburb/street names of both endpoints.
geo_road_df = pd.concat([na_road_name_df[['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']], road_df[['start_suburb', 'start_street', 'end_suburb', 'end_street']]], axis = 1)

# Empty placeholders for the road name derived from the suburb / street fields.
# They are created with object dtype because strings are written into them
# afterwards: a float64 NaN column would have to be silently upcast on
# assignment, which pandas deprecated in 2.1.
geo_road_df['road_by_suburb'] = pd.Series(np.nan, index = geo_road_df.index, dtype = object)
geo_road_df['road_by_street'] = pd.Series(np.nan, index = geo_road_df.index, dtype = object)

In [20]:
def _road_from_endpoints(frame, start_col, end_col, target_col):
    """Write into ``target_col`` the endpoint value that ends in '로' (the Korean road-name suffix).

    - both endpoints equal and ending in '로'  -> that value
    - only the start value ends in '로'        -> start value
    - only the end value ends in '로'          -> end value
    Rows matching none of these (including two different values that both end
    in '로') are left as they are. ``frame`` is modified in place.
    """
    start_last = frame[start_col].str[-1]
    end_last = frame[end_col].str[-1]

    same_road = (frame[start_col] == frame[end_col]) & (start_last == '로')
    only_start = (start_last == '로') & (end_last != '로')
    only_end = (start_last != '로') & (end_last == '로')

    frame.loc[same_road, target_col] = frame[start_col]
    frame.loc[only_start, target_col] = frame[start_col]
    frame.loc[only_end, target_col] = frame[end_col]


_road_from_endpoints(geo_road_df, 'start_suburb', 'end_suburb', 'road_by_suburb')
_road_from_endpoints(geo_road_df, 'start_street', 'end_street', 'road_by_street')

In [21]:
# Start with an empty road_name column. It is created with object dtype
# because the assignments below write strings into it; a float64 NaN column
# would rely on silent upcasting, which pandas deprecated in 2.1.
geo_road_df['road_name'] = pd.Series(np.nan, index = geo_road_df.index, dtype = object)

# Both candidates agree -> take it.
geo_road_df.loc[geo_road_df['road_by_suburb'] == geo_road_df['road_by_street'], 'road_name'] = geo_road_df['road_by_suburb']
# Only one candidate exists -> take that one.
geo_road_df.loc[(geo_road_df['road_by_suburb'].isnull()) & (geo_road_df['road_by_street'].notnull()), 'road_name'] = geo_road_df['road_by_street']
geo_road_df.loc[(geo_road_df['road_by_suburb'].notnull()) & (geo_road_df['road_by_street'].isnull()), 'road_name'] = geo_road_df['road_by_suburb']
# Rows with two different candidates, or with none, keep road_name = NaN.

# The intermediate candidate columns are no longer needed.
geo_road_df = geo_road_df.drop(['road_by_suburb', 'road_by_street'], axis = 1)

In [23]:
# Hand-assigned road names, keyed by road name -> row label(s) of geo_road_df.
# NOTE(review): the labels are hard-coded, so they presumably depend on the row
# order of na_road_name_df staying exactly as it was when they were picked — confirm
# before re-running on different data.
manual_road_names = {
    '천제연로': [10, 11],
    '지방도1132호선': list(range(14, 20)),
    '위미항구로': [23, 35],
    '지방도1131호선': [27, 28],
    '지방도1119호선': [36, 37],
    '지방도1136호선': [52, 53],
    '난산로': [54, 55, 56, 73],
    '한림상로': [57, 59, 62, 63, 72, 78, 79, 88],
    '한림해안로': [71, 74, 75, 76, 77, 80, 83, 84],
    '한수풀로': [94, 95],
    '고성오조로': 101,
    '하광로': [115, 121],
    '김녕로': [122, 123],
}

for manual_name, row_labels in manual_road_names.items():
    geo_road_df.loc[row_labels, 'road_name'] = manual_name

In [26]:
# Keep only the coordinates and the resolved name, exposed as 'pre_road_name'.
geo_road_df = (
    geo_road_df
    .drop(columns = ['start_suburb', 'start_street', 'end_suburb', 'end_street'])
    .rename(columns = {'road_name': 'pre_road_name'})
)

In [27]:
# Write the lookup table to the working directory. utf-8-sig prepends a BOM;
# the row index is written too (to_csv default), as an unnamed first column.
geo_road_df.to_csv('road_naming.csv', encoding = 'utf-8-sig')