In [147]:
from math import radians, cos, sin, asin

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import h3
from tqdm import tqdm

# Reading and preprocessing data

## Reading Resources

In [28]:
# Load the ATM target table: missing values become 0 and the 'lat_h3' / 'lon_h3'
# columns are renamed to plain 'lat' / 'lon'.
target = (
    pd.read_csv('data/target_hakaton_spb.csv')
    .fillna(0)
    .rename(columns={'lat_h3': 'lat', 'lon_h3': 'lon'})
)

# Total ATM count per hexagon, re-joined with that hexagon's location columns.
atm_cnt_per_hex = target.groupby('geo_h3_10').atm_cnt.sum().to_frame('atm_cnt')
hex_locations = target[['geo_h3_10', 'lat', 'lon', 'city']]
target_only_count = atm_cnt_per_hex.merge(hex_locations, on='geo_h3_10', how='left').drop_duplicates()

538


In [31]:
# Auxiliary datasets, de-duplicated on load.
# NOTE: drop_duplicates() must NOT be called with inplace=True here: the in-place
# variant returns None, which would leave `osm_amenity` bound to None instead of
# the de-duplicated frame.
osm_amenity = pd.read_csv('data/train/osm_amenity.csv').drop_duplicates(subset='geo_h3_10')
osm_stops = pd.read_csv('data/train/osm_stops.csv').drop_duplicates()
population = pd.read_csv('data/train/rosstat_population_all_cities.csv').drop_duplicates()
iso_walk = pd.read_csv('data/train/isochrones_walk_dataset.csv').drop_duplicates()
iso_drive = pd.read_csv('data/train/isochrones_drive_dataset.csv').drop_duplicates()


In [32]:
def _count_stops_per_hex(stop_type: str, column: str) -> pd.DataFrame:
    """Count the rows of ``osm_stops`` whose ``type`` equals ``stop_type``, per ``geo_h3_10``.

    Returns a one-column frame named ``column``, indexed by ``geo_h3_10``.
    """
    of_type = osm_stops[osm_stops['type'] == stop_type]
    return of_type.groupby(by='geo_h3_10').count()['type'].to_frame(column)


n_tram_stops = _count_stops_per_hex('tram_stop', 'n_tram_stops')
n_subway_entrances = _count_stops_per_hex('subway_entrance', 'n_subway_entrances')
n_bus_stops = _count_stops_per_hex('bus_stop', 'n_bus_stops')


## Merging everything into one table

In [53]:
# Columns that identify a hexagon in the location-aware tables.
location_keys = ['geo_h3_10', 'city', 'lat', 'lon']

# Outer-join everything so that a hexagon present in any source is kept.
df = pd.merge(target_only_count, population, how='outer', on=location_keys)
df = pd.merge(df, osm_amenity, how='outer', on=location_keys)
for stop_counts in (n_bus_stops, n_tram_stops, n_subway_entrances):
    df = pd.merge(df, stop_counts, how='outer', on='geo_h3_10')
hexes_with_stops = osm_stops[['geo_h3_10', 'city']].drop_duplicates()
df = pd.merge(df, hexes_with_stops, how='outer', on=['geo_h3_10', 'city'])

# One row per hexagon, gaps filled with 0, indexed by the H3 tag.
df = df.drop_duplicates(subset='geo_h3_10').fillna(0).set_index('geo_h3_10')

# H3 tags of the hexagons that contain at least one subway entrance.
subway_entrances = df.index[df['n_subway_entrances'] > 0]


## Computing the distance to the nearest subway entrance

In [58]:
def distance_haversine(point_1: tuple, point_2: tuple):
    """Great-circle distance between two points using the haversine formula.

    Both points are ``(latitude, longitude)`` pairs in degrees. The result is
    scaled by a sphere of radius 6372.8 (presumably the Earth's radius in
    kilometres, which would make the result kilometres).
    """
    phi_a, lam_a = map(radians, point_1)
    phi_b, lam_b = map(radians, point_2)

    half_dphi = (phi_b - phi_a) / 2.0
    half_dlam = (lam_b - lam_a) / 2.0
    haversine = sin(half_dphi) ** 2.0 + cos(phi_a) * cos(phi_b) * sin(half_dlam) ** 2.0

    sphere_diameter = 2.0 * 6372.8
    return sphere_diameter * asin(haversine ** 0.5)


def find_nearest(point_1, points):
    """Return the smallest haversine distance from ``point_1`` to any of ``points``.

    ``points`` is an iterable of ``(lat, lon)`` pairs; an empty iterable makes
    ``min`` raise ``ValueError``.
    """
    return min(distance_haversine(point_1, candidate) for candidate in points)


# (lat, lon) of every hexagon that contains a subway entrance, converted once up front.
subway_entrances_geo = list(map(h3.h3_to_geo, subway_entrances))


def find_dist_from_h3(h3tag):
    """Distance from the hexagon ``h3tag`` to the nearest entry of ``subway_entrances_geo``.

    The hexagon is represented by the coordinates returned by ``h3.h3_to_geo``.
    """
    centre_lat, centre_lon = h3.h3_to_geo(h3tag)
    centre = (centre_lat, centre_lon)
    return find_nearest(centre, subway_entrances_geo)



In [59]:
# Distance from every hexagon to its nearest subway-entrance hexagon.
df['dist_to_subway'] = list(map(find_dist_from_h3, tqdm(df.index)))

100%|██████████| 49875/49875 [00:14<00:00, 3388.41it/s]


## Working around missing latitude and longitude

In [60]:
# Rows that came only from tables without coordinates have lat/lon filled with 0
# after the merge, so both columns are recomputed from the H3 tag itself.
# Each tag is converted once and the (lat, lon) pair is then split, instead of
# calling h3_to_geo twice per hexagon.
hex_centres = [h3.h3_to_geo(x) for x in tqdm(df.index)]
df['lat'] = [centre[0] for centre in hex_centres]
df['lon'] = [centre[1] for centre in hex_centres]

100%|██████████| 49875/49875 [00:00<00:00, 819411.78it/s]
100%|██████████| 49875/49875 [00:00<00:00, 626697.24it/s]


In [None]:
# df.to_csv('prikol.csv')

# READ CSV

In [None]:
# df = pd.read_csv('prikol.csv')

### Combining all business parameters into one parameter

In [65]:
# Per-category business counts that are collapsed into a single 'businesses' feature.
business_columns = [
    'Автозапчасти для иномарок',
    'Авторемонт и техобслуживание (СТО)',
    'Алкогольные напитки',
    'Аптеки',
    'Банки',
    'Быстрое питание',
    'Доставка готовых блюд',
    'Женская одежда',
    'Кафе',
    'Косметика / Парфюмерия',
    'Ногтевые студии',
    'Овощи / Фрукты',
    'Парикмахерские',
    'Платёжные терминалы',
    'Постаматы',
    'Продуктовые магазины',
    'Пункты выдачи интернет-заказов',
    'Рестораны',
    'Страхование',
    'Супермаркеты',
    'Цветы',
    'Шиномонтаж',
]

# Element-wise sum of all category columns, added left to right starting from the first one.
df['businesses'] = sum((df[column] for column in business_columns[1:]), df[business_columns[0]])

# The individual category columns are no longer needed once they are aggregated.
df.drop(columns=business_columns, inplace=True)

# Getting data about nearby hexes for each hex

In [148]:
def get_nearby_information_for_hexes(sub_df: pd.DataFrame, df: pd.DataFrame, dist: float) -> pd.DataFrame:
    """Add ``nearby_*`` columns that aggregate features of neighbouring hexagons.

    For every hexagon in ``sub_df`` (indexed by H3 tag), the values of the
    ``population``, ``n_bus_stops``, ``n_tram_stops``, ``n_subway_entrances``,
    ``businesses`` and ``atm_cnt`` columns are summed over all *other* hexagons
    of ``df`` whose ``h3.h3_to_geo`` coordinates lie within ``dist`` (haversine
    distance, same unit as ``distance_haversine``) of it.

    Args:
        sub_df: hexagons to compute the neighbourhood features for.
        df: hexagons that may act as neighbours; must contain the six source columns.
        dist: maximum haversine distance for a hexagon to count as nearby.

    Returns:
        A deep copy of ``sub_df`` with six additional ``nearby_<column>`` columns.
        Neither input frame is modified.
    """
    feature_columns = ['population', 'n_bus_stops', 'n_tram_stops', 'n_subway_entrances', 'businesses', 'atm_cnt']

    df2 = sub_df.copy(deep=True)
    for column in feature_columns:
        df2['nearby_' + column] = 0

    # The coordinates of a candidate neighbour do not depend on the hexagon being
    # processed, so convert every candidate once instead of once per (x, y) pair.
    candidates = [(y, h3.h3_to_geo(y)) for y in df.index.unique()]

    for x in tqdm(sub_df.index.unique()):
        gx = h3.h3_to_geo(x)
        nearby = [y for y, gy in candidates if x != y and distance_haversine(gx, gy) <= dist]
        for y in nearby:
            for column in feature_columns:
                df2.loc[x, 'nearby_' + column] += df.loc[y, column]
    return df2


def get_nearby_information_for_radius(x: str, dist1: float, dist2: float, df: pd.DataFrame) -> pd.DataFrame:
    """Recompute neighbourhood features for the hexagons around ``x``.

    Selects ``x`` itself plus every hexagon of ``df`` whose coordinates are
    within haversine distance ``dist1`` of ``x``, then calls
    ``get_nearby_information_for_hexes`` on that selection with ``df`` as the
    neighbour pool and ``dist2`` as the neighbourhood radius.
    """
    centre = h3.h3_to_geo(x)
    selected = [
        hex_id
        for hex_id in df.index.unique()
        if hex_id != x and distance_haversine(centre, h3.h3_to_geo(hex_id)) <= dist1
    ]
    selected.append(x)

    around_x = df[df.index.isin(selected)].copy(deep=True)
    return get_nearby_information_for_hexes(around_x, df, dist2)

## Getting the best location using recursive function with ML

In [186]:
def get_best_hex_from_df2(df2: pd.DataFrame):
    """Pick the hexagon where a linear model predicts ATMs nearby but none exist.

    Steps:
      1. Keep only hexagons with ``nearby_population > 0``.
      2. Fit a ``LinearRegression`` predicting ``nearby_atm_cnt`` from
         ``dist_to_subway`` and the other ``nearby_*`` features on a 67 % train
         split (``random_state=42``).
      3. Among the held-out hexagons whose true ``nearby_atm_cnt`` is 0 and whose
         rounded prediction is off by at least 1, return the one with the highest
         prediction.

    Args:
        df2: frame indexed by ``geo_h3_10`` that contains ``lat``, ``lon``,
            ``dist_to_subway`` and the ``nearby_*`` columns. It is not modified.

    Returns:
        The ``geo_h3_10`` tag of the selected hexagon, or ``None`` when there are
        fewer than two candidate hexagons (nothing to train/test on) or no
        held-out hexagon satisfies the criteria above.
    """
    feature_columns = ['dist_to_subway', 'nearby_population', 'nearby_n_bus_stops', 'nearby_n_tram_stops',
                       'nearby_businesses']

    # reset_index() returns a new frame, so the caller's df2 is left untouched.
    candidates = df2.reset_index().drop(['lat', 'lon'], axis=1)
    candidates = candidates[candidates.nearby_population > 0]

    X = candidates[['geo_h3_10'] + feature_columns]

    # train_test_split needs at least one row on each side of the split; with a
    # single candidate it would raise instead of letting the search stop cleanly.
    if len(X) < 2:
        return None

    y = candidates['nearby_atm_cnt']

    X_train, X_test, y_train, y_test = train_test_split(X.drop('geo_h3_10', axis=1), y, test_size=0.33,
                                                        random_state=42)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    pred = lr.predict(X=X_test).round()

    # Indexed like y_test, i.e. by the row labels of `candidates` / `X`.
    results = pd.DataFrame({'model': pred,
                            'true': y_test,
                            'error': abs(pred - y_test)})

    # Hexagons without nearby ATMs where the model expects at least one.
    underserved = results[(results.error >= 1) & (results.true == 0)].sort_values('model', ascending=False)
    if underserved.empty:
        return None

    return X.geo_h3_10[underserved.index[0]]


def MaChInE_LeArNiNg(df2: pd.DataFrame, dist: float):
    """Iteratively narrow the search area around the best hexagon.

    Each round picks the best hexagon of the current frame with
    ``get_best_hex_from_df2``, then rebuilds the frame from the hexagons within
    ``dist`` of it (neighbourhood features recomputed at radius ``dist - 0.2``)
    and shrinks ``dist`` by 0.2. The search stops as soon as no hexagon is
    picked.

    Returns:
        The hexagon picked in the last successful round, or ``None`` if even
        the first round picked nothing.
    """
    last_pick = None
    while True:
        pick = get_best_hex_from_df2(df2)
        if pick is None:
            return last_pick
        last_pick = pick

        narrower = dist - 0.2
        df2 = get_nearby_information_for_radius(pick, dist, narrower, df2)
        dist = narrower



In [193]:
# Neighbourhood features (radius 0.6) for the Saint Petersburg hexagons only;
# the same subset serves both as the hexagons to describe and as the neighbour pool.
df_spb = df[df.city == "Санкт-Петербург"]
df2_spb = get_nearby_information_for_hexes(df_spb, df_spb, 0.6)

100%|██████████| 19101/19101 [20:34<00:00, 15.47it/s]


In [194]:
best_place = MaChInE_LeArNiNg(df2_spb, 0.6)
# MaChInE_LeArNiNg returns None when no candidate hexagon is found, and
# h3.h3_to_geo(None) would raise; show the (lat, lon) only when there is a result.
h3.h3_to_geo(best_place) if best_place is not None else None

ok


100%|██████████| 55/55 [00:02<00:00, 18.53it/s]


ok


100%|██████████| 22/22 [00:00<00:00, 439.79it/s]

ok





(60.05330310392827, 30.341160511496316)