# [Boosters] Raiffeisen Data Cup. Baseline
Общий подход:
- Добавляем к каждой транзакции столбец: is_work (если транзакция находится в пределах 0.02 от работы клиента)
- Добавляем к каждой транзакции столбец: is_home (если транзакция находится в пределах 0.02 от дома клиента)
- Обучаем классификатор предсказывающий вероятность (is_home == 1) для транзакции
- Обучаем классификатор предсказывающий вероятность (is_work == 1) для транзакции

Точность определения местоположения:
- для классификатора is_home: ~3x%
- для классификатора is_work: ~2x%
- общая оценка на Public Leaderboard: ???

Примечание
* Требуется Python версии 3.5
* Требуется библиотека xgboost (для обучения использовалась xgboost версии 0.7.post3)
* Требуются файлы: Reiff_test_set.csv, Reiff_train_set.csv, calendar.csv в одном каталоге с данным скриптом
* Требования к памяти: должно работать с 2Гб свободного RAM
* Время работы: ~3 минуты (тестировалось на процессоре Intel Core i7-4770)

In [1]:
import pandas as pd
import numpy as np
import datetime

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split

## Производственный календарь

In [2]:
# Calendar file: the first column is the year (used as the index), the next
# twelve columns are the months. Each cell is a comma-separated list of day
# numbers; a trailing '*' marks a pre-holiday day, anything else is a holiday.
dates = pd.read_csv(
    './calendar.csv',
    sep=';',
    skiprows=1,
    index_col=0,
    usecols=list(range(13)),
    names=['year'] + [str(m) for m in range(1, 13)],
)

# pre-holiday days
pre_holidays = set()
# holidays
holidays = set()

for year in range(2015, 2019):
    for month in dates.columns:
        for token in dates.loc[year, month].split(','):
            if token.endswith('*'):
                # strip the '*' marker before parsing the day number
                pre_holidays.add(datetime.datetime(year, int(month), int(token[:-1])))
            else:
                holidays.add(datetime.datetime(year, int(month), int(token)))

def determ(date, hol, pre_hol):
    """Encode the calendar status of ``date`` as an integer.

    Returns 2 when ``date`` is contained in ``hol``, 1 when it is contained
    in ``pre_hol`` (and not in ``hol``), and 0 otherwise.
    """
    if date in hol:
        return 2
    return 1 if date in pre_hol else 0

In [3]:
# Explicit column dtypes to reduce memory usage when reading the CSV files.
# Both the 'pos_adress_*' and the 'pos_address_*' spellings are listed because
# the train and test column lists below use different spellings.
# (The original literal listed 'atm_address' twice; the duplicate key is removed.)
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'pos_adress_lat': np.float32,
    'pos_adress_lon': np.float32,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# To save memory only a subset of the transaction attributes is loaded.
usecols_train = [
    'customer_id', 'transaction_date', 'amount', 'country', 'city',
    'currency', 'mcc',
    'pos_adress_lat', 'pos_adress_lon',
    'atm_address_lat', 'atm_address_lon',
    'home_add_lat', 'home_add_lon',
    'work_add_lat', 'work_add_lon',
]
usecols_test = [
    'customer_id', 'transaction_date', 'amount', 'country', 'city',
    'currency', 'mcc',
    'pos_address_lat', 'pos_address_lon',
    'atm_address_lat', 'atm_address_lon',
]

## Читаем train_set, test_set, соединяем в один датасет

In [4]:
# Load the training transactions and rename the misspelled 'pos_adress_*'
# columns so that train and test share the same column names.
train = pd.read_csv('Reiff_train_set.csv', dtype=dtypes, usecols=usecols_train)
train.rename(
    columns={
        'pos_adress_lat': 'pos_address_lat',
        'pos_adress_lon': 'pos_address_lon',
    },
    inplace=True,
)

test = pd.read_csv('Reiff_test_set.csv', dtype=dtypes, usecols=usecols_test)
# Submission skeleton: one row per distinct test customer.
submission = pd.DataFrame({'_ID_': test['customer_id'].unique()})

# Mark the origin of every row, then stack train and test into one DataFrame.
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test])

# The separate frames are no longer needed once they are combined.
del train, test

In [5]:
# Display the last rows of the combined train/test frame.
dt.tail()

Unnamed: 0,amount,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address_lat,pos_address_lon,transaction_date,work_add_lat,work_add_lon
1265465,3.712308,,,CHALTYR,RUS,643.0,159673c2adc0b2be66aa48e9f80f738a,,,0,6011,,,2017-09-16,,
1265466,3.717292,,,ABAKAN,RU,643.0,9c3bb46e374df29c69b44a19cdbf929b,,,0,6011,,,2017-10-15,,
1265467,5.005392,,,CHEBOKSARY,RU,643.0,90309ee8223cddc27349e9f32eb8ad1e,,,0,6011,,,2017-11-04,,
1265468,3.948304,,,Perm,RUS,643.0,3696f3e678c786b53f8aecd9e1a7b647,,,0,6011,,,2017-09-26,,
1265469,3.527621,,,AGEROLA,ITA,978.0,213e6bbf6548701df2a23762c92da044,,,0,6011,,,2017-06-15,,


## Чистим данные

In [6]:
# Normalise country codes: lower-case them and trim surrounding whitespace.
# The vectorised .str accessor propagates missing values (NaN), whereas a
# per-element x.lower() call raises AttributeError on them.
dt['country'] = dt['country'].str.lower().str.strip()

In [7]:
# Display the most frequent country codes after normalisation.
dt['country'].value_counts().head()

rus    2466606
ru       15570
deu        522
cze        380
bgr        374
Name: country, dtype: int64

In [8]:
# Fold the two-letter code 'ru' into the three-letter code 'rus'.
dt.loc[dt['country'] == 'ru', 'country'] = 'rus'

In [9]:
import re

# Matches every run of characters that is not a Latin letter.
_non_latin = re.compile("[^a-zA-Z]+")

# Normalise city names: stringify, lower-case, trim, then keep Latin letters only.
dt['city'] = dt['city'].map(lambda name: _non_latin.sub("", str(name).lower().strip()))

In [10]:
# Canonical city name -> alternative spellings that are folded into it.
# Grouping by canonical name makes duplicates easy to spot; the original flat
# literal listed 'nizhnynovgor' twice.
_city_aliases = {
    'moscow': [
        'moskva', 'msk', 'moskow', 'gmoskva', 'moskvag',
    ],
    'moscowregion': [
        'mo', 'moskovskayao', 'moskovskobl', 'moscowreg', 'moscowobl',
    ],
    'stpetersburg': [
        'sanktpeterbu', 'stpeterburg', 'saintpetersb', 'speterburg',
        'sanktpetersb', 'stpetersbur', 'stpete', 'sanktpeters', 'spb',
        'spetersburg', 'saintpeterbu',
    ],
    'novosibirsk': [
        'nvsibr',
    ],
    'nnovgorod': [
        'nizhniynovgo', 'nizhnynovgor', 'nizjniynovgo', 'nizhnovgorod',
        'nigniynovgor', 'nizhnijnovgo', 'nijniynovgor', 'nizhniynovg',
    ],
    'rostovnadon': [
        'rostovondon', 'rostovnado', 'rostdn',
    ],
    'novorossiysk': [
        'novorossiiysk', 'novorossiisk', 'novorossijsk',
    ],
}

# Flat alias -> canonical mapping.
city_replace = {
    alias: canonical
    for canonical, aliases in _city_aliases.items()
    for alias in aliases
}

In [11]:
# Map known alternative spellings to their canonical city name; every other
# value is kept unchanged.
dt['city'] = dt['city'].map(lambda name: city_replace.get(name, name))

In [12]:
# Display how often every normalised city name occurs.
dt['city'].value_counts()

moscow          905177
stpetersburg    491978
novosibirsk      49497
ekaterinburg     47675
nnovgorod        46705
cherepovets      41410
krasnoyarsk      34383
krasnodar        33836
samara           23776
kazan            21751
sochi            18287
moscowregion     17674
novorossiysk     16747
ufa              14995
podolsk          14900
rostovnadon      14539
orel             14303
omsk             14186
voronezh         14176
khimki           13985
chelyabinsk      13930
syktyvkar        13873
petrozavodsk     13246
yaroslavl        12846
kaluga           10898
volgograd        10779
anapa            10546
perm              9916
krasnogorsk       8873
kirov             8781
                 ...  
bobino               1
mocolnechnog         1
krasnyebarri         1
tamansttsa           1
arroyodela           1
joshkarola           1
yamigora             1
irig                 1
tsb                  1
suhoylog             1
denhaag              1
claremont            1
podyachevo 

## Генерируем геофичи клиента

In [13]:
# Number of distinct countries and cities per customer, attached to every
# transaction of that customer.
for source_col, feature_col in (('country', 'ncountries'), ('city', 'ncities')):
    per_customer = dt.groupby('customer_id')[source_col].nunique().reset_index(name=feature_col)
    # The two frames share only 'customer_id', which becomes the join key.
    dt = dt.merge(per_customer, how='left')
    dt[feature_col] = dt[feature_col].astype(np.int32)

In [14]:
# Display the last rows, now including the ncountries / ncities columns.
dt.tail()

Unnamed: 0,amount,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address_lat,pos_address_lon,transaction_date,work_add_lat,work_add_lon,ncountries,ncities
2490199,3.712308,,,chaltyr,rus,643.0,159673c2adc0b2be66aa48e9f80f738a,,,0,6011,,,2017-09-16,,,1,5
2490200,3.717292,,,abakan,rus,643.0,9c3bb46e374df29c69b44a19cdbf929b,,,0,6011,,,2017-10-15,,,1,7
2490201,5.005392,,,cheboksary,rus,643.0,90309ee8223cddc27349e9f32eb8ad1e,,,0,6011,,,2017-11-04,,,1,7
2490202,3.948304,,,perm,rus,643.0,3696f3e678c786b53f8aecd9e1a7b647,,,0,6011,,,2017-09-26,,,1,2
2490203,3.527621,,,agerola,ita,978.0,213e6bbf6548701df2a23762c92da044,,,0,6011,,,2017-06-15,,,2,6


### Обрабатываем дату транзакции и категориальные признаки

In [15]:
def _parse_mcc(raw):
    """Convert an MCC string to int, ignoring any ',' characters in it."""
    return int(raw.replace(',', ''))


def _parse_date(raw):
    """Parse a 'YYYY-MM-DD' string into a datetime."""
    return datetime.datetime.strptime(raw, '%Y-%m-%d')


# Missing currency codes become -1 so that the column can be stored as int32.
dt['currency'] = dt['currency'].fillna(-1).astype(np.int32)
dt['mcc'] = dt['mcc'].apply(_parse_mcc).astype(np.int32)

# Replace city / country names by integer codes.
for categorical in ('city', 'country'):
    codes = dt[categorical].factorize()[0]
    dt[categorical] = codes.astype(np.int32)

# Remove transactions without a date, then parse the remaining dates.
undated = dt['transaction_date'].isnull()
dt.drop(dt[undated].index, axis=0, inplace=True)
dt['transaction_date'] = dt['transaction_date'].apply(_parse_date)

### Фичи для даты

In [16]:
# Day of the week of every transaction, stored as int32.
weekday = dt['transaction_date'].dt.weekday
dt['weekday'] = weekday.astype(np.int32)

In [17]:
def _holiday_code(day):
    """Return determ()'s code for ``day`` using the calendar sets built above."""
    return determ(day, holidays, pre_holidays)


# 2 = holiday, 1 = pre-holiday, 0 = ordinary day.
dt['holiday'] = dt['transaction_date'].apply(_holiday_code)

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [18]:
# Channel flags: a non-null latitude marks the row as an ATM / POS transaction.
dt['is_atm'] = dt['atm_address_lat'].notnull().astype(np.int32)
dt['is_pos'] = dt['pos_address_lat'].notnull().astype(np.int32)

# Merge the ATM and POS coordinates into one pair of columns. Missing values
# count as 0 and the two sources are summed, so the result is only meaningful
# when at most one of the two addresses is present in a row.
# NOTE(review): presumably ATM and POS coordinates are mutually exclusive -- confirm.
dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

dt.drop(['atm_address_lat', 'atm_address_lon', 'pos_address_lat', 'pos_address_lon'], axis=1, inplace=True)

# Drop transactions without an address, i.e. rows where both coordinates ended
# up as 0. The original condition compared address_lon twice and never looked
# at the latitude.
no_address = (dt['address_lat'] == 0) & (dt['address_lon'] == 0)
dt.drop(dt[no_address].index, axis=0, inplace=True)

In [19]:
# Display the first rows after the address columns have been unified.
dt.head()

Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,transaction_date,work_add_lat,work_add_lon,ncountries,ncities,weekday,holiday,is_atm,is_pos,address_lat,address_lon
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,2017-07-15,59.847,30.177,1,1,5,2,0,1,59.844074,30.179153
1,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,2017-10-27,59.847,30.177,1,1,4,0,0,1,59.844074,30.179153
2,3.708368,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5992,2017-10-03,59.847,30.177,1,1,1,0,0,1,59.8582,30.229023
3,2.787498,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,2017-09-09,59.847,30.177,1,1,5,2,0,1,59.844074,30.179153
4,2.89251,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,1,5261,2017-07-06,59.847,30.177,1,1,3,0,0,1,59.844074,30.179153


### Генерируем признаки is_home, is_work

In [20]:
# (reference latitude, reference longitude, "is near" flag, "is known" flag)
place_columns = (
    ('home_add_lat', 'home_add_lon', 'is_home', 'has_home'),
    ('work_add_lat', 'work_add_lon', 'is_work', 'has_work'),
)

for lat_col, lon_col, near_col, known_col in place_columns:
    lat = dt[lat_col] - dt['address_lat']
    lon = dt[lon_col] - dt['address_lon']
    # 1 when the transaction lies within 0.02 (in coordinate units) of the
    # reference point; rows with unknown reference coordinates get 0 because
    # comparisons with NaN are False.
    dt[near_col] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
    # 1 when the reference longitude is known for this row.
    dt[known_col] = (~dt[lon_col].isnull()).astype(np.int32)

# The raw reference coordinates are not used as model features.
dt.drop(['work_add_lat', 'work_add_lon', 'home_add_lat', 'home_add_lon'], axis=1, inplace=True)

### Генерируем категориальный признак для адреса

In [21]:
def _two_decimals(value):
    """Format a coordinate with two digits after the decimal point."""
    return "%.02f" % value


# Categorical address id: coordinates rounded to two decimals identify a cell,
# and every distinct 'lat;lon' string is replaced by an integer code.
lat_key = dt['address_lat'].apply(_two_decimals)
lon_key = dt['address_lon'].apply(_two_decimals)
dt['address'] = (lat_key + ';' + lon_key).factorize()[0].astype(np.int32)

### Генерируем несколько абонентских фич

In [22]:
# Transaction counts per customer ('tx') and per customer/address pair
# ('tx_cust_addr'), attached to every row via a left merge on the shared
# grouping columns.
count_features = (
    ('customer_id', 'tx'),
    (['customer_id', 'address'], 'tx_cust_addr'),
)
for group_keys, feature_col in count_features:
    counts = dt.groupby(group_keys)['amount'].count().reset_index(name=feature_col)
    dt = dt.merge(counts, how='left')
    dt[feature_col] = dt[feature_col].astype(np.int32)

# Share of the customer's transactions that happened at this address.
dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']

## Вспомогательные функции для оценки точности классификатора

In [23]:
def _best(x):
    """Select the highest-scored transaction of one customer for every target.

    ``x`` is a DataFrame with the transactions of a single group. For each
    target in the module-level ``ys`` whose ``pred:<target>`` column exists in
    ``x``, the row with the maximal prediction is taken. Its prediction, its
    coordinates (renamed to ``<target>:add_lat`` / ``<target>:add_lon``) and,
    if the column is present, the ``<target>`` label are kept.

    Returns the selected values of all targets concatenated into one Series,
    or ``None`` when ``x`` has no prediction column at all.
    """
    picked = []
    for target in ys:
        pred_col = 'pred:%s' % target
        if pred_col not in x:
            continue
        wanted = [pred_col, 'address_lat', 'address_lon']
        if target in x:
            wanted.append(target)
        top_row = x.loc[x[pred_col].idxmax(), wanted].rename({
            'address_lat': '%s:add_lat' % target,
            'address_lon': '%s:add_lon' % target,
        })
        picked.append(top_row)

    if not picked:
        return None
    if len(picked) == 1:
        return picked[0]
    return pd.concat(picked)

In [24]:
def predict_proba(dt, ys=('is_home', 'is_work')):
    """Score every transaction and keep the best one per customer.

    For each target in ``ys`` the fitted classifier ``model[target]`` is
    applied to the feature columns ``xs`` and the probability of class 1 is
    stored in ``dt`` as column ``pred:<target>``. ``dt`` is modified in place.

    Parameters:
        dt: DataFrame with the columns listed in ``xs``, ``customer_id``,
            ``address_lat`` and ``address_lon``.
        ys: iterable of target names. The default is an immutable tuple
            instead of a shared mutable list.

    Returns:
        A DataFrame with one row per ``customer_id``, built by ``_best``.
        Note that ``_best`` iterates the module-level ``ys`` and uses every
        ``pred:<target>`` column it finds in ``dt``.
    """
    for col in ys:
        pred = ('pred:%s' % col)
        # column 1 of predict_proba holds the probability of the positive class
        dt[pred] = model[col].predict_proba(dt[xs])[:, 1]
    return dt.groupby('customer_id').apply(_best).reset_index()

In [25]:
def score(dt, ys=('is_home', 'is_work')):
    """Return the average per-target accuracy of the per-customer predictions.

    ``predict_proba`` selects one transaction per customer. For every target
    in ``ys`` the mean of the ``<target>`` column over those selected rows is
    computed, and the result is the average of these means over all targets.

    The original code divided by the number of targets only when there were
    exactly two of them, so any other multi-target call returned a sum rather
    than a mean. Results for one or two targets are unchanged.

    Parameters:
        dt: DataFrame accepted by ``predict_proba``; it must also contain the
            label columns named in ``ys``.
        ys: iterable of target names (immutable tuple by default).

    Returns:
        The averaged score, or 0.0 when ``ys`` is empty.
    """
    dt_ret = predict_proba(dt, ys)
    if len(ys) == 0:
        return 0.0
    return sum(dt_ret[col].mean() for col in ys) / len(ys)

### Признаки, на которых будем обучать модель

In [26]:
# Feature columns fed to the classifiers.
xs = [
    'amount',
    'currency',
    'city',
    'country',
    'mcc',
    'is_atm',
    'is_pos',
    'ratio1',
    'weekday',
    'holiday',
    'ncountries',
    'ncities',
]
# Target columns, one classifier per target.
ys = [
    'is_home',
    'is_work',
]

# Создаем классификаторы
**Hint**: можно поиграться с гиперпараметрами для лучшего результата :)

In [27]:
# Hyper-parameters shared by both classifier templates.
_shared_params = dict(n_jobs=3, max_depth=9, min_child_weight=9, seed=42)

# Unfitted classifier templates, one per target; they differ only in the
# number of trees and the learning rate.
model0 = {
    'is_home': xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, **_shared_params),
    'is_work': xgb.XGBClassifier(n_estimators=77, learning_rate=0.2, **_shared_params),
}

# Обучаем классификаторы

In [28]:
model = {}


def _transactions_of(customers):
    """Return the rows of ``dt`` that belong to the given customer ids."""
    return pd.DataFrame(customers, columns=['customer_id']).merge(dt, how='left')


# Fit the two classifiers one after another.
for col in ['is_home', 'is_work']:

    # Use only training customers for whom the home / work location is known
    # in at least one of their transactions.
    known_flag = col.replace('is_', 'has_')
    known = dt[dt['is_train'] == 1].groupby('customer_id')[known_flag].max()
    known_customers = known[known > 0].index

    # Split by customer (not by transaction) into train and validation parts.
    cust_train, cust_valid = train_test_split(known_customers, test_size=0.1, shuffle=True, random_state=2)

    train = _transactions_of(cust_train)
    valid = _transactions_of(cust_valid)

    print("Training:", col)
    # Clone the template so that model0 itself stays unfitted.
    clf = sklearn.base.clone(model0[col])
    eval_frames = [(train[xs], train[col]), (valid[xs], valid[col])]
    clf.fit(train[xs], train[col], eval_metric='logloss', eval_set=eval_frames, verbose=10)
    model[col] = clf
    print("Train accuracy:", score(train, ys=[col]))
    print("Test accuracy:", score(valid, ys=[col]))
    print()


Training: is_home
[0]	validation_0-logloss:0.673697	validation_1-logloss:0.673731
[10]	validation_0-logloss:0.552533	validation_1-logloss:0.55457
[20]	validation_0-logloss:0.498334	validation_1-logloss:0.503963
[30]	validation_0-logloss:0.471445	validation_1-logloss:0.480714
[40]	validation_0-logloss:0.456853	validation_1-logloss:0.46998
[50]	validation_0-logloss:0.447858	validation_1-logloss:0.464772
[60]	validation_0-logloss:0.441443	validation_1-logloss:0.46232
[70]	validation_0-logloss:0.436686	validation_1-logloss:0.46145
[80]	validation_0-logloss:0.43295	validation_1-logloss:0.461296
[90]	validation_0-logloss:0.429213	validation_1-logloss:0.461261
[100]	validation_0-logloss:0.425122	validation_1-logloss:0.461385
[110]	validation_0-logloss:0.421645	validation_1-logloss:0.462038
[120]	validation_0-logloss:0.419175	validation_1-logloss:0.462516
[130]	validation_0-logloss:0.415587	validation_1-logloss:0.462315
[140]	validation_0-logloss:0.412474	validation_1-logloss:0.462863
[150]	va

model0 = {
    'is_home': xgb.XGBClassifier(n_estimators=77, n_jobs=3, learning_rate=0.05, max_depth=9, min_child_weight=9, seed=42),
    'is_work': xgb.XGBClassifier(n_estimators=77, n_jobs=3, learning_rate=0.2, max_depth=9, min_child_weight=9, seed=42),
}

Training: is_home
[0]	validation_0-logloss:0.673697	validation_1-logloss:0.673731
[10]	validation_0-logloss:0.552533	validation_1-logloss:0.55457
[20]	validation_0-logloss:0.498334	validation_1-logloss:0.503963
[30]	validation_0-logloss:0.471445	validation_1-logloss:0.480714
[40]	validation_0-logloss:0.456853	validation_1-logloss:0.46998
[50]	validation_0-logloss:0.447858	validation_1-logloss:0.464772
[60]	validation_0-logloss:0.441443	validation_1-logloss:0.46232
[70]	validation_0-logloss:0.436686	validation_1-logloss:0.46145
[76]	validation_0-logloss:0.434503	validation_1-logloss:0.461327
Train accuracy: 0.41255555555555556
Test accuracy: 0.387

Training: is_work
[0]	validation_0-logloss:0.598802	validation_1-logloss:0.605154
[10]	validation_0-logloss:0.365764	validation_1-logloss:0.412241
[20]	validation_0-logloss:0.335969	validation_1-logloss:0.409287
[30]	validation_0-logloss:0.318137	validation_1-logloss:0.415745
[40]	validation_0-logloss:0.303486	validation_1-logloss:0.421908
[50]	validation_0-logloss:0.290031	validation_1-logloss:0.424302
[60]	validation_0-logloss:0.277698	validation_1-logloss:0.428595
[70]	validation_0-logloss:0.266978	validation_1-logloss:0.43906
[76]	validation_0-logloss:0.263655	validation_1-logloss:0.442873
Train accuracy: 0.35092632485997416
Test accuracy: 0.29651162790697677


## Best EVER

model0 = {
    'is_home': xgb.XGBClassifier(n_estimators=200, n_jobs=3, learning_rate=0.05, max_depth=9, min_child_weight=9, seed=42),
    'is_work': xgb.XGBClassifier(n_estimators=77, n_jobs=3, learning_rate=0.2, max_depth=9, min_child_weight=9, seed=42),
}

Training: is_home
[0]	validation_0-logloss:0.673697	validation_1-logloss:0.673731
[10]	validation_0-logloss:0.552533	validation_1-logloss:0.55457
[20]	validation_0-logloss:0.498334	validation_1-logloss:0.503963
[30]	validation_0-logloss:0.471445	validation_1-logloss:0.480714
[40]	validation_0-logloss:0.456853	validation_1-logloss:0.46998
[50]	validation_0-logloss:0.447858	validation_1-logloss:0.464772
[60]	validation_0-logloss:0.441443	validation_1-logloss:0.46232
[70]	validation_0-logloss:0.436686	validation_1-logloss:0.46145
[80]	validation_0-logloss:0.43295	validation_1-logloss:0.461296
[90]	validation_0-logloss:0.429213	validation_1-logloss:0.461261
[100]	validation_0-logloss:0.425122	validation_1-logloss:0.461385
[110]	validation_0-logloss:0.421645	validation_1-logloss:0.462038
[120]	validation_0-logloss:0.419175	validation_1-logloss:0.462516
[130]	validation_0-logloss:0.415587	validation_1-logloss:0.462315
[140]	validation_0-logloss:0.412474	validation_1-logloss:0.462863
[150]	validation_0-logloss:0.410226	validation_1-logloss:0.462982
[160]	validation_0-logloss:0.408178	validation_1-logloss:0.463098
[170]	validation_0-logloss:0.405857	validation_1-logloss:0.463493
[180]	validation_0-logloss:0.40343	validation_1-logloss:0.463837
[190]	validation_0-logloss:0.401658	validation_1-logloss:0.463933
[199]	validation_0-logloss:0.399979	validation_1-logloss:0.464056
Train accuracy: 0.43733333333333335
Test accuracy: 0.395

Training: is_work
[0]	validation_0-logloss:0.598802	validation_1-logloss:0.605154
[10]	validation_0-logloss:0.365764	validation_1-logloss:0.412241
[20]	validation_0-logloss:0.335969	validation_1-logloss:0.409287
[30]	validation_0-logloss:0.318137	validation_1-logloss:0.415745
[40]	validation_0-logloss:0.303486	validation_1-logloss:0.421908
[50]	validation_0-logloss:0.290031	validation_1-logloss:0.424302
[60]	validation_0-logloss:0.277698	validation_1-logloss:0.428595
[70]	validation_0-logloss:0.266978	validation_1-logloss:0.43906
[76]	validation_0-logloss:0.263655	validation_1-logloss:0.442873
Train accuracy: 0.35092632485997416
Test accuracy: 0.29651162790697677

# Predict

In [29]:
# Gather all transactions of the customers that occur in the test rows and
# choose the best-scored home / work transaction for each of them.
is_test_row = dt['is_train'] == 0
cust_test = dt.loc[is_test_row, 'customer_id'].unique()
test = pd.DataFrame(cust_test, columns=['customer_id']).merge(dt, how='left')
test = predict_proba(test)

# Rename to the column names of the submission format and keep only those.
submission_names = {
    'customer_id': '_ID_',
    'is_home:add_lat': '_HOME_LAT_',
    'is_home:add_lon': '_HOME_LON_',
    'is_work:add_lat': '_WORK_LAT_',
    'is_work:add_lon': '_WORK_LON_',
}
test = test.rename(columns=submission_names)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

# Формируем submission-файл

In [30]:
# Attach the predictions to the full list of test customers; customers without
# a prediction get 0 in every coordinate column.
merged = submission.merge(test, how='left')
submission = merged.fillna(0)

# Write the submission file.
submission.to_csv('baseline-tuned2.csv', index=False)

In [31]:
# Display the first predicted rows in submission column order.
test.head()

Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_
0,00021683ccb416637fe9a4cd35e4606e,55.041073,82.980629,55.041073,82.980629
1,0002d0f8a642272b41c292c12ab6e602,44.029999,42.834999,53.199818,50.173374
2,0004d182d9fede3ba2534b2d5e5ad27e,43.582001,39.720001,43.586273,39.724274
3,0008c2445518c9392cb356c5c3db3392,51.528755,46.04015,52.25695,43.762486
4,000b373cc4969c0be8e0933c08da67e1,56.237175,43.463005,56.319836,43.925976


In [32]:
# Display how many transactions fall into every address code.
dt.address.value_counts()

47       16991
387       9050
63        8766
76        8278
392       7878
84        7346
49        7120
769       6657
23        6653
4008      6044
182       5972
845       5656
322       5439
383       5272
73        5205
54        5131
312       5095
2088      4971
1028      4877
62        4846
204       4693
1684      4548
57        4520
96        4410
2822      4406
90        4175
848       4150
603       4122
846       4072
3044      4070
         ...  
9475         1
16321        1
15033        1
19726        1
12984        1
19713        1
16029        1
14275        1
19447        1
15618        1
19723        1
15353        1
19720        1
18945        1
7425         1
19144        1
15046        1
6667         1
17674        1
19145        1
19725        1
12998        1
20429        1
15619        1
4560         1
18944        1
18610        1
20657        1
10431        1
9324         1
Name: address, Length: 20775, dtype: int64