In [1]:
import re
import json
import codecs

from datetime import datetime
from collections import Counter, OrderedDict
from operator import itemgetter

import numpy as np
import pandas as pd

import gmplot
import geopy.distance

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
dtypes = {
    'amount': np.float32,
    'atm_address': str,
    'atm_address_lat': np.float32,
    'atm_address_lon': np.float32,
    'city': str,
    'country': str,
    'currency': np.float32,
    'customer_id': str,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'mcc': str,
    'pos_address': str,
    'pos_address_lat': np.float32,
    'pos_address_lon': np.float32,
    'terminal_id': str,
    'transaction_date': str,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32
}

In [3]:
use_columns = dtypes.keys()

df_1 = pd.read_csv("train_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
df_1["is_train"] = True
df_1.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,is_train
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,True
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,True
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,True
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,True
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,True


In [4]:
use_columns = filter(lambda k: not (k.startswith('home_') or k.startswith('work_')), dtypes.iterkeys())

df_2 = pd.read_csv("test_set.csv", sep=',', encoding='utf-8', dtype=dtypes, usecols=use_columns)
df_2["mcc"] = df_2["mcc"].map(lambda x: re.sub(',', '', x))
df_2["is_train"] = False
df_2.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,is_train
0,2.211818,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,2-1 TOLMACHEVA STRIVANTEEVKA141280 RUSRUS,55.967487,37.913681,ff0476dae4b098a7b16aabe93d4268df,2017-08-24,False
1,1.331379,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,4111,"SOVETSKIJ, 32IVANTEEVKA141282 RUSRUS",55.971294,37.905186,7cfd9a60282459d4692ecc85b856072e,2017-08-12,False
2,2.608004,,,,PUSHKINO,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"105,KRASOARMEYSKOE SHPUSHKINO141206 RUSRUS",56.01659,37.9091,7e5a532f0029861d8a9c4f0479b9450b,2017-06-17,False
3,1.916752,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5411,"G. IVANTEEVKA, UL.TOLMACHEVA, D.6IVANTEEVKA141...",55.964508,37.937912,2afe7d1bc61b86c449f413bdf2119032,2017-08-12,False
4,1.981067,,,,MOSCOW,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,5814,5 KOMSOMOLSKAYA SQMOSCOW101000 RUSRUS,55.776802,37.657352,ab4f00601ff1d949afc59ee3f804c79c,2017-04-26,False


In [5]:
df = pd.concat([df_1, df_2], axis=0)

In [6]:
def map_country(country):
    country = country.strip()
    map_c = {
         'AE': 'ARE',
         'AM': 'ARM',
         'AT': 'AUT',
         'AU': 'AUS',
         'AZ': 'AZE',
         'BG': 'BGR',
         'BH': 'BHR',
         'BS': 'BHS',
         'BY': 'BLR',
         'CA': 'CAN',
         'CH': 'CHE',
         'CN': 'CHN',
         'CU': 'CUB',
         'CV': 'CPV',
         'CY': 'CYP',
         'CZ': 'CZE',
         'DE': 'DEU',
         'DK': 'DNK',
         'DN': 'DNK',
         'DO': 'DOM',
         'EE': 'EST',
         'ES': 'ESP',
         'FI': 'FIN',
         'FR': 'FRA',
         'GB': 'GBR',
         'GE': 'GEO',
         'GR': 'GRC',
         'HK': 'HKG',
         'HR': 'HRV',
         'HU': 'HUN',
         'ID': 'IND',
         'IL': 'ISR',
         'IN': 'IND',
         'IS': 'ISL',
         'IT': 'ITA',
         'KG': 'KGZ',
         'KR': 'KOR',
         'KZ': 'KAZ',
         'LK': 'LKA',
         'LT': 'LTU',
         'LU': 'LUX',
         'LV': 'LVA',
         'MD': 'MDA',
         'ME': 'MNE',
         'MK': 'MKD',
         'MV': 'MDV',
         'MY': 'MYS',
         'NL': 'NLD',
         'NO': 'NOR',
         'PH': 'PHL',
         'PL': 'POL',
         'PT': 'PTR',
         'QA': 'QAT',
         'RU': 'RUS',
         'SE': 'SWE',
         'SG': 'SGP',
         'TH': 'THA',
         'TN': 'TUN',
         'TR': 'TUR',
         'TZ': 'TZA',
         'UA': 'UKR',
         'US': 'USA',
         'UZ': 'UZB',
         'VN': 'VNM',
         'ZZ"': '',
         '-bar Campus"': '',
    }
    return map_c[country] if country in map_c else country
    
df["country"] = df["country"].map(map_country)

In [7]:
df_atms = pd.read_csv("data/atm_features.csv", sep=',', encoding='utf-8')
# df_atms.set_index('terminal_id', inplace=True)
# df_atms["city"].fillna('', inplace=True)
df_atms.head()

Unnamed: 0,terminal_id,access24h,bank,city,is_office,lat,lon,n_banks
0,c693dcbafb5e0c1f9a58ac0211d79ed8,False,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
1,3ecc362617966cb7f4563c9e4e89b8a1,True,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
2,cd614b782e35146bacbad19e800e0624,False,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
3,fca99d9df0e01bdebca13824e1db1dfe,False,Россельхозбанк,DERBENT,True,42.056451,48.298047,4
4,fabf56c3e89566e248ea94636cead5cb,False,unknown,,False,42.061501,48.290501,5


In [8]:
df = pd.merge(
    df, df_atms[['terminal_id', 'city']], how='left',
    left_on=['terminal_id'], right_on=['terminal_id'],
    suffixes=('', '_')
)

mask = df["city_"].notnull()
df.loc[mask, "city"] = df.loc[mask, "city_"]
df.drop(labels='city_', axis=1, inplace=True)
df.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
0,2.884034,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177
1,2.775633,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177
2,3.708368,,,,St Petersburg,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177
3,2.787498,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177
4,2.89251,,,,ST PETERSBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177


In [9]:
# http://vinfo.russianpost.ru/database/ops.html

df_pochta = pd.read_csv("data/pochta_index.csv", sep=';', encoding='utf-8')
df_pochta.columns = map(lambda s: re.sub(r"[^\w\d]+", "", s), df_pochta.columns)
df_pochta.set_index("INDEX", inplace=True)
df_pochta.head()

Unnamed: 0_level_0,OPSNAME,OPSTYPE,OPSSUBM,REGION,AUTONOM,AREA,CITY,CITY_1,ACTDATE,INDEXOLD
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
101000,МОСКВА,О,127950.0,МОСКВА,,,,,21.01.2011,
101300,МОСКВА-300,ТИ,101000.0,МОСКВА,,,,,21.02.2012,
101700,УФПС Г. МОСКВЫ,УФПС,,МОСКВА,,,,,18.05.2017,
101749,МОСКВА-ДТИ,ДТИ,101000.0,МОСКВА,,,,,09.11.2006,
101751,МОСКВА-ДТИ,ДТИ,101000.0,МОСКВА,,,,,09.11.2006,


In [10]:
class Translit(object):
    translit_table = {
        u'А': 'A',
        u'Б': 'B',
        u'В': 'V',
        u'Г': 'G',
        u'Д': 'D',
        u'Е': 'E',
        u'Ё': 'E',
        u'Ж': 'ZH',
        u'З': 'Z',
        u'И': 'I',
        u'Й': 'J',
        u'К': 'K',
        u'Л': 'L',
        u'М': 'M',
        u'Н': 'N',
        u'О': 'O',
        u'П': 'P',
        u'Р': 'R',
        u'С': 'S',
        u'Т': 'T',
        u'У': 'U',
        u'Ф': 'F',
        u'Х': 'KH',
        u'Ц': 'TS',
        u'Ч': 'CH',
        u'Ш': 'SH',
        u'Щ': 'SHCH',
        u'Ъ': '',
        u'Ы': 'Y',
        u'Ь': '',
        u'Э': 'E',
        u'Ю': 'JU',
        u'Я': 'JA'
    }

    @staticmethod
    def translit(word):
        word = word.upper()
        word = re.sub(ur'ЬЕ', u'YE', word, re.U)
        word = re.sub(ur'ЬИ', u'YI', word, re.U)

        word_ = []
        for c in word:
            if c in Translit.translit_table:
                c = Translit.translit_table[c]
            word_.append(c)
        return ''.join(word_)

print Translit.translit(u'ленинградская обл'.upper())

LENINGRADSKAJA OBL


In [11]:
def translit_region_or_city(region):
    region = re.sub(r'[^\w\s]+', ' ', region.upper(), flags=re.U)
    region = re.sub(r'\d+', ' ', region, flags=re.U)
    
    region = re.sub(u'ИМЕНИ', u'ИМ', region, flags=re.U)
    region = re.sub(u'РЕСПУБЛИКА', '', region, flags=re.U)
    region = re.sub(u'ОБЛАСТЬ', u'ОБЛ', region, flags=re.U)
    region = re.sub(u'АВТОНОМНЫЙ ОКРУГ', u'АО', region, flags=re.U)
    
    if u'ЯКУТИЯ' in region:
        region = u'ЯКУТИЯ'
    elif u'АЛАНИЯ' in region:
        region = u'СЕВЕРНАЯ ОСЕТИЯ'
    elif u'САНКТ ПЕТЕРБУРГ' == region:
        region = u'СТ ПЕТЕРБУРГ'
        
    region = re.sub(r'\s+', ' ', region.strip(), flags=re.U)
    return Translit.translit(region)

In [12]:
spellchekers_dict = set()

for s in df_pochta["OPSNAME"].unique():
    s = re.sub(ur'[^а-яА-Я ]+', ' ', s)
    s = re.sub(r'\s+', ' ', s.strip())
    if len(s.split()) == 1:
        s = translit_region_or_city(s)
        spellchekers_dict.add(s)
        
spellchekers_dict.add(translit_region_or_city(u'БОЛЬШОЙ ИСТОК'))
spellchekers_dict.add(translit_region_or_city(u'ЦЕМДОЛИНА'))

In [13]:
df_pochta.loc[df_pochta["CITY"].isnull(), "CITY"] = df_pochta.loc[df_pochta["CITY"].isnull(), "REGION"]
df_pochta.loc[df_pochta["CITY"].isnull(), "CITY"] = df_pochta.loc[df_pochta["CITY"].isnull(), "AUTONOM"]
df_pochta.loc[df_pochta["REGION"].isnull(), "REGION"] = df_pochta.loc[df_pochta["REGION"].isnull(), "AUTONOM"]
df_pochta.loc[df_pochta["CITY_1"].isnull(), "CITY_1"] = df_pochta.loc[df_pochta["CITY_1"].isnull(), "CITY"]

df_pochta = df_pochta[["REGION", "CITY", "CITY_1"]]
df_pochta.head()

Unnamed: 0_level_0,REGION,CITY,CITY_1
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101000,МОСКВА,МОСКВА,МОСКВА
101300,МОСКВА,МОСКВА,МОСКВА
101700,МОСКВА,МОСКВА,МОСКВА
101749,МОСКВА,МОСКВА,МОСКВА
101751,МОСКВА,МОСКВА,МОСКВА


In [14]:
region_uniq = {i: translit_region_or_city(i) for i in df_pochta["REGION"].unique()}
cities_uniq = {i: translit_region_or_city(i) for i in df_pochta["CITY"].unique()}
cities_small_uniq = {i: translit_region_or_city(i) for i in df_pochta["CITY_1"].unique()}

In [15]:
df_pochta["REGION"] = df_pochta["REGION"].map(lambda x: region_uniq[x])
df_pochta["CITY"] = df_pochta["CITY"].map(lambda x: cities_uniq[x])
df_pochta["CITY_1"] = df_pochta["CITY_1"].map(lambda x: cities_small_uniq[x])

df_pochta.loc[188660:188665]

Unnamed: 0_level_0,REGION,CITY,CITY_1
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
188660,LENINGRADSKAJA OBL,BUGRY,BUGRY
188661,LENINGRADSKAJA OBL,NOVOE DEVJATKINO,NOVOE DEVJATKINO
188662,LENINGRADSKAJA OBL,MURINO,MURINO
188663,LENINGRADSKAJA OBL,KUZMOLOVSKIJ,KUZMOLOVSKIJ
188664,LENINGRADSKAJA OBL,TOKSOVO,TOKSOVO
188665,LENINGRADSKAJA OBL,OSELKI,OSELKI


In [16]:
def correct_names(city):
    city = re.sub('\bNV\b', 'NOVO', city, flags=re.U)
    
    if city in {
        'ADYGEY',
        'RESP ADIGEYA',
        'ADYIGEYA',
        'ADIGEYA',
        'ADYGEJA',
        'ADYGEYA RESP'
    }:
        city = 'ADYGEJA'

    elif city in {
        'ATHENS',
        'ATIENS CENTER',
        'ATIENS REGION'
    }:
        city = 'ATHENS'

    elif city in {
        'AKSAI',
        'AKSAY',
        'AKSAIY',
        'AKSAIY S',
    }:
        city = 'AKSAJ'

    elif city in {
        'ALMETEVSK',
        'ALMETYEVSK'
    }:
        city = 'ALMETYEVSK'

    elif city in {
        'ANTALIA',
        'ANTALYA'
    }:
        city = 'ANTALJA'

    elif city in {
        'ANGARSK',
        'MITYSHCHY'
    }:
        city = 'ANGARSK'

    elif city in {
        'ARHANGEL SK',
        'ARHANGELSK',
        'ARKHANGELSK'
    }:
        city = 'ARKHANGELSK'

    elif city in {
        'ARCHANGELSKOE',
        'ARKHANGELSKOE'
    }:
        city = 'ARKHANGELSKOE'
        
    elif city in {
        'AHTIR',
        'AHTYRSKI',
        'AKHATYRSKIY',
        'AKHTYRSKIY'
    }:
        city = 'AKHTYRSKIJ'

    elif city in {
        'ASTRAHAN',
        'ASTRAKHAN'
    }:
        city = 'ASTRAKHAN'

    elif city in {
        'ATEPTSEVO',
        'ATEPTSEVO VIL'
    }:
        city = 'ATEPTSEVO'

    elif city in {
        'AVSIUNINO',
        'AVSJUNINO',
        'AVSYUNINO',
        'AVSYUNINO DO'
    }:
        city = 'AVSJUNINO'
        
    elif city in {
        'AKSAY',
        'AXAY',
        'AKSAJ'
    }:
        city = 'AKSAY'

    elif city in {
        'BALABANOVA',
        'BALABANOVO',
        'BALABANOVO 1'
    }:
        city = 'BALABANOVO'

    elif 'BALASHIKHA' in city:
        city = 'BALASHIKHA'
    elif city in {
        'BALAKHNA',
        'BALASHIHA',
        'BALASHIKHA',
        'BALASHICHA',
        'BALASHIKA'
    }:
        city = 'BALASHIKHA'

    elif city in {
        'BARNAU',
        'BARNAUL'
    }:
        city = 'BARNAUL'

    elif city in {
        'BARVIHA',
        'BARVIKHA',
        'BARVIKHA D'
    }:
        city = 'BARVIKHA'

    # Пробел важен
    elif 'BELGOROD ' in city:
        city = 'BELGOROD'
        
    elif city in {
        'BELGRAD',
        'BEOGRAD'
    }:
        city = 'BEOGRAD'

    elif city in {
        'FAUSTOVO',
        'BELOOZERSKIY',
        'BELOOZESKIJ'
    }:
        city = 'BELOOZERSKIJ'

    elif city in {
        'BYKOVO',
        'BYKOVO P',
        'BYKOVO RAMENS',
        'BYKOVO RP',
        'BIKOVO'
    }:
        city = 'BYKOVO'

    elif city in {
        'BRONNICE',
        'BRONNICY',
        'BRONNITSI',
        'BRONNITSY'
    }:
        city = 'BRONNITSY'

    elif city in {
        'CHEBOKSARY',
        'CHEBOKSARI',
        'CHEBOKSARYI'
    }:
        city = 'CHEBOKSARY'

    elif city in {
        'CHEREPOVEC',
        'CHEREPOVETS'
    }:
        city = 'CHEREPOVETS'

    elif city in {
        'CHELJABINSK',
        'CHELYABINSK',
        'CHELIABINSK',
        'CHELYABINSK R',
        'CHELYAINSK',
        'CHEKYABINSK',
        'HANTE MANSIIS'
    }:
        city = 'CHELJABINSK'

    elif city in {
        'CHECHOV',
        'CHEKHOV',
        'CHEHOV'
    }:
        city = 'CHEKHOV'

    elif city in {
        'DOLGOPRUDNIY',
        'DOLGOPRUDNYJ',
        'DOLGOPRUDNYY',
        'DOLGOPRUDNII',
        'DOLGOPA'
    }:
        city = 'DOLGOPRUDNYJ'

    elif city in {
        'DZERZHINSKIY',
        'DZERZHINSK'
    }:
        city = 'DZERZHINSKIJ'

    elif city in {
        'ELEKTROSTAL',
        'ELEKTROSTALE'
    }:
        city = 'ELEKTROSTAL'

    elif city in {
        'IRKUTS',
        'IRKUTSK'
    }:
        city = 'IRKUTSK'

    elif city in {
        'ILINSKIY',
        'ILYINSKIY'
    }:
        city = 'ILYINSKIJ'

    elif city in {
        'ISTRINSKII R',
        'LESHKOVO'
    }:
        city = 'LESHKOVO'

    elif city in {
        'FRJAZINO',
        'FRYAZINO'
    }:
        city = 'FRJAZINO'

    elif city in {
        'KAZAN',
        'KAZANE',
        'KAZANC',
        'GOTOD KAZAN',
        'GOROD KAZAN'
    }:
        city = 'KAZAN'

    elif city in {
        'KEMERV',
        'KEMEROVO'
    }:
        city = 'KEMEROVO'

    elif city in {
        'KHABAROVSK',
        'HABAROVSK'
    }:
        city = 'KHABAROVSK'

    # Пробел важен
    elif 'KHIMKI ' in city or city in {
        'KHIMKI',
        'HIMKI',
        'KHIMKI MKR P',
        'KHIMKINSKIY R',
        'KHIMKM'
    }:
        city = 'KHIMKI'

    elif city in {
        'KRASNOJARSK',
        'KRASNOYARSK'
    }:
        city = 'KRASNOJARSK'

    elif city in {
        'KUDROVO',
        'KUDROVO D'
    }:
        city = 'KUDROVO'

    elif city in {
        'KOTELNIKI',
        'KOTELNIKY'
    }:
        city = 'KOTELNIKI'

    elif city in {
        'KOROLEV',
        'KOROLYEV',
        'KOROLEV MKR',
        'KOROLEV MKR Y',
        'KOROLEV POS P',
        'KOROLEVA',
        'KOROLV'
    }:
        city = 'KOROLEV'

    elif city in {
        'LENINGR OBL',
        'LENINGRAD OB',
        'LENINGRAD OBL',
        'LENINGRAD REG',
        'LENINGRADSK O',
        'LENINGRADSKA',
        'LENINGRADSKAY',
        'LENINGRADSKII',
        'LEN OB MIS',
        'LEN OBLAST',
        'L O'
    }:
        city = 'LENINGRADSKAJA OBL'

    elif city in {
        'LIPECK',
        'LIPETSK'
    }:
        city = 'LIPETSK'

    elif city in {
        'LYUBERCY',
        'LYUBERTSY',
        'LYUBERTSI',
        'LYUBERCE',
        'LUBERTSY',
        'LUBERCY',
        'LYUBERCI'
    }:
        city = 'LJUBERTSY'

    elif city in {
        'MALAHOVKA',
        'MALAKHOVKA',
        'MALAKHOVKA P',
    }:
        city = 'MALAKHOVKA'

    elif 'MOSKVA' in city:
        city = 'MOSKVA'
    elif city in {
        'MSK',
        'MOSKOW',
        'MOSCOW',
        'MOSKOW',
        'MOSKWA',
        'MOSOCW',
        'MOSKV',
        'MOSKVA',
        'MOSKVE',
        'MOSKVY',
        'MOSVA',
        '77 MOSCOW',
        'MOSCVA',
        'MOSCOW POSEL',
        'MOSCOW POS',
        'ALTUFEVO',
        'ALTUFIEVO'
        # 'KOMMUNARKA'
        # 'LIPICY'
    }:
        city = 'MOSKVA'

    elif city in {
        'MO',
        'MOSCOW OBL',
        'MOSCOW OBLAST',
        'MOSCOW REG',
        'MOSCOW REGIO',
        'MOSCOW REGION',
        'MOSKOVKAYA OB',
        'MOSKOVSK OBL',
        'MOSKOVSKAYA',
        'MOSKOVSKAYA O',
        'MOSKOVSKAYA R',
        'MOSCOWSKAYA O',
        'MOSKOW REGION',
        'MOS OBL',
        'MOS REGION',
        'MOSKOVSKIIY R',
        'MOSOBL',
        'MOSKOVSKIY RE',
        'MOSKOWSKAYA O',
        'MO AEROPORT'
    }:
        city = 'MOSKOVSKAJA OBL'

    elif city in {
        'MOSKOVSKII',
        'MOSKOVSKIY',
        'MOSKOVSKIY P',
        'MOSKOVSKY',
        'MOSKOVSKYI'
    }:
        city = 'MOSKOVSKIJ'

    elif city in {
        'MOSREN',
        'MOSRENTGEN'
    }:
        city = 'MOSRENTGEN'

    elif city in {
        'MOZHAISK',
        'MOZHAJSK',
        'MOZHAYSK',
        'MOZHAYSKIY R'
    }:
        city = 'MOZHAJSK'

    elif city in {
        'MYTISCHI',
        'MYTISCHY',
        'MYTISHCHI',
        'MYTISHHI',
        'MYTISHI',
        'MYTICSHI',
        'MYTISHY',
        'METISHI',
        'MYTISHCHI'
    }:
        city = 'MYTISHCHI'

    elif city in {
        'NAB CHELNY',
        'NABEREZHNYE C',
        'NABEREZHNYYE',
        'NABER CHELNY',
        'NABCHE',
        'NABEREZH CHEL',
        'NABEREZHNIYE',
        'NABEREZHNUE C',
        'NABEREZHNYIE'
    }:
        city = 'NABEREZHNYE CHELNY'

    elif city in {
        'NAHABINO',
        'NAKHABINO'
    }:
        city = 'NAKHABINO'

    elif city in {
        'N NOVGOROD',
        'NIGNIY NOVGOR',
        'NIJNIY NOVGOR',
        'NIZH NOVGOROD',
        'NIZHNIY NOV',
        'NIZHNIY NOVG',
        'NIZHNIY NOVGO',
        'NIZHNY NOVGOR',
        'NIZJNIY NOVGO',
        'NIZHNIJ NOVGO',
        'NIGHNIY NOVGO',
        'NIJNII NOVGOR',
        'NIZHNII NOVG',
        'NIZHNII NOVGO',
        'NIZHNIIY NOVG',
        'NIZHNIY NO',
        'NIZNIY NOVGOR',
        'NNOVGOROD',
        'NIGNIY NOV',
        'NIZHNIYJ NOVG',
        'NOZHNIY NOVGO',
        'NUGNIY NOVGOR'
    }:
        city = 'NIZHNIY NOVGOROD'
        
    elif city in {
        'NIZVRT',
        'N VARTOVSK',
        'NIZHNEVARTOVSK'
    }:
        city = 'NIZHNEVARTOVSK'

    elif city in {
        'NOVAYA ADYGEY',
        'NOVAYA ADYGEYA',
        'NOV ADYGEYA',
        'AUL NOVAJA AD',
        'AUL NOVAYA AD'
    }:
        city = 'NOVAJA ADYGEJA'

    elif city in {
        'NOVOKUIBESHEV',
        'NOVOKUIBYSHEV',
        'NOVOKUYBYSHEV',
    }:
        city = 'NOVOKUYBYSHEV'

    elif city in {
        'NOVOROSSIISK',
        'NOVOROSSIIYSK',
        'NOVOROSSIJSK',
        'NOVOROSSIYSK',
        'NOVOROSIYSK',
        'NOVOROSSISK',
        'NOVOROSSYSK'
    }:
        city = 'NOVOROSSIYSK'

    elif city in {
        'NVSIBR',
        'NOVOSIBIRSK',
        'NOOSIBIRSK',
        'NOVOSIBISRK'
    }:
        city = 'NOVOSIBIRSK'

    elif city in {
        'ODINCOVO',
        'ODINTSOVO',
        'ODINTSOVSKIY',
        'MO ODINCOVO'
    }:
        city = 'ODINTSOVO'

    elif city in {
        'OREL',
        'ORYEL',
        'ORYOL',
        'LIVNY'
    }:
        city = 'OREL'

    elif city in {
        'OREHOVO ZUEVO',
        'OREKHOVO ZUEV'
    }:
        city = 'OREKHOVO ZUEVO'

    elif city in {
        'PERM',
        'PERME'
    }:
        city = 'PERM'

    elif city in {
        'PETERGOF',
        'PETRODVORETS'
    }:
        city = 'PETERGOF'

    elif city in {
        'PETROZAVODS',
        'PETROZAVODSK'
    }:
        city = 'PETROZAVODSK'

    elif city in {
        'PERESLAVL',
        'PERESLAVL ZAL'
    }:
        city = 'PERESLAVL'
        
    elif city in {
        'AGROLES',
        'PLAVSK'
    }:
        city = 'PLAVSK'

    elif city in {
        'PODLSK',
        'PODOLESK',
        'PODOLSK'
    }:
        city = 'PODOLSK'

    elif city in {
        'RYAZAN',
        'RYAZANE'
    }:
        city = 'RJAZAN'

    elif city in {
        'ROSTOV NA DON',
        'ROSTOV ON DON',
        'ROSTOV N',
        'ROSTOV NA',
        'ROSTOV NA DO',
        'ROSTOV ND',
        'ROSTOV DON'
    }:
        city = 'ROSTOV NA DONU'

    elif city in {
        'C PETERBURG',
        'PETERBURG',
        'S PETER',
        'S PETERBURG',
        'S PETERSBURG',
        'SAINT PETERBU',
        'SAINT PETERSB',
        'SAN PETERBURG',
        'SANK PETERBUR',
        'SANKT PERERBU',
        'SANKT PETE',
        'SANKT PETEBUR',
        'SANKT PETER',
        'SANKT PETERBO',
        'SANKT PETERBU',
        'SANKT PETERS',
        'SANKT PETERSB',
        'SANKT PETERUB',
        'SANKT PRTRRBU',
        'SANKT PTERBUR',
        'SANQT PETERBU',
        'SANT PETERBUR',
        'SNPETERBURG',
        'SPETERBURG',
        'SPETERSBURG',
        'SPETERSRURG',
        'SPB',
        'SPB POS SHUSH',
        'ST PERESBURG',
        'ST PETER',
        'ST PETERB',
        'ST PETERBUR',
        'ST PETERBURG',
        'ST PETERSBUR',
        'ST PETERSBURG',
        'ST PETERURG',
        'ST PETESBURG',
        'ST PETRESBURG',
        'ST PETRSBURG',
        'STPETERBURG',
        'STPETERSBURG',
        'CPB',
        'STPETE',
        'A KUZNETSOV'
        # 'KOLPINO'
        # 'KRASNOE SELO'
        # 'KRONSHTADT'
    }:
        city = 'ST PETERBURG'

    elif city in {
        'SERPUHOV',
        'SERPUKHOV'
    }:
        city = 'SERPUKHOV'

    elif 'SAMARA ' in city:
        city = 'SAMARA'

    elif city in {
        'RAMENSKOE',
        'RAMENSKOYE',
        'RAMENSKIY M',
        'FABRICHNAJA',
        'FABRICHNAYA'
    }:
        city = 'RAMENSKOE'
        
    elif city in {
        'UDELNAYA',
        'RAMENSKII'
    }:
        city = 'UDELNAJA'

    elif city in {
        'SESTRORECK',
        'SESTRORETSK'
    }:
        city = 'SESTRORETSK'

    elif city in {
        'SCHELKOVO',
        'SHCHEGLOVO',
        'SHELKOVO',
        'SHCHELKOVO 3',
        'SHCHELKOVO'
    }:
        city = 'SHCHELKOVO'

    elif city in {
        'SCHERBINKA',
        'SHCHERBINKA'
    }:
        city = 'SHCHERBINKA'

    elif 'SOCHI ' in city or city in {
        'ADLER',
        'ADLE',
        'ADLER POS K',
        'SOCHI',
        'SOCHY'
    }:
        city = 'SOCHI'

    elif city in {
        'SOLNETCHNOGOR',
        'SOLNECHNOGORS'
    }:
        city = 'SOLNECHNOGORSK'

    elif city in {
        'STAREI OSKOL',
        'STARYY OSKOL',
        'STARIY OSKOL',
        'OSKOL',
        'S OSKOL'
    }:
        city = 'STARYJ OSKOL'

    elif city in {
        'SYIKTYIVKAR',
        'SYKTYVKAR'
    }:
        city = 'SYKTYVKAR'

    elif city in {
        'TOGLIATTI',
        'TOLYATTI',
        'TOLJATTI',
        'TOLEYATTI'
    }:
        city = 'TOLJATTI'

    elif city in {
        'TROICK',
        'TROITSK'
    }:
        city = 'TROITSK'

    elif city in {
        'TEMRJUK',
        'TEMRYUK',
    }:
        city = 'TEMRJUK'

    elif city in {
        'TYUMEN',
        'TUMEN',
        'TJUMEN',
    }:
        city = 'TJUMEN'

    elif city in {
        'VEL NOVGOROD',
        'VELIKIE LUKI',
        'VELIKIY NOVGO',
        'VELIKY NOVGOR',
        'VELIKIIY NOVG',
        'VELIKIJ NOVGO',
        'VELIKIY NOV',
        'VELIKIY NOVGO',
        'VELIKY NOVGOR',
        'V NOVGOROD',
        'NOVGOROD VEL',
        'NOVGOROD'
    }:
        city = 'VELIKIJ NOVGOROD'

    elif city in {
        'VORONEJ',
        'VORONEZH',
        'VORONZ',
        'VORONEG'
    }:
        city = 'VORONEZH'

    elif city in {
        'VSEVOLOJSK',
        'VSEVOLOZHSK'
    }:
        city = 'VSEVOLOZHSK'

    elif city in {
        'YAROSLAVLE',
        'YAROSLAVL'
    }:
        city = 'JAROSLAVL'

    elif city in {
        'YEKATR',
        'YEKATERINBURG',
        'EKA BURG',
        'EKATERINB',
        'EKATERINBOURG',
        'EKATERINBUR',
        'EKATERINBURTG',
        'EKATERINBYRG',
        'EKATERNIBURG',
        'EKATERUBBYRG',
        'EKATR',
        'EKETERINBURG',
        'EKT',
        'ETATERINBURG'
    }:
        city = 'EKATERINBURG'

    elif city in {
        'ZELENOGRAD KR',
        'ZELENOGRAD'
    }:
        city = 'ZELENOGRAD'

    elif city in {
        'ZHELEZN',
        'ZHELEZN NII',
        'ZHELEZNOD',
        'ZHELEZNOD II',
        'ZHELEZNODOR',
        'ZHELEZNODORO',
        'ZHELEZNODOROD',
        'ZHELEZNODOROZ',
    }:
        city = 'ZHELEZNODOROZHNYJ'

    elif city in {
        'ZHUKOVSKIY',
        'ZHUKOVSKY',
        'ZHUKOVSKII',
        'ZHUKOVSKIIY',
        'ZHUKOVSKIJ',
        'ZHUKOSKIY'
    }:
        city = 'ZHUKOVSKIJ'

    elif city in {
        'NE ZADAN',
        'RUSSIA',
        'RUSSIAN FEDER',
        'RF',
        'UNKNOWN'
    }:
        city = ''

    return city

In [17]:
def default_modify_city(c):
    c = re.sub(r"'", '', c, flags=re.U)
    c = re.sub(r"[^\w\d\s]", " ", c, flags=re.U)
    c = re.sub(r"\d+", " ", c, flags=re.U)
    c = re.sub(r"\s+", " ", c, flags=re.U)
    c = c.strip().upper()
    return c


def modify_city(c):
    try:
        c = default_modify_city(c)
        
        c = re.sub(r'^M O ', '', c, flags=re.U)
        c = re.sub(r'^R N ', '', c, flags=re.U)
        c = re.sub(r' R N$', '', c, flags=re.U)
        
        # Удаляем город
        c = re.sub(r"\bG\b", "", c, flags=re.U)
        
        # Удаляем деревня
        c = re.sub(r"\bD\b", "", c, flags=re.U)
        
        # Удаляем поселок
        c = re.sub(r"\bP\b", "", c, flags=re.U)
        
        # Удаляем городской округ
        c = re.sub(r"\bGO\b", "", c, flags=re.U)
        
        c = correct_names(c)
        
        c_splited = c.split()
        c_splited = filter(
            lambda x: x not in {
                'DIS', 'DER', 'OKR', 'D', 'G', 'VILL', 'RESP', 'MKAD',
                'KM', 'MO', 'POSELOK', 'POS', 'S', 'SELO', 'PGT'
            }, c_splited)
        c_splited = map(
            lambda x: x if x not in {'REGION', 'REGIO', 'REGI', 'REG', 'RE'}\
                else 'OBL', c_splited
        )
        c = ' '.join(c_splited)
    except TypeError:
        c = ''
    return c

df["city"] = df["city"].map(modify_city)
df.loc[df["city"].isnull(), "city"] = ""

In [18]:
class Tie(object):
    def __init__(self, is_final=False, depth=0, threshold=1.6, reverse=False):
        self.is_final = is_final
        self.threshold = threshold
        self.reverse = reverse

        self.d = dict()
        self.depth = depth
        self.cache = dict()

        self.complex_pairs = {'ZH', 'KH', 'TS', 'CH', 'SH', 'HC', 'JU', 'JA'}
        self.substitutions = [
            ('CSH', 'SHCH'),
            ('SCH', 'SCHC'),
            ('SHH', 'SHCH'),
            ('SH', 'SHCH'),
            ('CH', 'KH'),
            ('IA', 'JA'),
            ('II', 'YJ'),
            ('IU', 'JU'),
            ('YU', 'JU'),
            ('IY', 'YJ'),
            ('IY', 'JA'),
            ('YA', 'JA'),
            ('YU', 'JU'),
            ('LI', 'LYI'),
            ('NV', 'NOVO'),
            ('YE', 'E'),
            ('YI', 'YJ'),
            ('YY', 'YJ'),
            ('YO', 'E'),
            ('ZH', 'Z'),
            ('A', 'O'),
            ('C', 'TS'),
            ('B', 'V'),
            ('E', 'I'),
            ('G', 'ZH'),
            ('H', 'KH'),
            ('I', 'E'),
            ('I', 'J'),
            ('I', 'Y'),
            ('J', 'ZH'),
            ('K', 'KH'),
            ('Y', 'I'),
            ('Y', 'U'),
            ('Y', 'J'),
            ('X', 'KH'),
            ('X', 'KS')
        ]

        if reverse:
            self.complex_pairs = set([s[::-1] for s in self.complex_pairs])
            self.substitutions = [(a[::-1], b[::-1]) for a, b in self.substitutions]

    def add(self, word):
        if self.reverse:
            word = word[::-1]

        curr = self
        for ch in word:
            if ch not in curr.d:
                curr.d[ch] = Tie(depth=curr.depth + 1, reverse=self.reverse)
            curr = curr.d[ch]
        curr.is_final = True

    def __contains__(self, word):
        curr = self
        for ch in word:
            if ch in curr.d:
                curr = curr.d[ch]
            else:
                return False
        return curr.is_final

    def find_words_with_prefix(self, prefix):
        items = []

        prefix_i = 0
        curr = self

        for ch in prefix:
            if ch in curr.d:
                prefix_i += 1
                curr = curr.d[ch]
            else:
                break

        if prefix_i == len(prefix):
            for ch in curr.d.iterkeys():
                found = curr.d[ch].find_words_with_prefix('')
                found = map(lambda x: ch + x, found)
                items.extend(found)

            if curr.is_final:
                items.append('')

            items = map(lambda x: prefix + x, items)

        items = sorted(items, key=lambda x: (len(x), x))
        return items

    def find_prefix(self, word):        
        prefix_i = 0

        curr = self
        for ch in word:
            if ch in curr.d:
                prefix_i += 1
                curr = curr.d[ch]
            else:
                break

        return word[:prefix_i]

    def find(self, word):
        if not self.reverse:
            word = correct_names(word)
        else:
            word = word[::-1]

        if word not in self.cache:
            result = self.find_process(word)
            result = sorted(
                result,
                key=lambda x: (x[1], abs(len(x[0]) - len(word)))
            )
            result = result[:5]
            if self.reverse:
                result = [(r[::-1], s) for r, s in result]

            self.cache[word] = result
        else:
            result = self.cache[word]
        return result

    def check_substitution(self, word, a, b):
        return word[:len(a)] == a and self.find_prefix(b) == b

    def get_node_for_path(self, path):
        curr = self
        for ch in path:
            curr = curr.d[ch]
        return curr

    def find_process(self, word, mistakes=0):
        penalty = 0.3 * mistakes
        entries = []

        if mistakes > self.threshold:
            pass
        elif len(word) > 0:
            if 0.75 * (self.depth + len(word)) < self.depth and self.is_final:
                entries.append(('', mistakes + 0.2 + \
                                float(self.depth + len(word)) / self.depth))

            for a, b in self.substitutions:
                if self.check_substitution(word, a, b):
                    curr = self.get_node_for_path(b)
                    found = curr.find_process(word[len(a):], mistakes + 0.1 + penalty)
                    found = map(lambda x: (b + x[0], x[1]), found)
                    entries.extend(found)

            ch_ = word[0]
            for ch in self.d.iterkeys():
                if ch == ch_:
                    # правильная буква
                    found = self.d[ch].find_process(word[1:], mistakes)
                    found = map(lambda x: (ch + x[0], x[1]), found)
                    entries.extend(found)
                else:
                    # заменяем букву на текущую
                    found = self.d[ch].find_process(word[1:], mistakes + 1 + penalty)
                    found = map(lambda x: (ch + x[0], x[1]), found)
                    entries.extend(found)
                    # текущая буква была пропущена
                    p = 0.1 if ch + word[0] in self.complex_pairs else 0.5
                    if ch_ == ' ':
                        p = 0.2 if len(self.d) > 1 else 0.1
                    found = self.d[ch].find_process(word, mistakes + p + penalty)
                    found = map(lambda x: (ch + x[0], x[1]), found)
                    entries.extend(found)

            # буква слова -- опечатка
            found = self.find_process(word[1:], mistakes + 1 + penalty)
            entries.extend(found)

            found = dict()
            for entry, rank in entries:
                if rank > self.threshold:
                    continue
                if entry not in found:
                    found[entry] = np.inf
                found[entry] = min(found[entry], rank)
            entries = list(found.iteritems())
        else:
            # буквы кончились
            if self.is_final:
                entries.append(('', mistakes))

            # не хватает букв в конце
            if len(self.d) > 1:
                for ch in self.d.iterkeys():
                    found = self.d[ch].find_process('', mistakes + 0.2 + penalty)
                    found = map(lambda x: (ch + x[0], x[1]), found)
                    entries.extend(found)
            elif len(self.d) == 1:
                ch = self.d.keys()[0]
                found = self.d[ch].find_process(word[1:], mistakes + 0.1 + penalty)
                found = map(lambda x: (ch + x[0], x[1]), found)
                entries.extend(found)

        return entries

In [19]:
spellchecker = Tie()

for elem in region_uniq.itervalues():
    spellchecker.add(elem)
    
for elem in cities_uniq.itervalues():
    spellchecker.add(elem)
    
for elem in cities_small_uniq.itervalues():
    spellchecker.add(elem)
    
for elem in spellchekers_dict:
    spellchecker.add(elem)
    
spellchecker.find(u'MOSKVA')

[(u'MOSKVA', 0),
 (u'MOSKOVO', 0.75),
 (u'MOSKOVKA', 1.15),
 (u'MOSKVINA', 1.15),
 (u'MOSHKOVO', 1.595)]

In [20]:
spellchecker_reverse = Tie(reverse=True)

for elem in region_uniq.itervalues():
    spellchecker_reverse.add(elem)
    
for elem in cities_uniq.itervalues():
    spellchecker_reverse.add(elem)
    
for elem in cities_small_uniq.itervalues():
    spellchecker_reverse.add(elem)
    
for elem in spellchekers_dict:
    spellchecker_reverse.add(elem)
    
spellchecker_reverse.find(u'MOSKVA')

[('MOSKVA', 0),
 ('MOSKOVO', 0.63),
 ('MOSHKOVO', 0.919),
 (u'MOSKVINA', 1.15),
 (u'MOSKOVKA', 1.15)]

In [21]:
processed_adresses = dict()

def proccess_index_case(index):
    try:
        index = int(index)        
        r_value = df_pochta.loc[index, "CITY_1"]
        return r_value
    except KeyError:
        return None


def find_tuple_in_index(city):
    if city in found_by_index:
        counter = found_by_index[city]
        return counter.most_common(1)[0][0]
    else:
        return None


def process_common_case(city):
    city = spellchecker.find(city)
    return city[0][0] if len(city) > 0 else None


def process_reversed_case(city):
    # print 'Z', city
    city_ = city
    city = spellchecker_reverse.find_words_with_prefix(city[::-1])
    city = sorted(map(lambda x: x[::-1], city), key=lambda x: (len(x), x))
    # print 'A', city
    if len(city) == 0:
        city = spellchecker_reverse.find(city_)
        city = map(itemgetter(0), city)
        # print 'B', city
    return city if len(city) > 0 else None


def process_russian_case(address):
    address = re.sub(r'"', '', address, flags=re.U)
    city = address.split(',')[0]
    city = translit_region_or_city(city)
    return city


def process_address(address, city_def, log=False):
    if address in processed_adresses:
        return processed_adresses[address]
    
    try:
        city_final = None
        city_variants = None
        
        is_russian = re.search(ur'[а-яА-Я]', address, flags=re.U) is not None
        if city_final is None and is_russian:
            city_final = process_russian_case(address)
        # print 0, city_final
        
        found_city = re.findall(r'g\.\s*([^\\,]+)', address, flags=re.U)
        if city_final is None and len(found_city) > 0:
            city = found_city[0]
            city = default_modify_city(city)
            city_final = process_common_case(city)
        # print 1, city_final
        
        prefix_str = re.search(r'^[\w-]+\s*,', address, flags=re.U)
        if city_final is None and prefix_str is not None:
            city = prefix_str.group(0)[:-1]
            city = default_modify_city(city)
            # print 1.5, city
            if len(city) > 4:
                city_variants = process_reversed_case(city)
    
        index = re.search(r'[1-9]\d{5}', address, flags=re.U)
        if city_final is None and index is not None:
            index = index.group(0)
            city_final = proccess_index_case(index)
            if city_variants is not None and city_final not in city_variants:
                city_final = city_variants[0]
        # print 2, city_final

        city = re.findall(r'\\([^\\]+)', address, flags=re.U)
        if city_final is None and len(city) > 0:
            city_final = process_common_case(city[0])
        # print 3, city_final
    except TypeError:
        city_final = None
        
    if city_final is None:
        city_final = process_common_case(city_def)
    
    city_final = city_final if city_final is not None else ''
    processed_adresses[address] = city_final
    
    if log:
        print u"'{}'\t'{}'\t->\t'{}'".format(address, city_def, city_final)
    return city_final

In [22]:
def address_has_index(address):
    try:
        return re.search(r'[1-9]\d{5}', address, flags=re.U) is not None
    except TypeError:
        return False

In [23]:
df["city_"] = ''

In [24]:
mask_russian = df["country"] == "RUS"
df.loc[np.logical_not(mask_russian), "city_"] = df.loc[np.logical_not(mask_russian), "city"]

In [25]:
cities_uniq_spellcheker = sorted(df.loc[mask_russian, "city"].unique())

cities_not_found_list = set(region_uniq.itervalues())
cities_correct_list = set()

for city in cities_uniq_spellcheker:
    if city == '':
        continue
        
    if city in spellchecker:
        cities_correct_list.add(city)
        continue

    city_m = spellchecker.find(city)
    if len(city_m) > 0:
        city_m, city_w = city_m[0]
        print "'{}' -> '{}' [{}]".format(city, city_m, city_w)
        cities_correct_list.add(city_m)
    else:
        cities_not_found_list.add(city)
        print "'{}' not found".format(city)
        
cities_not_found_list.add('')
cities_not_found_list.discard('MOSKVA')
cities_not_found_list.discard('ST PETERBURG')
cities_not_found_list.discard('MOSKOVSKAJA OBL')
cities_not_found_list.discard('LENINGRADSKAJA OBL')
cities_not_found_list.discard('SEVASTOPOL')

'A M' -> 'AIM' [1.0]
'A VO' -> 'AROVO' [1.26]
'ABINSK OBL' not found
'ABRAMCEVO' -> 'ABRAMTSEVO' [0.1]
'ABRAU DYURSO' -> 'ABRAU DJURSO' [0.1]
'AD MOS' not found
'ADYGEYA' -> 'ADYGEJA' [0.1]
'ADYGEYSK' -> 'ADYGEJSK' [0.1]
'AEROPORT DOMO' not found
'AEROPORT KRAS' not found
'AEROPORT VORO' not found
'AEROPORTA DOM' not found
'AFIPSKIY' -> 'AFIPSKIJ' [0.1]
'AFON' -> 'AFONINO' [0.568]
'AGAFONIKHA' not found
'AGALATOVSKOE' not found
'AGOY' -> 'AGOJ' [0.1]
'AGROGOROD' -> 'AGROGORODOK' [0.23]
'AHTIR' -> 'AKHTYRSKIJ' [0]
'AHTUBINSK' -> 'AKHTUBINSK' [0.1]
'AHTYRSKI' -> 'AKHTYRSKIJ' [0]
'AISHA' -> 'AJSHA' [0.1]
'AKHLEBINO' -> 'KHLEBINO' [1.0]
'AKHTANIZOVSKA' -> 'AKHTANIZOVSKAJA' [0.23]
'AKSAJ OBL' not found
'AKSAY' -> 'AKSAJ' [0]
'AKSENO BUTYRS' not found
'AKYAR' -> 'AKJAR' [0.1]
'ALABINO' -> 'KALABINO' [0.5]
'ALABUSHEVO SO' -> 'ALABUSHEVO' [1.5]
'ALEKHOVSHCHIN' -> 'ALEKHOVSHCHINA' [0.1]
'ALEKSAND' -> 'ALEKSANDROV' [0.629]
'ALEKSANDRIYSK' -> 'ALEKSANDRIJSKIJ' [0.529]
'ALEKSANDRO NE' -> 'ALEKSAND

In [26]:
mask_not_found = [c in cities_not_found_list for c in df["city"]]

In [27]:
def spellchecker_modify(city):
    try:
        city = spellchecker.find(city)
        if len(city) > 0:
            return city[0][0]
        else:
            return ''
    except TypeError:
        return ''

mask = np.logical_and(np.logical_not(mask_not_found), mask_russian)
df.loc[mask, "city_"] = df.loc[mask, "city"].map(spellchecker_modify)

In [28]:
mask_curr = np.logical_and(mask_russian, mask_not_found)

def modify_cities_address_by_mask(mask_curr):
    mask = np.logical_and(df["atm_address"].notnull(), mask_curr)
    df.loc[mask, "city_"] = df[mask].apply(
        lambda x: process_address(x['atm_address'], x['city'], True), axis=1)

    mask = np.logical_and(df["pos_address"].notnull(), mask_curr)
    df.loc[mask, "city_"] = df[mask].apply(
        lambda x: process_address(x['pos_address'], x['city'], True), axis=1)

    mask = np.logical_and(df["pos_address"].isnull(), df["atm_address"].isnull())
    mask = np.logical_and(mask, mask_curr)
    df.loc[mask, "city_"] = df[mask].apply(
        lambda x: process_address(x['atm_address'], x['city'], True), axis=1)
    
modify_cities_address_by_mask(mask_curr)

'.ZONA\GPB KRASNOJAR\660049    RUSRUS'	'GPB KRASNOJAR'	->	'KRASNOJARSK'
'EBR.PRUDI\             RUS'	'SEREBR PRUDI'	->	''
'kaya, 35      >  Esto-\Esto-Sadok\00000     RUSRUS'	'ESTO SADOK'	->	''
'ICHA 30\PERVRL\623100    RUSRUS'	'PERVRL'	->	'PERVOURALSK'
'TYABRSKAYA 20A\NOVALT\658087    RUSRUS'	'NOVALT'	->	'NOVOALTAJSK'
'TUTINA 44\PERVRL\623100    RUSRUS'	'PERVRL'	->	'PERVOURALSK'
'. PABRATIMO\127018    RUSRUS'	'ST PABRATIMO'	->	'MOSKVA'
'R\NOVOSHCHERBIN\353632    RUSRUS'	'NOVOSHCHERBIN'	->	'NOVOSHCHERBINOVSKAJA'
'KM KASHIRSKOGO SH.\APARNIKY\142715    RUSRUS'	'APARNIKY'	->	'SOVKHOZ IM LENINA'
'ONO\140155    RUSRUS'	'NOVOKHARITONO'	->	'ELEKTROIZOLJATOR'
'EREG STR\SYUKTERKA\429526    RUSRUS'	'SYUKTERKA'	->	'KHYRKASY'
'Электросталь, пос. Случайный, Массив 1'	'SLUCHAJNYJ'	->	'ELEKTROSTAL'
'STAN ST\ZELDLS\420126    RUSRUS'	'ZELDLS'	->	'KAZAN'
'BURTULINOVKA\397500    RUSRUS'	'BURTULINOVKA'	->	'BUTURLINOVKA'
'G 100 LET VLADIVOSTOKU PR\VLADVS\690090    RUSRUS'	'VLADVS'	->	'VLADIVOSTOK'
'atornay

In [29]:
spellchecker = Tie()
spellchecker_reverse = Tie(reverse=True)

mask_msc_obl = np.logical_or(
    df_pochta["REGION"] == 'MOSKOVSKAJA OBL',
    df_pochta["REGION"] == 'MOSKVA'
)

for item in df_pochta.loc[mask_msc_obl, "CITY"].unique():
    spellchecker.add(item)
    spellchecker_reverse.add(item)
    
for item in df_pochta.loc[mask_msc_obl, "CITY_1"].unique():
    spellchecker.add(item)
    spellchecker_reverse.add(item)

mask_msc_obl = df["city_"] == 'MOSKOVSKAJA OBL'
mask_curr = np.logical_and(mask_msc_obl, mask_russian)

modify_cities_address_by_mask(mask_curr)

'ja oblast', Solnechnogorskij rajon, d. Elino, str.20/2\MO\141580    46 RUS'	'MOSKOVSKAJA OBL'	->	'LUNEVO'
'dol'sk, ul. Klementa Gotval'da, 8\MO\142110    46 RUS'	'MOSKOVSKAJA OBL'	->	'PODOLSK'
'sovskij r-n, selskoe pos.Nazare\MO\143907    46 RUS'	'MOSKOVSKAJA OBL'	->	'BALASHIKHA'
'ja oblast', g. Himki, kvartal Mezhdunarodnyj, ul. Pokr\MO\141400    46 RUS'	'MOSKOVSKAJA OBL'	->	'KHIMKI'
'AVSKOGO SH\MOSKOVSKAYA O\142718    RUSRUS'	'MOSKOVSKAJA OBL'	->	'IZMAJLOVO'
'aja obl., Krasnogorskij r-n, vblizi p. Svetlye Gory\MO\143442    46 RUS'	'MOSKOVSKAJA OBL'	->	'OTRADNOE'
'mskij r-n, p. Verbilki, ul. Zab\MO\143907    46 RUS'	'MOSKOVSKAJA OBL'	->	'BALASHIKHA'
'TYSHI NOVOMITYSHINSKY PR 34/2 OTDELENYE\MOSCOW REGION\0904         RUS'	'MOSKOVSKAJA OBL'	->	''
'anteevka, ul. Zarechnaya, d. 2\MO\143907    46 RUS'	'MOSKOVSKAJA OBL'	->	'IVANTEEVKA'
'ja oblast', Odincovskij rajon, poselok Trehgorka, ulic\MO\143005    46 RUS'	'MOSKOVSKAJA OBL'	->	'ODINTSOVO'
'tishhi, Novomytishhinskij pr-t, d. 3/12\MO\14

In [30]:
spellchecker = Tie()
spellchecker_reverse = Tie(reverse=True)

cities_not_found_list.discard('ST PETERBURG')
cities_not_found_list.discard('LENINGRADSKAJA OBL')

mask_spb_obl = np.logical_or(
    df_pochta["REGION"] == 'ST PETERBURG',
    df_pochta["REGION"] == 'LENINGRADSKAJA OBL'
)

for item in df_pochta.loc[mask_spb_obl, "CITY"].unique():
    spellchecker.add(item)
    spellchecker_reverse.add(item)
    
for item in df_pochta.loc[mask_spb_obl, "CITY_1"].unique():
    spellchecker.add(item)
    spellchecker_reverse.add(item)

mask_spb_obl = df["city_"] == 'LENINGRADSKAJA OBL'
mask_curr = np.logical_and(mask_spb_obl, mask_russian)

modify_cities_address_by_mask(mask_curr)

'I STR\LENINGRADSKAY\353740    RUSRUS'	'LENINGRADSKAJA OBL'	->	'LENINGRADSKAJA'
'IZII STR BLD A\LENINGRADSKAY\353740    RUSRUS'	'LENINGRADSKAJA OBL'	->	'LENINGRADSKAJA'
'VOSARATOVKA-CENTR\LENINGRADSKAY\188680    RUSRUS'	'LENINGRADSKAJA OBL'	->	'KOLTUSHI'
'STR\LENINGRADSKAY\353740    RUSRUS'	'LENINGRADSKAJA OBL'	->	'LENINGRADSKAJA'
'BLD A\LENINGRADSKAY\353740    RUSRUS'	'LENINGRADSKAJA OBL'	->	'LENINGRADSKAJA'
'STR BLD ZH\LENINGRADSKAY\353740    RUSRUS'	'LENINGRADSKAJA OBL'	->	'LENINGRADSKAJA'
'NINGRADSKAY OBL. VOLHONSKOE SHOSSE 4\LENINGR. OBL.\0573         RUS'	'LENINGRADSKAJA OBL'	->	''
'A STR\LENINGRADSKAY\353740    RUSRUS'	'LENINGRADSKAJA OBL'	->	'LENINGRADSKAJA'
'STR BLD A-5\LENINGRADSKAY\353740    RUSRUS'	'LENINGRADSKAJA OBL'	->	'LENINGRADSKAJA'
'VIZII, 115A\LENINGRADSKAY\350000    RUSRUS'	'LENINGRADSKAJA OBL'	->	'KRASNODAR'
'127, CHERNETSKOELENINGRAD REG187439    RUSRUS'	'LENINGRADSKAJA OBL'	->	'KOLCHANOVO'
'BUGRYLENINGRADSKAY188660    RUSRUS'	'LENINGRADSKAJA OBL'	->	'BUGRY'
'UL 

In [31]:
map_term_to_city = {
    term : group["city_"].value_counts().idxmax()
    for term, group in df[mask_russian].groupby("terminal_id")
}

df.loc[mask_russian, "city_"] = df.loc[mask_russian, "terminal_id"].map(
    lambda x: map_term_to_city[x])

In [32]:
df.iloc[0:5]

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon,city_
0,2.884034,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177,ST PETERBURG
1,2.775633,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177,ST PETERBURG
2,3.708368,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177,ST PETERBURG
3,2.787498,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177,ST PETERBURG
4,2.89251,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177,ST PETERBURG


In [33]:
df.loc[mask_russian, "city"].unique().shape[0], df.loc[mask_russian, "city_"].unique().shape[0]

(4961, 3289)

In [34]:
r = [(c, g.shape[0]) for c, g in df[mask_russian].groupby("city_")]
r = sorted(r, key=itemgetter(1), reverse=True)

len(filter(lambda x: x[1] > 50, r))

791

In [35]:
df["city"] = df["city_"]
df.drop(labels="city_", inplace=True, axis=1)
df.head()

Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,is_train,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
0,2.884034,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177
1,2.775633,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177
2,3.708368,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177
3,2.787498,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177
4,2.89251,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,True,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177


In [36]:
df_1 = df[df["is_train"] == True]
df_1.drop(labels="is_train", inplace=True, axis=1)
df_1.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
0,2.884034,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,11606fde0c814ce78e0d726e39a0a5ee,2017-07-15,59.847,30.177
1,2.775633,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,e9647a5e1eacfb06713b6af755ccc595,2017-10-27,59.847,30.177
2,3.708368,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,"PR.MARSHALA ZHUKOVA,31St Petersburg190000 7...",59.8582,30.229023,df06c1fcd3718a514535ae822785f716,2017-10-03,59.847,30.177
3,2.787498,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,6c5e5793ebc984fb72875feffff62854,2017-09-09,59.847,30.177
4,2.89251,,,,ST PETERBURG,RUS,643.0,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,,59.844074,30.179153,0576445d74e374c92c0902e612fca356,2017-07-06,59.847,30.177


In [37]:
df_2 = df[df["is_train"] == False]
df_2.drop(labels="is_train", inplace=True, axis=1)
df_2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,amount,atm_address,atm_address_lat,atm_address_lon,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,pos_address,pos_address_lat,pos_address_lon,terminal_id,transaction_date,work_add_lat,work_add_lon
1224734,2.211818,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,,,5411,2-1 TOLMACHEVA STRIVANTEEVKA141280 RUSRUS,55.967487,37.913681,ff0476dae4b098a7b16aabe93d4268df,2017-08-24,,
1224735,1.331379,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,,,4111,"SOVETSKIJ, 32IVANTEEVKA141282 RUSRUS",55.971294,37.905186,7cfd9a60282459d4692ecc85b856072e,2017-08-12,,
1224736,2.608004,,,,PUSHKINO,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,,,5411,"105,KRASOARMEYSKOE SHPUSHKINO141206 RUSRUS",56.01659,37.9091,7e5a532f0029861d8a9c4f0479b9450b,2017-06-17,,
1224737,1.916752,,,,IVANTEEVKA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,,,5411,"G. IVANTEEVKA, UL.TOLMACHEVA, D.6IVANTEEVKA141...",55.964508,37.937912,2afe7d1bc61b86c449f413bdf2119032,2017-08-12,,
1224738,1.981067,,,,MOSKVA,RUS,643.0,00fd410f5c580c8351cafa88d82b60f3,,,5814,5 KOMSOMOLSKAYA SQMOSCOW101000 RUSRUS,55.776802,37.657352,ab4f00601ff1d949afc59ee3f804c79c,2017-04-26,,


In [38]:
df_1.to_csv('data/sets/train_set_mod_cities.csv', sep=',', encoding='utf-8', index=False)
df_2.to_csv('data/sets/test_set_mod_cities.csv', sep=',', encoding='utf-8', index=False)

In [39]:
for c, g in r:
    print "'{}'\t{}".format(c, g)

'MOSKVA'	906709
'ST PETERBURG'	494609
'NOVOSIBIRSK'	49620
'EKATERINBURG'	48937
'NIZHNIJ NOVGOROD'	47678
'CHEREPOVETS'	45311
'KRASNOJARSK'	35709
'KRASNODAR'	33934
'SAMARA'	24361
'KAZAN'	22377
'KHIMKI'	20394
'SOCHI'	18984
'PODOLSK'	18618
'NOVOROSSIJSK'	17813
'UFA'	15068
'VORONEZH'	15059
'OREL'	14778
'SYKTYVKAR'	14695
'CHELJABINSK'	14635
'ROSTOV NA DONU'	14432
'MYTISHCHI'	14385
'OMSK'	14227
'JAROSLAVL'	13659
'PETROZAVODSK'	13550
'KALUGA'	10911
'VOLGOGRAD'	10820
'BALASHIKHA'	10344
'ODINTSOVO'	10304
'ANAPA'	10218
'PERM'	10094
'KRASNOGORSK'	9676
'TULA'	8823
'KIROV'	8795
'IRKUTSK'	8769
'DOMODEDOVO'	8589
'SARATOV'	8471
'KALININGRAD'	8324
'LJUBERTSY'	8128
'TJUMEN'	8113
'LIPETSK'	8017
'KEMEROVO'	7903
'SURGUT'	7688
'BRJANSK'	7635
'KOTELNIKI'	6739
'KOROLEV'	6667
'SARANSK'	6605
'VSEVOLOZHSK'	5858
'KURSK'	5596
'PUSHKINO'	5467
'SMOLENSK'	5392
'TOLJATTI'	5287
'BELGOROD'	5067
'REUTOV'	4770
'ZHELEZNODOROZHNYJ'	4766
'TOMSK'	4757
'ZHELEZNOGORSK'	4691
'LOBNJA'	4377
'KOLPINO'	3982
'KOLOMNA'	3744
'RAMENSKOE'