# The script cleans up and normalizes the addresses and contact information

## 1. Load and clean up data

In [12]:
import pandas as pd
import numpy as np
import re
import funcs as normex
import random
import imp

imp.reload(normex)

pd.set_option('display.precision', 15)


def some(x, n):
    return x.ix[random.sample(x.index.tolist(), n)]


addr_lookup = pd.read_excel('sample.xlsx', sheetname='cat', header=1, encoding='utf-8')
addr_lookup = addr_lookup.reset_index(drop=True).dropna(axis=1, how='all').dropna(axis=0, how='all')
addr_lookup['Улица'] = addr_lookup.Улица.dropna()


def normalize_regex(df, rus_addr_list):
    rus_addr = pd.read_csv(rus_addr_list, header=None, names=['Адрес'], index_col=0, encoding='Windows-1251')
    df['rus_addr'] = rus_addr.Адрес
    #     con = pd.concat([df, rus_addr], axis=1)
    df.rus_addr.replace('\n', ' ', inplace=True)
    df.rus_addr.replace(to_replace='(П|п)роспект', value='просп.', regex=True, inplace=True)
    df.rus_addr.replace(to_replace='(П|п)(Р|р)\.', value='просп. ', regex=True, inplace=True)    
    df.rus_addr.replace(to_replace='(П|п)-т', value='просп. ', regex=True, inplace=True)
    df.rus_addr.replace(to_replace='^(В|в)ул(\.|\s)', value='ул. ', regex=True, inplace=True)
    df.rus_addr.replace(to_replace='^(В|в)(\.|\s)', value='ул. ', regex=True, inplace=True)
    df.rus_addr.replace(to_replace='^(Н|н)(\.|\s)?(аб)?.*Перемог.', value='ул. Набережная Победы', regex=True, inplace=True)

    
def split_address(df):
    df['street'] = df.rus_addr.str.lower()
    df['street'], df['addr'] = zip(*df.street.map(normex.address_splitter))

    
def preprocess(file, sheetname=0, is_test_mode=True, header=None, swap_addr_and_tel=False):
    df = pd.read_excel(file, sheetname=sheetname, header=header, encoding='utf-8').reset_index(drop=True)
#     df = pd.read_csv(file, header=header).reset_index(drop=True)
    
    df.Адрес.to_csv('streets_mixed.csv', index=True, encoding='utf-8')
    
    df.dropna(axis=0, how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    # df.drop(df.columns[4:], axis=1, inplace=True)
    # df.dropna(axis=1, how='all', inplace=True)

    
    #swap address and tel if misplaced
    if swap_addr_and_tel:
        swap_idx = df.Адрес.str.contains(r'^[\d\s-]+$', na=True) #index for rows to swap address and tel 
        df.loc[swap_idx, ['Адрес', 'Телефон 1']] = df[swap_idx][['Телефон 1','Адрес']].values
        df.Адрес.fillna(value='<no address>', inplace=True)
        df[df.Адрес.str.contains('<no address>')]
   
    return df


# df.Адрес[~df.Адрес.isin(ukr_addr.Адрес)]
# len(ukr_addr), len(df), len(rus_addr)
    
# raw_data = preprocess('sample.xlsx', 'DB', rus_addr_list='streets_rus_for_data_preprocessed.txt', is_test_mode=True, header=2)

raw_data = preprocess('total db_20151010.xlsx', is_test_mode=False, header=0)
#!!!!! remove
# raw_data = some(raw_data, 2000)
normalize_regex(raw_data, rus_addr_list='data/total_streets_prod.txt')
split_address(raw_data)

In [3]:
raw_data.tail()

Unnamed: 0,Источник,ФИО,Адрес,тел. 1,тел. 2,rus_addr,street,addr
672683,ИЗБ,,"49131, Україна, Дніпропетровська обл., м.Дніпр...",,,"пров.Пушкина, 27","пров.пушкина,",27
672684,ИЗБ,,"49131, Україна, Дніпропетровська обл., м.Дніпр...",,,"пров.Пушкина, 3","пров.пушкина,",3
672685,ИЗБ,,"49131, Україна, Дніпропетровська обл., м.Дніпр...",,,"пров.Пушкина, 5","пров.пушкина,",5
672686,ИЗБ,,"49131, Україна, Дніпропетровська обл., м.Дніпр...",,,"пров.Пушкина, 7","пров.пушкина,",7
672687,ИЗБ,,"49131, Україна, Дніпропетровська обл., м.Дніпр...",,,"пров.Пушкина, 9","пров.пушкина,",9


In [11]:
import pandas as pd


addr = pd.read_csv('data/mixed_streets_production.txt', header=None, names=['Адрес'], index_col=0)
addr.Адрес.replace(to_replace='\d*\D*м.Днепропетровск,', value='', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Наб\.', value='Набережная ', regex=True, inplace=True)
addr.Адрес.replace(to_replace='НаБерезоваяя', value='Набережная ', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Б-Р ', value='бульв. ', regex=True, inplace=True)
addr.Адрес.replace(to_replace='З.КосмодемъЯнськое', value='Зои Космодемьянской', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Пп]роспект', value='просп.', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Зз]атишна', value='Уютная', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Пп]охил', value='Наклонн', regex=True, inplace=True)
addr.Адрес.replace(to_replace='рокив', value='лет', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Днипрельстанивская', value='Днепрельстановская', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Водиев', value='Водителей', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Зовнишня', value='Внешняя', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Криворизька', value='Криворожская', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Металургийна', value='Металургическая', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Повстання', value='Востания', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Тиниста', value='Тенистая', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Пп]івнічна', value='Северная', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Пп]ивнична', value='Северная', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Рр]адянская', value='Советская', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Рр]адянська', value='Советская', regex=True, inplace=True)
addr.Адрес.replace(to_replace='[Дд]ивізи[ия]', value='Дивизии', regex=True, inplace=True)
addr.Адрес.replace(to_replace='Стрілецька', value='Стрелецкая', regex=True, inplace=True)
 

addr.to_csv('data/total_streets_prod.txt')

## 3. Load address book, generate n-grams

In [4]:
import time

start = time.clock()

addr_book = pd.read_excel('address_book.xlsx', header=1, encoding='utf-8')
addr_book['street_lower'] = addr_book.Улица.str.lower()


print('time: ' + str(time.clock() - start))

time: 29.669086932871714


In [5]:
import ngram
from collections import defaultdict

def default():
    return '<not found>'

def street_ngrams(addr_row):        
    addr_list = addr_row.Дом.map(str) +  '$' + addr_row.Квартира.map(str)    
    build_and_apt = [str(x) for x in addr_list.str.lower().tolist()]
    return ngram.NGram(build_and_apt, N=3)


# returns maping from lower case street names to numbers ngrams
def street_to_numbers_ngrams(addr_book):
    str_to_addr_ngram = defaultdict(default)
    
    unique_streets = addr_book.street_lower.unique()
    for st in unique_streets:        
        addr_row = addr_book[addr_book.street_lower == st]
        st_ngr = street_ngrams(addr_row)
        str_to_addr_ngram[st] = st_ngr    
    return str_to_addr_ngram


## street to number ngrams
start = time.clock()
str_ngram_dict = street_to_numbers_ngrams(addr_book)
print('street number ngrams time: ' + str(time.clock() - start))



street number ngrams time: 81.41941792163007


In [6]:
## street name ngrams
start = time.clock()
street_choices = [str(x) for x in addr_book.street_lower.unique()]
streets_G2 = ngram.NGram(street_choices, N=2)
print('street name ngrams time: ' + str(time.clock() - start))


street name ngrams time: 0.08000819346847265


In [13]:
def correct_street(x, ngr):
    if 'error' in str(x):
        return ('<error>', 0)
    return ngr.search(x)[0]


def match_streets(x, streets_G2):
    start = time.clock()
    x['street'], x['str_score'] = zip(*x.street.apply(correct_street, args=(streets_G2,)))
    print('match streets time: ' + str(time.clock() - start))
    

def correct_number(addr_row, ngr_dict):
    # 6 - 'street' 
    # 7 - 'addr' 
    street, addr = addr_row[6], addr_row[7]
    ngr = ngr_dict[street]
    
    if pd.isnull(addr) or 'error' in addr:
        return ('<error>', '<error>', 0)
    
    result = ngr.search(addr)
    
    if not result:
        return ('<error>', '<error>', 0)
    
    number, score = result[0]
    build, apt = number.split('$') 
    return (build, apt, score)


def match_numbers(x, str_ngram_dict):    
    start = time.clock()
    x['build'], x['apt'], x['num_score'] = zip(*df.apply(correct_number, args=(str_ngram_dict,), axis=1, raw=True))
    print('match numbers time: ' + str(time.clock() - start))
    
    
def correct_tel(x):    
    start = time.clock()
    x['тел. 1'], x['тел. 2'] = x['тел. 1'].astype(object), x['тел. 2'].astype(object)
    phones = x.apply(lambda x: normex.parse_tel(x['тел. 1'], x['тел. 2']), axis=1)
    x['tel_1'], x['tel_2'], x['tel_3'] = zip(*phones)
    print('correct tel time: ' + str(time.clock() - start))
    

groups = raw_data.groupby(by='Источник', sort=False, group_keys=True)
lower_to_upper_str = addr_book.groupby(by=['street_lower', 'Улица'])

group_files = list()
for x in groups.groups:
    g = groups.get_group(x)
    f_name = 'data/%s.csv' % g.Источник.unique()[0]
    g.to_csv(f_name)
    group_files.append(f_name)
    
print('Saved groups in separate files')
print('============================================')


out_files = list()    
for f in group_files:
    if 'Укропчик' in f:
        continue
    df = pd.read_csv(f, encoding='Windows-1251', header=0, index_col=0)
    print('Loaded file ' + f)
    match_streets(df, streets_G2)
    match_numbers(df, str_ngram_dict)   
    df.street.replace(dict(lower_to_upper_str.groups.keys()), inplace=True)
    correct_tel(df)
    print('Finished processing: ' + f)
    f_name = 'out/' + f
    df.to_csv(f_name)
    out_files.append(f_name)
    print('Saved to: ' + f_name)
    print('============================================')

ukropchik = pd.read_csv('data/Укропчик.csv', encoding='Windows-1251', header=0, index_col=0)    
correct_tel(ukropchik)
ukropchik_out_file = 'out/data/Укропчик.csv'
ukropchik.to_csv(ukropchik_out_file)
out_files.append(ukropchik_out_file)

out_dfs = list()
for f in out_files:
    partial = pd.read_csv(f, encoding='Windows-1251', header=0, index_col=0)
    out_dfs.append(partial)
    
out = pd.concat(out_dfs)

index = ['Источник', 'ФИО', 'Адрес', 'тел. 1', 'тел. 2',
         'rus_addr', 'addr', 'street', 'build', 'apt', 'tel_1', 'tel_2', 'tel_3',
         'num_score', 'str_score']
out = out.reindex_axis(index, axis=1)
out.to_csv('out/data/final.csv')    


# df_arr = np.array_split(df, 7)
# for i, x in enumerate(df_arr):
#     fname = 'data/mixed_streets_production%i.txt' % i
#     x.to_csv(fname, encoding='utf-8')

Saved groups in separate files
Loaded file data/oldmen АНД.csv
match streets time: 652.0116068435132
match numbers time: 120.14668404001304
correct tel time: 5.007285173756372
Finished processing: data/oldmen АНД.csv
Saved to: out/data/oldmen АНД.csv
Loaded file data/oldmen Кировский.csv
match streets time: 257.767996551559
match numbers time: 140.4346801810425
correct tel time: 3.9461270894198606
Finished processing: data/oldmen Кировский.csv
Saved to: out/data/oldmen Кировский.csv
Loaded file data/Инвалиды.csv
match streets time: 2.9716367436867586
match numbers time: 2.1560932721185964
correct tel time: 0.11990944421177119
Finished processing: data/Инвалиды.csv
Saved to: out/data/Инвалиды.csv
Loaded file data/oldmen Ленинский.csv
match streets time: 689.234300698583
match numbers time: 107.03327121235907
correct tel time: 7.078668415975699
Finished processing: data/oldmen Ленинский.csv
Saved to: out/data/oldmen Ленинский.csv
Loaded file data/oldmen Бабушкинский.csv
match streets tim

TypeError: Can't convert 'float' object to str implicitly

## Collect outputs to single file

In [None]:
import os
import pandas as pd

out_dir = 'out/data/'
out_dfs = list()
for f in os.listdir(out_dir):
    partial = pd.read_csv(out_dir + f, encoding='Windows-1251', header=0, index_col=0)
    out_dfs.append(partial)
    
out = pd.concat(out_dfs)

index = ['Источник', 'ФИО', 'Адрес', 'тел. 1', 'тел. 2',
         'rus_addr', 'addr', 'street', 'build', 'apt', 'tel_1', 'tel_2', 'tel_3',
         'num_score', 'str_score']
out = out.reindex_axis(index, axis=1)
# out.to_csv('out/data/final_test.csv')



## Translate street names (don't abuse, might be Google-banned )

In [None]:
import goslate
from urllib import request


# proxy_handler = request.ProxyHandler({"http" : "http://176.31.119.64:8080"})
# proxy_opener = request.build_opener(request.HTTPHandler(proxy_handler), request.HTTPSHandler(proxy_handler))
# gs = goslate.Goslate(opener=proxy_opener)

with open('mixed_streets_production.txt', 'r', encoding='utf-8') as f:
     novel_text = f.read()
# gs = goslate.Goslate()
# streets = [str(x) for x in raw_data.Адрес.tolist()]
gs = goslate.Goslate()
translation = gs.translate(novel_text, 'ru', source_language='uk')
translation = list(streets_rus)

In [None]:
from pyspark import  SparkContext, SQLContext
sc = SparkContext( 'local', 'pyspark')

## Lookup and match street names

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import time
import ngram

street_choices = [str(x) for x in addr_lookup.Улица.tolist()]
G2 = ngram.NGram(street_choices, N=2)

def match_scores(st):
    return [(fuzz.token_set_ratio(st, normalized_st), normalized_st, st) for normalized_st in street_choices]

def reducer(score1, score2):  
    print(score1, score2)    
    return max(score1, score2, key=lambda x: x[0])
#     scores = [(fuzz.token_set_ratio(raw, normal), normal) for normal in street_choices]
#     return max(scores, key=lambda x: x[0])[1]


start = time.clock()

# dirty_streets = sc.parallelize(raw_data.street.head(10).str.encode(encoding='utf-8').tolist())
# r = dirty_streets.flatMap(match_scores).reduceByKey(reducer).collect()

print('time: ' + str(time.clock() - start))


# streets_dictionary = prepare_match_list(street_choices)




start = time.clock()
matched_streets = list()
# for raw in raw_data.street.tolist():    
#     scores = [(fuzz.token_set_ratio(raw, normal), normal) for normal in G.search(raw, threshold=0.3)]
#     if (scores):
#         matched_streets.append(max(scores, key=lambda x: x[0])[1])
#     match = process.extractOne(raw, street_choices)
#     matched_streets.append(match[0])
    
# print(matched_streets)
print('time ngram on test data: ' + str(time.clock() - start))


# pd.isnull(addr_lookup['Unnamed: 1'])
# addr_lookup['Unnamed: 1'].iloc[1]

# df = raw_data.dropna(how='all')
# # df = raw_data.drop(raw_data.index[1])
# df = df[df.Адрес.str.contains(u'Правд')==True]
# df



choices = ['просп. Воронцова', 'просп. Гагарина', 'просп. Газеты "Правда"', 'просп. Героев', 'просп. Ильича', 
           'просп. Карла Маркса', 'просп. Кирова', 'просп. Металлургов', 'просп. Мира', 'просп. Олимпийский',
           'просп. Петровского', 'просп. Пушкина', 'просп. Свободы', 'просп. Сергея Нигояна', 'просп. Труда',
           'ул. Правды', 'ул. Юдина', 'ул. Набережная',  'ул. Набережная В.И.Ленина', 'пер. Пролетарской Победы', 
           'ул. Набережная Заводская', 'ул. Набережная им. Ленина', 'ул. Набережная Победы', 
           'ул. Мира', 'ул. Железнодорожная', 'ул. Дорожная', 'пер. Пролетарской Победы']    

# process.extract(u'ул. Железнодорожна', choices)
# fuzz.partial_token_sort_ratio(u'Наб. Победы', 'ул. Набережная Победы')
# fuzz.partial_ratio(u'Наб. Победы', 'ул. Набережная Победы')


# sorted([(fuzz.QRatio('ул.Малиновського,', normal), normal) for normal in street_choices], reverse=True)

start = time.clock()
[fuzz.token_set_ratio('ул.Малиновського,', normal) for normal in street_choices]
print('time: ' + str(time.clock() - start))

start = time.clock()
[G2.find(x) for x in raw_data.street.tolist()]
print('time bi-gram: ' + str(time.clock() - start))

In [None]:
G2.search('Наб. Перемоги')[0]

df = raw_data.head(10).copy()
df['street'], df['score'] = zip(*[G2.search(x)[0] for x in df.street.tolist()])
# [(x, G2.search(x)[0]) for x in df.street.tolist()]
# list(zip(*[G2.search(x)[0]  for x in df.street.tolist()]))
df

## String matching heuristics

In [None]:
import random

def some(x, n):
    return x.ix[random.sample(x.index, n)]

db = pd.read_excel('total db_20151010.xlsx', header=0, encoding='utf-8')

In [None]:
import random


def some(x, n):
    return x.ix[random.sample(x.index.tolist(), n)]