In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt, matplotlib.image as mpimg
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn import ensemble
from xgboost.sklearn import XGBClassifier
from sklearn.utils import shuffle
from datetime import date
import math
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from datetime import date

In [36]:
def check_if_atm(row):
    """Return 1 when the row has an ATM latitude, 0 when that value is NaN.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing a
            numeric ``'atm_address_lat'`` entry.

    Returns:
        int: ``0`` if ``row['atm_address_lat']`` is NaN, otherwise ``1``.
    """
    atm_lat_missing = math.isnan(row['atm_address_lat'])
    return int(not atm_lat_missing)

def generate_features(data):
    """Add the model's feature columns to a transactions frame and return it.

    The frame is modified in place (columns are added or overwritten) and the
    same object is returned, so both ``generate_features(df)`` and
    ``df = generate_features(df)`` work.

    Columns read: ``transaction_date``, ``currency``, ``city``, ``country``,
    ``atm_address_lat``, ``atm_address_lon``, ``pos_adress_lat``,
    ``pos_adress_lon``, ``customer_id``.

    Columns written:
        transaction_date: parsed with ``pd.to_datetime``.
        currency: missing values replaced by -1, cast to int32.
        city, country: integer codes from ``factorize()`` (city is
            lower-cased first); codes depend on the order values appear in
            *this* frame, so they are not comparable across frames.
        c_country: 1 where the country code is 0, else 0.
        c_curr: 1 where currency equals 643, else 0.
        weekday: ``dt.weekday`` of the transaction date (Monday is 0).
        is_holiday: 1.0 where weekday > 4, else 0.0.
        is_atm: 1 where ``atm_address_lat`` is present, else 0.
        address_lat, address_lon: ATM coordinates, falling back to the POS
            coordinates where the ATM ones are missing.
        address: integer code of the coordinates rounded to two decimals.
        count_of_transactions: per customer, number of rows with a
            non-missing ``address_lat``.
        transactions: the same count per (customer, address) pair.
        pers: ``transactions / count_of_transactions``.

    Args:
        data: pandas DataFrame holding the columns listed above.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    data['transaction_date'] = pd.to_datetime(data['transaction_date'])
    data['currency'] = data['currency'].fillna(-1).astype(np.int32)

    # str() turns missing cities into a literal string, so they get their own
    # code from factorize() instead of the -1 sentinel.
    data['city'] = data['city'].apply(lambda x: str(x).lower())
    data['city'] = data['city'].factorize()[0].astype(np.int32)
    data['country'] = data['country'].factorize()[0].astype(np.int32)

    # factorize() numbers values by first appearance, so code 0 is whichever
    # country occurs first in this frame.
    # NOTE(review): presumably meant as a "home country" flag -- confirm the
    # first row is representative, or compare against the raw country value.
    data['c_country'] = (data['country'] == 0).astype(np.int64)
    # NOTE(review): 643 is presumably the ISO 4217 numeric code for RUB.
    data['c_curr'] = (data['currency'] == 643).astype(np.int64)

    data['weekday'] = data['transaction_date'].dt.weekday.astype(np.int32)
    # Weekday codes 5 and 6 are Saturday and Sunday.
    data['is_holiday'] = (data['weekday'] > 4).astype(float)

    # Vectorised replacement for a row-wise apply(): a present ATM latitude
    # marks the row as an ATM transaction.
    data['is_atm'] = data['atm_address_lat'].notna().astype(np.int64)

    data['address_lat'] = data['atm_address_lat'].fillna(data['pos_adress_lat'])
    data['address_lon'] = data['atm_address_lon'].fillna(data['pos_adress_lon'])

    # Bucket coordinates by rounding to two decimals, then encode each
    # distinct "lat;lon" bucket as an integer.
    def two_decimals(value):
        return "%.02f" % value

    data['address'] = (
        data['address_lat'].apply(two_decimals)
        + ';'
        + data['address_lon'].apply(two_decimals)
    )
    data['address'] = data['address'].factorize()[0].astype(np.int32)

    # "count" ignores rows whose address_lat is missing.
    data['count_of_transactions'] = (
        data.groupby(['customer_id'])['address_lat'].transform('count')
    )
    data['transactions'] = (
        data.groupby(['customer_id', 'address'])['address_lat'].transform('count')
    )

    # Share of the customer's located transactions made at this address.
    data['pers'] = data['transactions'] / data['count_of_transactions']

    return data

def generate_target(data, radius=0.02):
    """Label each transaction as near the customer's home and/or work.

    For each of "home" and "work", the straight-line distance between the
    known ``<place>_add_lat`` / ``<place>_add_lon`` and the transaction's
    ``address_lat`` / ``address_lon`` is computed directly on the coordinate
    differences (no geodesic correction). Rows within ``radius`` get
    ``is_<place> = 1``, all others 0. Rows with a missing coordinate compare
    as False and therefore get 0.

    The frame is modified in place and also returned, so
    ``data = generate_target(data)`` keeps a usable frame.

    Args:
        data: pandas DataFrame with ``home_add_lat``, ``home_add_lon``,
            ``work_add_lat``, ``work_add_lon``, ``address_lat`` and
            ``address_lon`` columns (the last two are produced by
            ``generate_features``).
        radius: Maximum distance, in the same units as the coordinate
            columns, for a transaction to count as "at" the place.

    Returns:
        The same DataFrame object, with int32 ``is_home`` and ``is_work``
        columns added.
    """
    for place in ('home', 'work'):
        d_lat = data[place + '_add_lat'] - data['address_lat']
        d_lon = data[place + '_add_lon'] - data['address_lon']
        distance = np.sqrt((d_lat ** 2) + (d_lon ** 2))
        data['is_' + place] = (distance <= radius).astype(np.int32)

    return data


In [14]:
# Read the raw tables from disk: the test table first, then the training table.
test_data, data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/test_set.csv', 'data/train_set.csv')
)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# --- Training frame ----------------------------------------------------------
# Rows without a terminal id or a transaction date are dropped before
# featurisation. .copy() hands generate_features a frame it owns, so its
# in-place column writes cannot hit a view of the raw table.
data = data.dropna(subset=['terminal_id', 'transaction_date']).copy()
data = generate_features(data)

# Transactions with MCC 5541 are excluded from training.
# NOTE(review): the reason for excluding 5541 is not recorded -- document it.
data = data[data['mcc'] != 5541].copy()

# generate_target adds the is_home / is_work label columns to `data` in place;
# the return value is deliberately not rebound, so `data` stays a DataFrame
# even if the helper returns nothing.
generate_target(data)

# --- Test frame --------------------------------------------------------------
# Strip commas from the test MCC values and store the result as integers
# (previously the cleaned values were computed and then thrown away).
# str() keeps this working for values that are already numeric.
# NOTE(review): the commas are presumably thousands separators such as
# "5,411" -- confirm against the raw file.
test_data['mcc'] = test_data['mcc'].map(lambda code: int(str(code).replace(',', '')))
test_data = generate_features(test_data)

In [5]:
# Feature columns fed to both classifiers (and reused for the test frame).
# 'c_mcc' was removed from this list: no cell in this notebook creates that
# column, so selecting it raises KeyError.
# NOTE(review): if a 'c_mcc' feature is wanted, add it to generate_features
# and restore it here.
columns = ['amount', 'currency', 'city', 'country', 'mcc', 'is_atm', 'weekday',
           'pers', 'address_lat', 'address_lon', 'address']

# NOTE(review): no visible cell defines `train`; it is bound here to the
# fully prepared training frame so the notebook runs top to bottom. Replace
# with a shuffle / hold-out split if that is what was intended.
train = data

# One binary classifier per target. n_jobs / n_estimators are the
# scikit-learn-API parameter names: `jobs` is not a recognised parameter and
# `num_iterations` is only an alias passed through to the booster.
model_home = LGBMClassifier(n_jobs=-1)
model_work = LGBMClassifier(learning_rate=0.05, n_estimators=560, n_jobs=-1)
model_home.fit(train[columns], train['is_home'])
model_work.fit(train[columns], train['is_work']);

In [None]:
# Classify every test transaction as near-work / near-home.
test_data['is_work'] = model_work.predict(test_data[columns])
test_data['is_home'] = model_home.predict(test_data[columns])


def pick_location(customer_rows, flag):
    """Choose one (lat, lon) pair for a customer from their transactions.

    Preference order:
      1. rows where ``flag`` is 1;
      2. otherwise rows predicted as neither work nor home;
      3. otherwise ``(0.0, 0.0)``.
    Within the chosen rows, the one with the largest ``transactions`` value
    (per-customer visit count of that address) wins.

    Args:
        customer_rows: DataFrame of one customer's test transactions with
            ``is_work``, ``is_home``, ``transactions``, ``address_lat`` and
            ``address_lon`` columns; may be empty.
        flag: Name of the prediction column to prefer (``'is_work'`` or
            ``'is_home'``).

    Returns:
        tuple: ``(lat, lon)`` of the selected row, or ``(0.0, 0.0)`` when no
        row qualifies.
    """
    candidates = customer_rows[customer_rows[flag] == 1]
    if candidates.empty:
        unassigned = (customer_rows['is_work'] == 0) & (customer_rows['is_home'] == 0)
        candidates = customer_rows[unassigned]
    if candidates.empty:
        return 0.0, 0.0
    # NOTE(review): the original ranked by a 'count' column that no cell
    # creates; 'transactions' (visits to this address by this customer) is
    # assumed to be the intended ranking key -- confirm.
    best = candidates.loc[candidates['transactions'].idxmax()]
    return best['address_lat'], best['address_lon']


sub_data = pd.read_csv('sample.csv')

# Split the test frame by customer once instead of re-scanning it for every
# submission row.
rows_by_customer = {cid: rows for cid, rows in test_data.groupby('customer_id')}
no_rows = test_data.iloc[0:0]  # stand-in for customers with no test transactions

for place in ('work', 'home'):
    coords = [
        pick_location(rows_by_customer.get(cid, no_rows), 'is_' + place)
        for cid in sub_data['customer_id']
    ]
    # Whole-column assignment keeps the columns float and avoids the
    # per-cell setter (DataFrame.set_value no longer exists in pandas >= 1.0).
    sub_data[place + '_add_lat'] = [float(lat) for lat, _ in coords]
    sub_data[place + '_add_lon'] = [float(lon) for _, lon in coords]

# to_csv returns None when given a path, so its result is not kept.
# NOTE(review): the row index is still written as an unnamed first column,
# as before -- pass index=False if the submission format forbids it.
sub_data.to_csv('prediction.csv')