In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

seed = 1234



In [2]:
train_df = pd.read_csv('../input/' + 'train_BrandenMurray.csv')
test_df = pd.read_csv('../input/' + 'test_BrandenMurray.csv')

In [3]:
train_df.head()

Unnamed: 0,listing_id,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,...,manager_id_mean_med,manager_id_mean_high,desc_wordcount,pricePerBed,pricePerBath,pricePerRoom,bedPerBath,bedBathDiff,bedBathSum,bedsPerc
0,7170325,1.0,1,3387,281,40.7108,-73.9539,2104,2400.0,2947,...,0.403788,0.0594723,553,2400.0,2400.0,1200.0,2400.0,0.0,2.0,0.5
1,7092344,1.0,2,4758,3434,40.7513,-73.9722,1964,3800.0,8215,...,0.650389,0.0500421,827,1900.0,3800.0,1266.666667,3800.0,1.0,3.0,0.666667
2,7158677,1.0,2,5289,3457,40.7575,-73.9625,2627,3495.0,15314,...,0.194666,0.04161561,799,1747.5,3495.0,1165.0,3495.0,1.0,3.0,0.666667
3,7211212,1.5,3,2104,4018,40.7145,-73.9425,1204,3000.0,21701,...,0.248812,6.214142e-31,588,1000.0,2000.0,666.666667,2000.0,1.5,4.5,0.666667
4,7225292,1.0,0,4930,3411,40.7439,-73.9743,610,2795.0,13511,...,0.140153,2.440795e-18,344,-1.0,2795.0,2795.0,2795.0,-1.0,1.0,0.0


In [4]:
train_y = np.ravel(pd.read_csv('../input/' + 'labels_BrandenMurray.csv'))

In [91]:
train_y_df = pd.DataFrame(train_y, columns = ['interest_level'])
features_to_use = ['latitude','longitude','listing_id']
# features_to_use.append('manager_id')
tmp_train = pd.concat([train_df[features_to_use],train_y_df], axis = 1)
tmp_test = test_df[features_to_use]

In [92]:
tmp_test.head()

Unnamed: 0,latitude,longitude,listing_id
0,40.7185,-73.9865,7142618
1,40.7278,-74.0,7210040
2,40.726,-74.0026,7174566
3,40.7321,-74.0028,7191391
4,40.7054,-74.0095,7171695


In [93]:
tmp_train.head()

Unnamed: 0,latitude,longitude,listing_id,interest_level
0,40.7108,-73.9539,7170325,1
1,40.7513,-73.9722,7092344,0
2,40.7575,-73.9625,7158677,1
3,40.7145,-73.9425,7211212,1
4,40.7439,-73.9743,7225292,0


In [111]:
X_train, X_val, y_train, y_val = train_test_split(tmp_train, train_y, train_size=.80, random_state=1234)

In [112]:
X_train.head()

Unnamed: 0,latitude,longitude,listing_id,interest_level
40116,40.7373,-74.0042,6848609,0
774,40.77,-73.9491,7090131,0
1599,40.8573,-73.9353,7088808,2
44468,40.7222,-74.0119,6862626,0
42023,40.7656,-73.9886,6852462,1


In [113]:
X_val.head()

Unnamed: 0,latitude,longitude,listing_id,interest_level
25162,40.7928,-73.9659,6966955,1
14090,40.7634,-73.9596,7118331,0
38034,40.8644,-73.9201,6876692,0
5845,40.7666,-73.9874,7165769,0
7129,40.6567,-74.0043,7172375,0


In [171]:
from sklearn.metrics.pairwise import euclidean_distances

def dis_level(train, val, test):
    index_low = train[train['interest_level'] == 0].index
    index_medium = train[train['interest_level'] == 1].index
    index_high = train[train['interest_level'] == 2].index

    dis_low = ['dis_low_1','dis_low_2','dis_low_4','dis_low_8',
               'dis_low_16','dis_low_32','dis_low_64','dis_low_128']
    for i in range(len(dis_low)):
        train.loc[:,dis_low[i]] = np.nan
        val.loc[:,dis_low[i]] = np.nan
        test.loc[:,dis_low[i]] = np.nan


    dis_medium = ['dis_medium_1','dis_medium_2','dis_medium_4','dis_medium_8',
                  'dis_medium_16','dis_medium_32','dis_medium_64','dis_medium_128']
    for i in range(len(dis_medium)):
        train.loc[:,dis_medium[i]] = np.nan
        val.loc[:,dis_medium[i]] = np.nan
        test.loc[:,dis_medium[i]] = np.nan

    dis_high = ['dis_high_1','dis_high_2','dis_high_4','dis_high_8',
                'dis_high_16','dis_high_32','dis_high_64','dis_high_128']
    for i in range(len(dis_high)):
        train.loc[:,dis_high[i]] = np.nan
        val.loc[:,dis_high[i]] = np.nan
        test.loc[:,dis_high[i]] = np.nan

    euc_low = train.loc[index_low,['latitude','longitude']]*100
    euc_medium = train.loc[index_medium,['latitude','longitude']]*100
    euc_high = train.loc[index_high,['latitude','longitude']]*100

    temp_train = train.loc[:,['latitude','longitude']]*100
    temp_val = val.loc[:,['latitude','longitude']]*100
    temp_test = test.loc[:,['latitude','longitude']]*100    
    
    # dis_level for train
    kf = KFold(n_splits=200)
    for _, index in kf.split(temp_train):
        # dist to low
        tmp_dist = euclidean_distances(temp_train.iloc[index],euc_low)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_low)):
            train.iloc[index, 4 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))
    
        # dist to medium
        tmp_dist = euclidean_distances(temp_train.iloc[index],euc_medium)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_medium)):
            train.iloc[index,12 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))
            
#         # dist to high
        tmp_dist = euclidean_distances(temp_train.iloc[index],euc_high)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_high)):
            train.iloc[index,20 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))

    
    # dis_level for test
    kf = KFold(n_splits=200)
    for _, index in kf.split(temp_test):
        # dist to low
        tmp_dist = euclidean_distances(temp_test.iloc[index],euc_low)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_low)):
            test.iloc[index, 4 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))
    
        # dist to medium
        tmp_dist = euclidean_distances(temp_test.iloc[index],euc_medium)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_medium)):
            test.iloc[index,12 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))

#         # dist to high
        tmp_dist = euclidean_distances(temp_test.iloc[index],euc_high)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_high)):
            test.iloc[index,20 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))


    # dis_level for val
    kf = KFold(n_splits=200)
    for _, index in kf.split(temp_val):
        # dist to low
        tmp_dist = euclidean_distances(temp_val.iloc[index],euc_low)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_low)):
            val.iloc[index, 4 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))
    
        # dist to medium
        tmp_dist = euclidean_distances(temp_val.iloc[index],euc_medium)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_medium)):
            val.iloc[index,12 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))

#         # dist to high
        tmp_dist = euclidean_distances(temp_val.iloc[index],euc_high)
        tmp_dist = pd.DataFrame(np.sort(tmp_dist,axis = 1))
        for dis_n in range(len(dis_high)):
            val.iloc[index,20 + dis_n] = np.array(tmp_dist.loc[:,:np.power(2,dis_n)].sum(axis = 1))

    return train, val, test        

In [172]:
X_train_dis, X_val_dis, X_test_dis, tmp_dist_out, tmp_index = dis_level(X_train,X_val,tmp_test)

KeyboardInterrupt: 

In [173]:
X_train.shape

(39481, 28)

In [174]:
print X_train_dis.iloc[:,12:].describe()

       dis_medium_1  dis_medium_2  dis_medium_4  dis_medium_8  dis_medium_16  \
count  22736.000000  22736.000000  22736.000000  22736.000000   22736.000000   
mean       2.915196      5.713286     11.370000     22.861140      46.360739   
std      148.098660    295.878445    591.509245   1182.941466    2366.344534   
min        0.000000      0.000000      0.000000      0.000000       0.000000   
25%        0.000000      0.000000      0.000000      0.079995       0.593927   
50%        0.000000      0.028284      0.124073      0.412196       1.253157   
75%        0.070711      0.144222      0.344596      0.900135       2.526779   
max     8418.916360  16837.869097  33675.774572  67359.200202  134755.808005   

       dis_medium_32  dis_medium_64  dis_medium_128    dis_high_1  \
count   22736.000000   22736.000000    2.273600e+04  22736.000000   
mean       94.913310     196.740040    4.135935e+02      5.644383   
std      4734.440693    9471.773153    1.894871e+04    295.741031   
min