In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats import skew, boxcox
from datetime import datetime
from sklearn import preprocessing
# from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss
import xgbfir
from gplearn.genetic import SymbolicTransformer
from gplearn.functions import make_function
from sklearn.decomposition import PCA
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb

# Shared random seed, read by cv_train (xgb.cv) and FI (XGBClassifier).
seed = 1234

In [2]:
def cv_train(train,y):
    """Run 4-fold XGBoost cross-validation and return the final test mlogloss.

    Parameters
    ----------
    train : feature matrix accepted by ``xgb.DMatrix`` (the notebook passes a
        pandas DataFrame).
    y : class labels aligned with the rows of ``train``; the objective is
        configured for 3 classes.

    Returns
    -------
    The last entry of the ``test-mlogloss-mean`` column of the ``xgb.cv``
    result. NOTE(review): in the recorded runs this equals the best-iteration
    score reported by the early-stopping callback -- confirm for the xgboost
    version in use.
    """
    xgtrain = xgb.DMatrix(train, label=y)
    params = {
        'objective': 'multi:softprob',
        # Plain string: a trailing comma previously turned this into the
        # one-element tuple ('mlogloss',).
        'eval_metric': 'mlogloss',
        'num_class': 3,
        'silent': 0,
        'eta': 0.3,
        'max_depth': 10,
    }

    cv_result = xgb.cv(
        params, xgtrain,
        num_boost_round=10000, nfold=4,
        metrics='mlogloss',
        seed=seed,
        # verbose_eval is an argument of xgb.cv, not a booster parameter, so it
        # is passed here instead of being stored in `params`.
        verbose_eval=False,
        # Stop once the test metric has not improved for 50 rounds.
        callbacks=[xgb.callback.early_stop(50)]
    )

    return cv_result['test-mlogloss-mean'].values[-1]


def FI(train,y,FIname = '../FE/FI.xlsx'):
    """Fit an XGBoost classifier on a hold-out split and export an xgbfir report.

    Parameters
    ----------
    train : pandas DataFrame of features (its ``columns`` are used as the
        feature names in the report).
    y : class labels aligned with the rows of ``train``.
    FIname : path of the Excel file written by ``xgbfir.saveXgbFI``.

    Returns
    -------
    ``best_score`` of the fitted classifier, i.e. the early-stopping score on
    the validation part of the split (metric: mlogloss).
    """
    # 75% of the rows are used for fitting, the remainder for early stopping.
    # The split uses the shared `seed` (instead of a second hard-coded 1234) so
    # that changing the seed affects the split and the model consistently.
    X_train, X_val, y_train, y_val = train_test_split(train, y, train_size=.75, random_state=seed)

    clf = xgb.XGBClassifier(
            objective='multi:softprob',
            seed = seed, # use a fixed seed during tuning so we can reproduce the results
            learning_rate = 0.3,
            n_estimators = 100000,  # upper bound only; early stopping ends training
            max_depth= 10,
            nthread = -1,
            silent = False
        )
    clf.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        eval_metric='mlogloss',
        early_stopping_rounds=50,
        verbose=False
    )

    # Write the feature-interaction report for the fitted model.
    xgbfir.saveXgbFI(clf, feature_names=X_train.columns, OutputXlsxFile = FIname)

    return clf.best_score

# Load Data

In [3]:
# Input files are read relative to the notebook's working directory.
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')
# Labels live in their own CSV; flatten them to a 1-D array.
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
# Listing ids of the test rows, kept as int32.
sub_id = test_X.listing_id.astype('int32').values
# A single pre-formatted argument prints the same text on Python 2 and is
# also valid on Python 3 (the bare multi-argument print statement was not).
print('%s %s %s' % (train_X.shape, test_X.shape, train_y.shape))

(49352, 322) (74659, 322) (49352L,)


In [4]:
# Columns to keep: every column whose name does not contain 'feature_'.
f = [x for x in train_X.columns.values if 'feature_' not in x]
# f.append('feature_upper_perc')

In [5]:
# Restrict both frames to the selected columns. `.copy()` makes them
# independent frames: the interaction columns that Feature_2_interction /
# Feature_3_interction later add in place would otherwise be written to a
# slice of the loaded frame and raise pandas' SettingWithCopyWarning.
train_X = train_X[f].copy()
test_X = test_X[f].copy()
# Train and test rows stacked together (fresh 0..n-1 index).
full_data = pd.concat([train_X,test_X],ignore_index=True)
# A single pre-formatted argument prints the same text on Python 2 and is
# also valid on Python 3.
print('%s %s %s' % (train_X.shape, test_X.shape, full_data.shape))

(49352, 60) (74659, 60) (124011, 60)


In [6]:
# CV score on the selected columns, before any interaction features are added.
# Parenthesised single argument: same output on Python 2, valid on Python 3.
print(cv_train(train_X,train_y))

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[24]	train-mlogloss:0.273417+0.00274805	test-mlogloss:0.562661+0.00684593

0.5626605


In [7]:
# Hold-out score and xgbfir report for the selected columns.
# Parenthesised single argument: same output on Python 2, valid on Python 3.
print(FI(train_X,train_y,FIname = '../FE/FI.xlsx'))

0.563232


In [8]:

def Feature_2_interction(f1,f2):
    """Add the first principal component of columns ``f1`` and ``f2`` as a feature.

    The PCA is fitted on the module-level ``full_data`` and the projection is
    written in place to the module-level ``train_X`` and ``test_X`` under the
    column name ``f1 + '+' + f2``. Nothing is returned.

    NOTE(review): the columns are passed to PCA as they are, without
    rescaling -- confirm this is intended for features on different scales.
    """
    cols = [f1, f2]
    name = f1 + '+' + f2
    pca = PCA(n_components=1)
    pca.fit(full_data[cols])
    # transform() yields an (n_rows, 1) array; take column 0 so a 1-D array is
    # assigned, the same way Feature_3_interction assigns its components.
    train_X[name] = pca.transform(train_X[cols])[:, 0]
    test_X[name] = pca.transform(test_X[cols])[:, 0]

def Feature_3_interction(f1,f2,f3):
    """Add the first two principal components of columns ``f1``, ``f2``, ``f3``.

    The PCA is fitted on the module-level ``full_data``; the two projected
    components are written in place to the module-level ``train_X`` and
    ``test_X`` as ``f1+'+'+f2+'+'+f3+'0'`` and ``...+'1'``. Nothing is
    returned.
    """
    cols = [f1, f2, f3]
    prefix = f1 + '+' + f2 + '+' + f3
    reducer = PCA(n_components=2)
    reducer.fit(full_data[cols])
    # Same projection for both frames: train first, then test.
    for frame in (train_X, test_X):
        projected = reducer.transform(frame[cols])
        frame[prefix + '0'] = projected[:, 0]
        frame[prefix + '1'] = projected[:, 1]

In [9]:
# Features that are combined pairwise into PCA interaction columns.
top_2_features = [
    'manager_id_mean_med',
    'ratio_bed',
    'manager_id_mean_high',
    'building_id_mean_med',
    'building_id_mean_high',
    'price',
    'longitude',
    'listing_id',
    'latitude',
    'desc_wordcount',
    'display_address',
    'street_address',
    'num_features',
    'pricePerRoom',
]

# One new column in train_X / test_X per unordered pair of the features above.
for x, y in itertools.combinations(top_2_features, 2):
    Feature_2_interction(x, y)


# Disabled experiment: three-way interactions over a shorter feature list.
# top_3_features = ['manager_id_mean_med','ratio_bed','manager_id_mean_high','building_id_mean_med','building_id_mean_high',
#                 'price'
# #                 ,'desc_wordcount','display_address','street_address','num_features','pricePerRoom'
#                ]
# for x,y,z in itertools.combinations(top_3_features,3):
#     Feature_3_interction(x,y,z)

In [11]:
# Preview the first rows of the training frame, which at this point also
# holds the pairwise interaction columns (named '<f1>+<f2>').
train_X.head()

Unnamed: 0,listing_id,bathrooms,bedrooms,building_id,display_address,latitude,longitude,manager_id,price,street_address,...,desc_wordcount+display_address,desc_wordcount+street_address,desc_wordcount+num_features,desc_wordcount+pricePerRoom,display_address+street_address,display_address+num_features,display_address+pricePerRoom,street_address+num_features,street_address+pricePerRoom,num_features+pricePerRoom
0,7170325,1.0,1,3387,281,40.7108,-73.9539,2104,2400.0,2947,...,3164.879753,-8634.52426,-48.835386,-350.062888,-8649.032626,3164.701301,-367.947029,-8634.476915,8626.706053,-350.080746
1,7092344,1.0,2,4758,3434,40.7513,-73.9722,1964,3800.0,8215,...,10.748026,-3366.233889,225.164206,-283.496297,-3366.495016,11.701334,-283.475642,-3366.476916,3360.78209,-283.41408
2,7158677,1.0,2,5289,3457,40.7575,-73.9625,2627,3495.0,15314,...,-12.13323,3732.73206,197.164223,-385.152731,3732.535396,-11.298666,-385.010801,3732.523084,-3738.894069,-385.080747
3,7211212,1.5,3,2104,4018,40.7145,-73.9425,1204,3000.0,21701,...,-572.234931,10119.502466,-13.838151,-883.408969,10122.063634,-572.298455,-880.16808,10119.523076,-10133.882636,-883.414086
4,7225292,1.0,0,4930,3411,40.7439,-73.9743,610,2795.0,13511,...,35.792607,1929.245893,-257.836072,1245.013337,1929.341794,34.701381,1244.703434,1929.523082,-1906.623632,1244.919252


In [12]:
# CV score after the pairwise interaction columns have been added to train_X.
# Parenthesised single argument: same output on Python 2, valid on Python 3.
print(cv_train(train_X,train_y))

Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[24]	train-mlogloss:0.266803+0.00361119	test-mlogloss:0.562586+0.00846737

0.562586


In [13]:
# Hold-out score and xgbfir report with the interaction columns included;
# written to a separate file so the earlier report is not overwritten.
# Parenthesised single argument: same output on Python 2, valid on Python 3.
print(FI(train_X,train_y,FIname = '../FE/FI1.xlsx'))

0.563578
