In [1]:
import time
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.stats.mstats import gmean
from datetime import datetime
from sklearn import preprocessing
from scipy.stats import skew, boxcox,boxcox_normmax
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from __future__ import print_function

import tensorflow as tf

from tqdm import tqdm

seed = 1234

# Load Data

In [2]:
data_path = "../../input/"
train_X = pd.read_csv(data_path + 'train_BM_MB_add03052240_desc.csv')
test_X = pd.read_csv(data_path + 'test_BM_MB_add03052240_desc.csv')
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))

# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print (train_X.shape, test_X.shape, train_y.shape)

(49352, 522) (74659, 522) (49352,)


In [17]:
scaler = preprocessing.StandardScaler()
scaler.fit(train_X)
train_X_trans = scaler.transform(train_X)
test_X_trans = scaler.transform(test_X)

# y_low =[]
# for i in range(train_X_trans.shape[0]):
#     y_low.append(1 if train_y[i] == 2 else 0)
    
# y_low = np.array(y_low)  

In [143]:
def Multi_Class_FM(train_x, train_y_binary, test_x, fold = 5):
    skf = KFold(n_splits=fold,random_state=seed)
    dict_opochs = {0:300,1:500,2:650}
#     dict_opochs = {0:40,1:40,2:40}
    label = set(train_y_binary)
    train_blend_x = np.zeros((train_x.shape[0], 2*len(label)))    
    test_blend_x_mean = np.zeros((test_x.shape[0], 2*len(label)))
    test_blend_x_gmean = np.zeros((test_x.shape[0], 2*len(label)))
    
    for j, (level) in enumerate(label):
        vfunc = np.vectorize(lambda x: 1 if x == level else 0)
        y_ovr = vfunc(train_y_binary)
        test_blend_x_j = np.zeros((test_x.shape[0], 2*fold))
        
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            train_x_fold = train_x[train_index]
            train_y_fold = y_ovr[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = y_ovr[val_index]  

            model = TFFMClassifier(
                order=2, 
                rank=10, 
                optimizer=tf.train.AdamOptimizer(learning_rate=3e-3,epsilon = 1), 
                n_epochs=dict_opochs[level], 
                batch_size=1024,
                init_std=0.001,
                reg= 0.01,
                input_type='dense',
                seed=42
            )

            model.fit(train_x_fold, train_y_fold, show_progress=False)
            predictions = model.predict_proba(val_x_fold)
            scores = log_loss(val_y_fold, predictions)
            print('label:{0}\tlog_loss: {1:.12f}\tfold:{2}'.format(level, scores, i+1))
            train_blend_x[val_index,j*2:(j+1)*2] = predictions
            test_blend_x_j[:,(i*2):(i+1)*2] = model.predict_proba(test_x)
            # this will close tf.Session and free resources
            model.destroy()
        
        test_blend_x_mean[:,(j*2):(j+1)*2] = \
                np.stack([test_blend_x_j[:,range(0,2*fold,2)].mean(1),
                          test_blend_x_j[:,range(1,2*fold,2)].mean(1)]).T
        
        test_blend_x_gmean[:,(j*2):(j+1)*2] = \
                np.stack([gmean(test_blend_x_j[:,range(0,2*fold,2)], axis=1),
                          gmean(test_blend_x_j[:,range(1,2*fold,2)], axis=1)]).T            
    return train_blend_x, test_blend_x_mean, test_blend_x_gmean

In [144]:
(tmp,
 test_mean,
 test_gmean)= Multi_Class_FM(train_X_trans,train_y,test_X_trans, 10)

label:0	log_loss: 0.454955991229	fold:1
label:0	log_loss: 0.443051334661	fold:2
label:0	log_loss: 0.456654232249	fold:3
label:0	log_loss: 0.447933480535	fold:4
label:0	log_loss: 0.462754510770	fold:5
label:0	log_loss: 0.452832541395	fold:6
label:0	log_loss: 0.458048671167	fold:7
label:0	log_loss: 0.470953706975	fold:8
label:0	log_loss: 0.464838941301	fold:9
label:0	log_loss: 0.473910745417	fold:10
label:1	log_loss: 0.459752223000	fold:1
label:1	log_loss: 0.441396369827	fold:2


  return 1 / (1 + np.exp(-x))


label:1	log_loss: 0.461093828437	fold:3
label:1	log_loss: 0.453319550619	fold:4
label:1	log_loss: 0.461915840894	fold:5
label:1	log_loss: 0.455964289362	fold:6
label:1	log_loss: 0.461743115097	fold:7
label:1	log_loss: 0.472934473471	fold:8
label:1	log_loss: 0.464874394938	fold:9
label:1	log_loss: 0.469379874931	fold:10
label:2	log_loss: 0.205032050521	fold:1
label:2	log_loss: 0.211113717831	fold:2
label:2	log_loss: 0.221399331508	fold:3
label:2	log_loss: 0.201093951353	fold:4
label:2	log_loss: 0.229471282665	fold:5
label:2	log_loss: 0.207587482725	fold:6
label:2	log_loss: 0.215556345781	fold:7
label:2	log_loss: 0.221223705717	fold:8
label:2	log_loss: 0.219412965314	fold:9
label:2	log_loss: 0.203328123977	fold:10


In [145]:
tmp_data = pd.DataFrame(tmp, columns = ['low0','low1','med0','med1','high0','high1'])
tmp_data[:10]

Unnamed: 0,low0,low1,med0,med1,high0,high1
0,0.475411,0.524589,0.53011,0.469889,0.850573,0.149427
1,0.689422,0.310578,0.493679,0.506321,0.875653,0.124347
2,0.213016,0.786984,0.787976,0.212024,0.973409,0.026591
3,0.593406,0.406594,0.768885,0.231115,0.979773,0.020228
4,0.06521,0.93479,0.889295,0.110705,0.980686,0.019314
5,0.626392,0.373608,0.539808,0.460192,0.895199,0.104801
6,0.420211,0.579789,0.707093,0.292907,0.921662,0.078338
7,0.551261,0.448739,0.575914,0.424086,0.925037,0.074963
8,0.484554,0.515446,0.613156,0.386844,0.856716,0.143284
9,0.091716,0.908284,0.721913,0.278087,0.98942,0.01058


In [147]:
tmp_data['sum1'] = tmp_data['low1'] + tmp_data['med1']+ tmp_data['high1']
tmp_data['low1_1'] = tmp_data['low1'] / tmp_data['sum1']
tmp_data['med1_1'] = tmp_data['med1'] / tmp_data['sum1']
tmp_data['high1_1'] = tmp_data['high1'] / tmp_data['sum1']
tmp_data.head()

Unnamed: 0,low0,low1,med0,med1,high0,high1,sum1,low1_1,med1_1,high1_1
0,0.475411,0.524589,0.53011,0.469889,0.850573,0.149427,1.143906,0.458595,0.410776,0.130629
1,0.689422,0.310578,0.493679,0.506321,0.875653,0.124347,0.941245,0.329965,0.537926,0.132109
2,0.213016,0.786984,0.787976,0.212024,0.973409,0.026591,1.025599,0.767341,0.206732,0.025927
3,0.593406,0.406594,0.768885,0.231115,0.979773,0.020228,0.657937,0.617983,0.351273,0.030744
4,0.06521,0.93479,0.889295,0.110705,0.980686,0.019314,1.06481,0.877894,0.103967,0.018139


In [148]:
log_loss(train_y,tmp_data[['low1_1','med1_1','high1_1']])

0.62095057924051367

In [149]:
test_gmean.shape

(74659, 6)

In [150]:
test_mean_sub = pd.DataFrame(test_mean)

In [151]:
sub_test_mean = test_mean_sub[[1,3,5]]
sub_test_mean.shape

(74659, 3)

In [152]:
now = datetime.now()
sub_name = '../../output/sub_FM_mean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = sub_test_mean
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X.listing_id.values
out_df.to_csv(sub_name, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [158]:
train_data = pd.DataFrame(tmp, columns = ['low0','low1','med0','med1','high0','high1'])
train_data['sum1'] = train_data['low1'] + train_data['med1']+ train_data['high1']
train_data['low1_1'] = train_data['low1'] / train_data['sum1']
train_data['med1_1'] = train_data['med1'] / train_data['sum1']
train_data['high1_1'] = train_data['high1'] / train_data['sum1']
train_blend = np.array(train_data[['low1_1','med1_1','high1_1']])

In [161]:
test_mean_data = pd.DataFrame(test_mean, columns = ['low0','low1','med0','med1','high0','high1'])
test_mean_data['sum1'] = test_mean_data['low1'] + test_mean_data['med1']+ test_mean_data['high1']
test_mean_data['low1_1'] = test_mean_data['low1'] / test_mean_data['sum1']
test_mean_data['med1_1'] = test_mean_data['med1'] / test_mean_data['sum1']
test_mean_data['high1_1'] = test_mean_data['high1'] / test_mean_data['sum1']
test_mean_blend = np.array(test_mean_data[['low1_1','med1_1','high1_1']])

In [169]:
test_gmean_data = pd.DataFrame(test_gmean, columns = ['low0','low1','med0','med1','high0','high1'])
test_gmean_data['sum1'] = test_gmean_data['low1'] + test_gmean_data['med1']+ test_gmean_data['high1']
test_gmean_data['low1_1'] = test_gmean_data['low1'] / test_gmean_data['sum1']
test_gmean_data['med1_1'] = test_gmean_data['med1'] / test_gmean_data['sum1']
test_gmean_data['high1_1'] = test_gmean_data['high1'] / test_gmean_data['sum1']
test_gmean_blend = np.array(test_gmean_data[['low1_1','med1_1','high1_1']])

In [170]:
test_gmean_blend.shape

(74659, 3)

In [175]:
name_train_blend = '../../output/train_blend_FM_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../../output/test_blend_FM_mean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_gmean = '../../output/test_blend_FM_gmean_BM_MB_add_desc_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'


# print (np.mean(blend_scores_xgb,axis=0))
# print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend, delimiter=",")
np.savetxt(name_test_blend_mean,test_mean_blend, delimiter=",")
np.savetxt(name_test_blend_gmean,test_gmean_blend, delimiter=",")

In [164]:
out_df.head()

Unnamed: 0,low,medium,high,listing_id
0,0.386762,0.378102,0.169965,7142618
1,0.977577,0.044951,0.010625,7210040
2,0.970149,0.075394,0.031292,7174566
3,0.716298,0.247062,0.064973,7191391
4,0.628346,0.345032,0.05252,7171695


In [171]:
test_gmean_data.head()

Unnamed: 0,low0,low1,med0,med1,high0,high1,sum1,low1_1,med1_1,high1_1
0,0.613034,0.386446,0.621443,0.377402,0.829876,0.169162,0.93301,0.414192,0.4045,0.181308
1,0.022209,0.977572,0.955045,0.044873,0.989374,0.010585,1.033031,0.946315,0.043438,0.010247
2,0.029633,0.970142,0.924576,0.075024,0.968706,0.031229,1.076395,0.901288,0.069699,0.029012
3,0.283653,0.716278,0.752922,0.247014,0.935022,0.064908,1.0282,0.696633,0.24024,0.063128
4,0.371513,0.62826,0.654775,0.344658,0.947474,0.052426,1.025344,0.612731,0.336139,0.05113


In [172]:
test_gmean_blend

array([[ 0.41419245,  0.4044996 ,  0.18130795],
       [ 0.94631494,  0.04343844,  0.01024663],
       [ 0.90128841,  0.06969916,  0.02901243],
       ..., 
       [ 0.89696797,  0.06031721,  0.04271482],
       [ 0.94487662,  0.0479009 ,  0.00722248],
       [ 0.53974265,  0.40173102,  0.05852633]])

In [168]:
0.970149/1.076835

0.900926325760214

In [19]:
X_train, X_val, y_train, y_val = train_test_split(train_X_trans, y_low, train_size=.80, random_state=2451234)
print (X_train.shape)
print (X_val.shape)
# xgtrain = xgb.DMatrix(X_train, label=y_train)

(39481, 522)
(9871, 522)


# Low 300
# med 500
# high 650

In [20]:
from tffm import TFFMClassifier

for n_epochs in [150,200,250,300,350]:
    model = TFFMClassifier(
        order=2, 
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=3e-3,epsilon = 1), 
        n_epochs=n_epochs, 
        batch_size=1024,
        init_std=0.001,
        reg=0.1,
        input_type='dense',
        seed=42
    )
    model.fit(X_train, y_train, show_progress=True)
    predictions = model.predict_proba(X_val)
    print('[n_epochs={}] log_loss: {}'.format(n_epochs, log_loss(y_val, predictions)))
    # this will close tf.Session and free resources
    model.destroy() 


100%|██████████| 150/150 [02:01<00:00,  1.24epoch/s]


[n_epochs=150] log_loss: 0.238953546445


100%|██████████| 200/200 [02:40<00:00,  1.25epoch/s]


[n_epochs=200] log_loss: 0.226145637034


100%|██████████| 250/250 [03:21<00:00,  1.24epoch/s]


[n_epochs=250] log_loss: 0.219946944642


100%|██████████| 300/300 [03:47<00:00,  1.39epoch/s]


[n_epochs=300] log_loss: 0.216544215726


100%|██████████| 350/350 [04:37<00:00,  1.24epoch/s]

[n_epochs=350] log_loss: 0.21453853663





In [21]:
from tffm import TFFMClassifier

for n_epochs in [400,450]:
    model = TFFMClassifier(
        order=2, 
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=3e-3,epsilon = 1), 
        n_epochs=n_epochs, 
        batch_size=1024,
        init_std=0.001,
        reg=0.1,
        input_type='dense',
        seed=42
    )
    model.fit(X_train, y_train, show_progress=True)
    predictions = model.predict_proba(X_val)
    print('[n_epochs={}] log_loss: {}'.format(n_epochs, log_loss(y_val, predictions)))
    # this will close tf.Session and free resources
    model.destroy() 


100%|██████████| 400/400 [05:49<00:00,  1.14epoch/s]


[n_epochs=400] log_loss: 0.213159294894


100%|██████████| 450/450 [06:29<00:00,  1.34epoch/s]

[n_epochs=450] log_loss: 0.212318720357





In [35]:
for n_epochs in [500,550,600,650]:
    model = TFFMClassifier(
        order=2, 
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=3e-3,epsilon = 1), 
        n_epochs=n_epochs, 
        batch_size=1024,
        init_std=0.001,
        reg=0.1,
        input_type='dense',
        seed=42
    )
    model.fit(X_train, y_train, show_progress=True)
    predictions = model.predict_proba(X_val)
    print('[n_epochs={}] log_loss: {}'.format(n_epochs, log_loss(y_val, predictions)))
    # this will close tf.Session and free resources
    model.destroy() 

100%|██████████| 500/500 [06:13<00:00,  1.33epoch/s]


[n_epochs=500] log_loss: 0.211527009557


100%|██████████| 550/550 [06:49<00:00,  1.34epoch/s]


[n_epochs=550] log_loss: 0.211340100337


100%|██████████| 600/600 [07:26<00:00,  1.34epoch/s]


[n_epochs=600] log_loss: 0.211545995756


100%|██████████| 650/650 [08:04<00:00,  1.35epoch/s]

[n_epochs=650] log_loss: 0.211279009451





In [36]:
for n_epochs in [700,750]:
    model = TFFMClassifier(
        order=2, 
        rank=10, 
        optimizer=tf.train.AdamOptimizer(learning_rate=3e-3,epsilon = 1), 
        n_epochs=n_epochs, 
        batch_size=1024,
        init_std=0.001,
        reg=0.1,
        input_type='dense',
        seed=42
    )
    model.fit(X_train, y_train, show_progress=True)
    predictions = model.predict_proba(X_val)
    print('[n_epochs={}] log_loss: {}'.format(n_epochs, log_loss(y_val, predictions)))
    # this will close tf.Session and free resources
    model.destroy() 


100%|██████████| 700/700 [09:01<00:00,  1.32epoch/s]


[n_epochs=700] log_loss: 0.211723860883


100%|██████████| 750/750 [09:37<00:00,  1.33epoch/s]

[n_epochs=750] log_loss: 0.21151920601





In [None]:
predictions

In [61]:
fold = 5
skf = KFold(n_splits=fold,random_state=seed)

for reg in [1]:
    scores = np.zeros ((fold))
    for i, (train_index, val_index) in enumerate(skf.split(train_X_trans)):
#         print ('reg:{0:.3f}\t fold:{1}'.format(reg, i+1))
        train_x_fold = train_X_trans[train_index]
        train_y_fold = y_h[train_index]
        val_x_fold = train_X_trans[val_index]
        val_y_fold = y_h[val_index]  


        model = TFFMClassifier(
            order=2, 
            rank=10, 
            optimizer=tf.train.AdamOptimizer(learning_rate=3e-3,epsilon = 1), 
            n_epochs=100, 
            batch_size=1024,
            init_std=0.001,
            reg= 0.01,
            input_type='dense',
            seed=42
        )
        
        print 
        model.fit(train_x_fold, train_y_fold, show_progress=False)
        predictions = model.predict_proba(val_x_fold)
        scores[i] = log_loss(val_y_fold, predictions)
        print('[reg={0:.3f}]\t log_loss: {1:.12f}\tfold:{2}'.format(reg, scores[i],i+1))
        # this will close tf.Session and free resources
        model.destroy()     

    print ('reg:{0:.3f}\t score:{1:.3f}'.format(reg, np.mean(scores)))    

[reg=1.000]	 log_loss: 0.323255033825	fold:1
[reg=1.000]	 log_loss: 0.293114717140	fold:2
[reg=1.000]	 log_loss: 0.276313674261	fold:3
[reg=1.000]	 log_loss: 0.295404785321	fold:4
[reg=1.000]	 log_loss: 0.333144691132	fold:5
reg:1.000	 score:0.304


In [29]:
train_index

array([    0,     1,     2, ..., 39479, 39480, 39481])