In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the train and test sets; keep the test ids aside for the submission file.
train, test = (pd.read_json(path) for path in ('Data/train.json', 'Data/test.json'))
test_ids = test['id']

> Missing (`na`) values may still carry useful information, so we keep them (encoded as 0) rather than dropping them.

In [3]:
# Non-numeric inc_angle entries become NaN (errors='coerce') and are then
# encoded as 0. so that "missing" stays a distinct, learnable value.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on a column selection, which may act on a temporary copy (chained assignment).
# test gets the same encoding so both frames are treated consistently.
train['inc_angle'] = pd.to_numeric(train['inc_angle'], errors='coerce').fillna(0.)
test['inc_angle'] = pd.to_numeric(test['inc_angle'], errors='coerce').fillna(0.)

In [4]:
import gc

# Keep only the columns this analysis uses. .copy() makes the results
# independent frames, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning or write into a slice of the full frames.
train = train[['inc_angle', 'is_iceberg']].copy()
test = test[['inc_angle']].copy()
gc.collect()  # reclaim the memory held by the dropped columns
print(train.shape)

(1604, 2)


> To use value counts, we first have to round the angle to a fixed number of decimal places, so that near-identical angles fall into the same bucket.

In [5]:
def round_angle(ang, ndigits=4):
    """Round an angle so that near-identical values collapse to one key.

    Parameters
    ----------
    ang : float
        Angle value to round.
    ndigits : int, optional
        Number of decimal places to keep. Defaults to 4, the precision the
        notebook has always used.

    Returns
    -------
    float
        ``ang`` rounded to ``ndigits`` decimal places.
    """
    return round(ang, ndigits)

In [6]:
# Round both frames the same way so their angles can be compared exactly.
for frame in (train, test):
    frame['inc_angle'] = frame['inc_angle'].apply(round_angle)

In [7]:
# For each class (is_iceberg), count how often every rounded angle occurs.
train_value_counts = train.groupby('is_iceberg')['inc_angle'].value_counts()
print(train_value_counts)

is_iceberg  inc_angle
0           0.0000       133
            35.7863        9
            34.9664        6
            35.7829        6
            42.5128        6
            35.6291        5
            42.5145        5
            38.9177        4
            39.6636        4
            41.8582        4
            43.2159        4
            45.2393        4
            32.4245        3
            34.0467        3
            34.1367        3
            34.9528        3
            37.2428        3
            38.1366        3
            38.1403        3
            38.2635        3
            41.9264        3
            42.4133        3
            42.4142        3
            42.5075        3
            43.1096        3
            43.2207        3
            43.9194        3
            43.9873        3
            44.5876        3
            31.3432        2
                        ... 
1           43.9407        1
            43.9415        1
            43.9466  

In [8]:
# Ten most frequent angles among is_iceberg == 0 samples.
print(train_value_counts.loc[0].head(10))

inc_angle
0.0000     133
35.7863      9
34.9664      6
35.7829      6
42.5128      6
35.6291      5
42.5145      5
38.9177      4
39.6636      4
41.8582      4
Name: inc_angle, dtype: int64


In [9]:
# Ten most frequent angles among is_iceberg == 1 samples.
print(train_value_counts.loc[1].head(10))

inc_angle
34.4721    23
42.5591    16
33.6352    15
36.1061    15
39.2340    13
38.4755    11
39.2166    11
39.9784    11
45.2814    11
34.4709    10
Name: inc_angle, dtype: int64


In [10]:
# Number of train rows whose angle also occurs in test. isin hashes the test
# angles once instead of scanning the whole test array for every train row.
counter = train['inc_angle'].isin(test['inc_angle']).sum()

print('inc_angle in train also in test:', counter)

inc_angle in train also in test: 1041


In [11]:
# Number of test rows whose angle also occurs in train. isin hashes the train
# angles once instead of scanning the whole train array for every test row.
counter = test['inc_angle'].isin(train['inc_angle']).sum()

print('inc_angle in test also in train:', counter)

inc_angle in test also in train: 1974


In [12]:
# Same check via the value-count table: an angle is "known" if it was counted
# for either class. The known angles (second index level) are collected once
# rather than re-slicing train_value_counts for every test row.
known_angles = train_value_counts.index.get_level_values(1)
counter = test['inc_angle'].isin(known_angles).sum()

print('inc_angle in test also in train:', counter)

inc_angle in test also in train: 1974


## Transform these value counts into uncertainties, $1/\sqrt{n+1}$: one feature for `is_iceberg` = 1 and one for `is_iceberg` = 0

In [13]:
import math

def get_uncertainty(row, value_counts=None):
    """Attach count-based "uncertainty" features to a single row.

    For each class the feature is ``1 / sqrt(n + 1)``, where ``n`` is the
    number of counted samples of that class sharing the row's ``inc_angle``.
    If the angle was never counted for a class, an existing feature value is
    left untouched and a missing one is set to 1.0 (the ``n = 0`` value).

    Parameters
    ----------
    row : pandas.Series
        Row containing at least ``'inc_angle'``.
    value_counts : pandas.Series, optional
        Counts indexed by ``(class label, angle)`` with class labels 0 and 1.
        Defaults to the notebook-level ``train_value_counts``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``'inc_angle_neg'`` and ``'inc_angle_pos'`` set.
    """
    if value_counts is None:
        value_counts = train_value_counts

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are given.
    row = row.copy()
    ang = row['inc_angle']

    for label, column in ((0, 'inc_angle_neg'), (1, 'inc_angle_pos')):
        class_counts = value_counts.loc[label]  # slice once per class
        if ang in class_counts.index:
            row[column] = 1. / math.sqrt(class_counts.loc[ang] + 1.)
        elif column not in row.index:
            # Do not rely on the caller having pre-filled the column.
            row[column] = 1.

    return row

In [14]:
# Start every row at 1.0 (the value of 1/sqrt(n + 1) for n = 0); rows whose
# angle was counted in train are overwritten by get_uncertainty.
for frame in (train, test):
    for column in ('inc_angle_neg', 'inc_angle_pos'):
        frame[column] = np.ones((frame.shape[0],))

train = train.apply(get_uncertainty, axis=1)
test = test.apply(get_uncertainty, axis=1)

In [15]:
# Quick look at the engineered features in both frames.
for frame in (train, test):
    print(frame.head(5))

   inc_angle  is_iceberg  inc_angle_neg  inc_angle_pos
0    43.9239         0.0       0.707107       1.000000
1    38.1562         0.0       0.707107       1.000000
2    45.2859         1.0       1.000000       0.301511
3    43.8306         0.0       0.707107       1.000000
4    35.6256         0.0       0.707107       1.000000
   inc_angle  inc_angle_neg  inc_angle_pos
0    34.9664       0.377964            1.0
1    32.6151       1.000000            1.0
2    37.5054       1.000000            1.0
3    34.4739       1.000000            1.0
4    43.9189       1.000000            1.0


## Train xgboost on these features to see how well they can perform on their own

In [13]:
import xgboost
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

In [14]:
class base_tuner():
    """Base class for stage-wise hyper-parameter tuning with ``GridSearchCV``.

    Subclasses provide ``get_clf`` (build an estimator from
    ``default_params``) and ``tune`` (the sequence of search stages).

    Parameters
    ----------
    X, y
        Training features and labels handed to ``GridSearchCV.fit``.
    cv : int or cross-validation splitter, optional
        Passed straight to ``GridSearchCV`` (default 3).
    """

    def __init__(self, X, y, cv=3):
        self.X = X
        self.y = y
        self.default_params = {}  # parameters fixed so far; grows as stages finish
        self.cv = cv

    def fit_and_update_params(self, params, update=True):
        """Grid-search ``params`` on top of the current ``default_params``.

        Parameters
        ----------
        params : dict
            Parameter grid for ``GridSearchCV``.
        update : bool, optional
            When true, merge the winning values into ``default_params``.

        Returns
        -------
        dict
            The selected parameter combination.
        """
        clf = self.get_clf()

        gs = GridSearchCV(clf, params, scoring='neg_log_loss', cv=self.cv, return_train_score=False)
        gs.fit(self.X, self.y)

        cv_df = pd.DataFrame(gs.cv_results_)
        cv_df = cv_df[['mean_test_score', 'std_test_score', 'params', 'rank_test_score']]
        # Best rank first; ties are broken in favour of the smaller score spread.
        cv_df = cv_df.sort_values(by=['rank_test_score', 'std_test_score']).reset_index(drop=True)
        best_params = cv_df.loc[0, 'params']

        if update:
            self.default_params.update(best_params)

        print('Selected hyper-params:', best_params)
        # The score is negated log loss (scoring='neg_log_loss'), so closer to 0 is better.
        print('==============================> cv score: {:.4f}'.format(cv_df.loc[0, 'mean_test_score']))
        return best_params

    def tune(self):
        """Run all tuning stages; the base implementation does nothing."""
        pass

    def get_clf(self):
        """Return an estimator configured with ``default_params``.

        Raises
        ------
        NotImplementedError
            Always, in the base class. Returning ``None`` here would only
            surface later as an obscure failure inside ``GridSearchCV``.
        """
        raise NotImplementedError('subclasses must implement get_clf()')

In [15]:
class xgb_tuner(base_tuner):
    """Stage-wise grid-search tuner for ``xgboost.XGBClassifier``.

    Each ``tune_*`` method searches a small group of hyper-parameters through
    ``fit_and_update_params`` while everything already stored in
    ``default_params`` stays fixed, then merges the winners into
    ``default_params``. ``tune`` runs all stages in order.
    """

    def __init__(self, X, y, cv=3):
        # The base class already stores X, y and cv; only the starting
        # parameters are specific to this tuner.
        super(xgb_tuner, self).__init__(X, y, cv=cv)
        self.default_params = {
            'n_jobs': 4,
            'objective': 'binary:logistic',
            'seed': 0,
            'eval_metric': 'logloss'
        }

    @staticmethod
    def _refined_grid(best_params, zero_is_bound=False):
        """Build a finer grid (step 0.05) around each coarse-search winner.

        A winner of exactly 1. is only refined downwards; when
        ``zero_is_bound`` is set, a winner of exactly 0. is only refined
        upwards. Any other winner gets three steps on either side.
        """
        refined = {}
        for name, value in best_params.items():
            if value == 1.:
                refined[name] = [1., .95, .9, .85]
            elif zero_is_bound and value == 0.:
                refined[name] = [.0, .05, .1, .15]
            else:
                refined[name] = [value+.15, value+.1, value+.05, value,
                                 value-.05, value-.1, value-.15]
        return refined

    def _tune_coarse_to_fine(self, params, zero_is_bound=False):
        """Search the coarse grid, then a refined grid around its winner.

        Only the refined result is merged into ``default_params``.
        """
        best_params = self.fit_and_update_params(params, update=False)
        self.fit_and_update_params(self._refined_grid(best_params, zero_is_bound))

    def tune_booster(self):
        """Choose the booster type."""
        self.fit_and_update_params({
            'booster': ['dart', 'gbtree']
        })

    def tune_est_num_and_lr(self):
        """Jointly choose the number of estimators and the learning rate."""
        self.fit_and_update_params({
            'n_estimators': [100, 200, 400, 800],
            'learning_rate': [0.1, 0.05, 0.01, 0.005]
        })

    def tune_max_depth(self):
        """Choose the maximum tree depth."""
        self.fit_and_update_params({
            'max_depth': [1, 3, 5, 7, 9]
        })

    def tune_child_w_and_gamma(self):
        """Jointly choose ``min_child_weight`` and ``gamma``."""
        self.fit_and_update_params({
            'min_child_weight': [1, 2, 4, 6, 8, 10],
            'gamma': [0, 0.1, 0.2]
        })

    def tune_sampling(self):
        """Coarse-to-fine search over row and column subsampling ratios."""
        self._tune_coarse_to_fine({
            'subsample': [1., .8, .6, .4, .2],
            'colsample_bytree': [1., .8, .6, .4, .2]
        })

    def tune_regularization(self):
        """Coarse-to-fine search over ``reg_alpha`` and ``reg_lambda``."""
        self._tune_coarse_to_fine({
            'reg_alpha': [1., .8, .6, .4, .2, .0],
            'reg_lambda': [1., .8, .6, .4, .2, .0]
        }, zero_is_bound=True)

    def tune(self):
        """Run every tuning stage in order.

        Returns
        -------
        xgboost.XGBClassifier
            An unfitted classifier built from the tuned ``default_params``.
        """
        print('xgb tuner start tuning')
        self.tune_booster()
        self.tune_est_num_and_lr()
        self.tune_max_depth()
        self.tune_child_w_and_gamma()
        self.tune_sampling()
        self.tune_regularization()

        return self.get_clf()

    def get_clf(self):
        """Return a new ``XGBClassifier`` configured with ``default_params``."""
        return xgboost.XGBClassifier(**self.default_params)

In [16]:
# 5-fold shuffled cross-validation with a fixed seed so the splits are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=999)
tuner = xgb_tuner(
    X=train[['inc_angle_neg', 'inc_angle_pos']],
    y=train['is_iceberg'],
    cv=kfold,
)

In [17]:
# Run all tuning stages in order; tune() returns a new (unfitted) classifier
# built from the tuned parameters.
tuned_clf = tuner.tune()

xgb tuner start tuning
Selected hyper-params: {'booster': 'dart'}
Selected hyper-params: {'learning_rate': 0.01, 'n_estimators': 800}
Selected hyper-params: {'max_depth': 1}
Selected hyper-params: {'gamma': 0, 'min_child_weight': 1}
Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 1.0}
Selected hyper-params: {'colsample_bytree': 1.0, 'subsample': 0.95}
Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.8}
Selected hyper-params: {'reg_alpha': 0.0, 'reg_lambda': 0.9}


In [18]:
feature_cols = ['inc_angle_neg', 'inc_angle_pos']
tuned_clf.fit(train[feature_cols], train['is_iceberg'])

# predict_proba returns one column per class; column 1 is used as the
# is_iceberg probability.
probabilities = tuned_clf.predict_proba(test[feature_cols])
inc_angle_answers = probabilities[:, 1]
print(inc_angle_answers.tolist())

[0.001256680814549327, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.001256680814549327, 0.9978185892105103, 0.3092672526836395, 0.001256680814549327, 0.001256680814549327, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.001256680814549327, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.9987603425979614, 0.3092672526836395, 0.9987603425979614, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.001256680814549327, 0.3092672526836395, 0.3092672526836395, 0.9978185892105103, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.9987603425979614, 0.001256680814549327, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.3092672526836395, 0.9987603425979614, 0.3092672526836395, 0.3092672526836395, 0.9987238049507141, 0.309

In [19]:
# Pair each test id with its predicted probability and write the submission file.
submission = pd.DataFrame({
    'id': test_ids,
    'is_iceberg': inc_angle_answers,
})
submission.to_csv('Submissions/inc_angle_answer.csv', index=False, float_format="%.15f")

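> 0.302 on the public leaderboard vs. a local CV score of 0.0093. The local score is clearly overfitted (the count features are computed from the same training labels they are then validated on), but the leaderboard result shows that we do gain some information from inc_angle.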

## Another transform: a heuristic one

In [16]:
# Limits the transformed value approaches as the evidence for a class grows:
# cut_hi for is_iceberg == 1, cut_lo for is_iceberg == 0.
cut_hi = .8
cut_lo = .2

def dist(end, count):
    """Return a value between .5 and ``end`` that nears ``end`` as ``count`` grows.

    ``count = 0`` gives exactly .5; larger counts shrink the remaining gap to
    ``end`` by a factor of ``1 / (count + 1)``.
    """
    return end - (end-.5)/(count+1)

def heuristic_transform(ang, value_counts=None):
    """Map an angle to a pseudo-probability based on per-class counts.

    * angle never counted, or counted equally often for both classes: .5
    * counted only for class 0: between .5 and ``cut_lo``
    * counted only for class 1: between .5 and ``cut_hi``
    * counted for both: the mean of the class-0 pull towards ``cut_lo`` and
      the class-1 pull towards ``cut_hi``

    Parameters
    ----------
    ang : float
        Angle to transform.
    value_counts : pandas.Series, optional
        Counts indexed by ``(class label, angle)`` with class labels 0 and 1.
        Defaults to the notebook-level ``train_value_counts``.

    Returns
    -------
    float
        The transformed value.
    """
    if value_counts is None:
        value_counts = train_value_counts

    # Slice each class once instead of on every membership test / lookup.
    neg_counts = value_counts.loc[0]
    pos_counts = value_counts.loc[1]
    in_neg = ang in neg_counts.index
    in_pos = ang in pos_counts.index

    if not in_neg and not in_pos:
        return .5
    if in_neg and not in_pos:
        return dist(cut_lo, neg_counts.loc[ang])
    if in_pos and not in_neg:
        return dist(cut_hi, pos_counts.loc[ang])

    n_neg = neg_counts.loc[ang]
    n_pos = pos_counts.loc[ang]
    if n_neg == n_pos:
        return .5

    # The class-0 count must pull towards cut_lo (it previously used cut_hi,
    # so mixed angles could never fall below .5). With cut_lo the formula is
    # symmetric and yields .5 for equal counts, matching the branch above.
    return (dist(cut_lo, n_neg) + dist(cut_hi, n_pos)) / 2.

In [17]:
# Map every rounded angle to its heuristic pseudo-probability.
transformed_inc_angle_train = train['inc_angle'].map(heuristic_transform)
transformed_inc_angle_test = test['inc_angle'].map(heuristic_transform)

for transformed in (transformed_inc_angle_train, transformed_inc_angle_test):
    print(transformed)

0       0.350000
1       0.350000
2       0.772727
3       0.350000
4       0.350000
5       0.762500
6       0.700000
7       0.350000
8       0.242857
9       0.350000
10      0.650000
11      0.350000
12      0.700000
13      0.750000
14      0.350000
15      0.350000
16      0.350000
17      0.350000
18      0.250000
19      0.757143
20      0.350000
21      0.650000
22      0.260000
23      0.766667
24      0.350000
25      0.778571
26      0.762500
27      0.300000
28      0.650000
29      0.350000
          ...   
1574    0.202239
1575    0.202239
1576    0.202239
1577    0.202239
1578    0.202239
1579    0.202239
1580    0.202239
1581    0.202239
1582    0.202239
1583    0.202239
1584    0.202239
1585    0.202239
1586    0.202239
1587    0.202239
1588    0.202239
1589    0.202239
1590    0.202239
1591    0.202239
1592    0.202239
1593    0.202239
1594    0.202239
1595    0.202239
1596    0.202239
1597    0.202239
1598    0.202239
1599    0.202239
1600    0.202239
1601    0.2022

In [18]:
# Persist the transformed angles as single-column CSV files.
train_inc_angle = pd.DataFrame({'inc_angle': transformed_inc_angle_train})
train_inc_angle.to_csv('Data/inc_angle_train.csv', index=False, float_format="%.15f")

test_inc_angle = pd.DataFrame({'inc_angle': transformed_inc_angle_test})
test_inc_angle.to_csv('Data/inc_angle_test.csv', index=False, float_format="%.15f")