# Data Science Olympics 2019, Paris and Berlin

The Data Science Olympics were held on May 23rd, 2019, simultaneously in Berlin and Paris.
Over 1000 data scientists participated. We had 2 hours to create the best predictive model.

In [1]:
# Libraries and options
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
%matplotlib inline 
# Raise pandas' display limits so wide frames and long listings are shown in full.
pd.set_option('display.max_columns', 500)
#pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 500)
# -1 is used here to disable truncation of long cell values.
# NOTE(review): newer pandas releases expect None instead of -1 for this option -
# confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

In [2]:
# Define the competition scorer
def competition_scorer(y_true, y_pred):
    """Return the competition metric: log loss with label-dependent sample weights.

    Every sample is weighted by ``10 ** label`` (1, 10, 100, 1000 for the
    labels 0-3), so errors on higher labels cost more.

    Parameters
    ----------
    y_true : array-like
        True numeric class labels (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted probabilities, forwarded unchanged to ``log_loss``.

    Returns
    -------
    The value returned by ``sklearn.metrics.log_loss`` for the weighted samples.
    """
    # np.asarray makes plain lists/tuples work as well: ``10 ** [0, 1]`` raises
    # TypeError, while arrays and Series are exponentiated elementwise.
    sample_weights = 10 ** np.asarray(y_true)
    return log_loss(y_true, y_pred, sample_weight=sample_weights)

In [3]:
# Presumably the base directory of the competition CSV files; '.' is the
# notebook's current working directory.
data_path = '.'

In [4]:
# Both CSV files are read from `data_path` with identical parser options.
# `error_bad_lines=False` drops malformed rows instead of aborting the load
# (pandas reports them as "Skipping line ...").
# NOTE(review): pandas >= 1.3 spells this option `on_bad_lines='skip'` - switch
# when the environment is upgraded.
read_options = dict(sep=',', low_memory=False, error_bad_lines=False)
target_col = 'granted_number_of_nights'

# Train data: every column except the target is a feature
requests = pd.read_csv('{}/train_requests.csv'.format(data_path), **read_options)
X_train = requests.loc[:, requests.columns != target_col]
y_train = requests[target_col]

# Test data (no target column)
requests_test = pd.read_csv('{}/test_requests.csv'.format(data_path), **read_options)
X_test = requests_test

b'Skipping line 31303: expected 24 fields, saw 49\nSkipping line 75954: expected 24 fields, saw 49\n'


# Exploratory analysis

In [5]:
X_test.dtypes

request_id                       object
animal_presence                  object
answer_creation_date             object
child_situation                  int64 
child_to_come                    object
district                         int64 
group_composition_id             int64 
group_composition_label          object
group_creation_date              object
group_id                         object
group_main_requester_id          object
group_type                       object
housing_situation_id             int64 
housing_situation_label          object
long_term_housing_request        object
number_of_underage               int64 
request_backoffice_creator_id    object
request_creation_date            object
requester_type                   object
social_situation_id              object
town                             object
victim_of_violence               object
victim_of_violence_type          object
dtype: object

In [6]:
# Dimensions
print("Dimension X_train:", X_train.shape)
print("Dimension X_test:", X_test.shape)

Dimension X_train: (297739, 23)
Dimension X_test: (74189, 23)


In [7]:
y_train.value_counts()

0    145743
1    102201
2    46585 
3    3210  
Name: granted_number_of_nights, dtype: int64

In [8]:
X_train.info()
X_train.describe(include='all').T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297739 entries, 0 to 297738
Data columns (total 23 columns):
request_id                       297739 non-null object
animal_presence                  297739 non-null object
answer_creation_date             297739 non-null object
child_situation                  297739 non-null int64
child_to_come                    115453 non-null object
district                         297739 non-null int64
group_composition_id             297739 non-null int64
group_composition_label          297738 non-null object
group_creation_date              297738 non-null object
group_id                         297739 non-null object
group_main_requester_id          297739 non-null object
group_type                       297738 non-null object
housing_situation_id             297739 non-null int64
housing_situation_label          276802 non-null object
long_term_housing_request        90926 non-null object
number_of_underage               297739 non-null int64

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
request_id,297739,297739.0,8e091360305f9da031561b5eddb478ba,1.0,,,,,,,
animal_presence,297739,2.0,f,296908.0,,,,,,,
answer_creation_date,297739,277562.0,2019-01-01 07:00:00,92.0,,,,,,,
child_situation,297739,,,,-0.321184,4.08921,-1.0,-1.0,-1.0,-1.0,70.0
child_to_come,115453,2.0,f,107529.0,,,,,,,
district,297739,,,,7045.22,46442.4,1.0,35.0,61.0,81.0,315842.0
group_composition_id,297739,,,,27.9082,28.8707,-1.0,10.0,10.0,40.0,120.0
group_composition_label,297738,12.0,man alone,163208.0,,,,,,,
group_creation_date,297738,61766.0,2017-07-17 00:00:00.000,151.0,,,,,,,
group_id,297739,63140.0,26ea416385179d96f07aaca883dfeb19,81.0,,,,,,,


## Utility function for sampling

In [9]:
def vm_show_sample(df, n = 10, cols = None):
  """Print a random sample of ``n`` rows of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to sample from.
  n : int
      Number of rows to draw (forwarded to ``DataFrame.sample``).
  cols : list-like of column labels, optional
      Columns to show; all columns of ``df`` when omitted.
  """
  # Identity test instead of `== None`: comparing an Index or array with None
  # is elementwise, and the resulting array cannot be used as an `if` condition.
  if cols is None:
    cols = df.columns
  print(df.loc[:, cols].sample(n))

In [10]:
vm_show_sample(X_train, 10)

                              request_id animal_presence  \
376     d06e3b0bb395a124f97c991797bd737c  f                
92138   79fcfeb5dcc3b103b9914e4d25cf0bca  f                
147785  561c14a0989a1438f0f3c133f5ae7b32  f                
23012   59c88514e2d287af5c47c49fb914ad0d  f                
87882   94b9f53a8a019b247f17f7d3d590bb60  f                
157458  43fce5852687ecdb7ac57fe7b22afd5b  f                
64365   eb17a5f53f3932dba2b0531cc0e76ff3  f                
123606  7f06179abf47b364c221c036c3dbd1c6  f                
212105  a3e67a7c3c195a06fb4d6aba873f6887  f                
16665   ed52383dd93187124c09499d8ede5318  f                

           answer_creation_date  child_situation child_to_come  district  \
376     2019-02-25 14:22:07.133 -1                f             81         
92138   2019-02-28 07:24:15.74  -1                NaN           46         
147785  2019-03-29 15:23:21.752 -1                NaN           22         
23012   2019-02-21 10:55:00.181 -1 

## Utility function for exploring unique values

In [11]:
# Explore top values for each column
def vm_show_value_counts(df, top_n):
  """Print the most frequent values (missing values included) of every column.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose columns are summarised.
  top_n : int
      Number of most frequent values to print per column; -1 prints all of them.
  """
  for col in df.columns:
    # format() instead of '+' so non-string column labels (e.g. integers)
    # do not raise TypeError; the text is unchanged for string labels.
    print('Top {} values for {}'.format(top_n, col))
    vals = df[col].value_counts(dropna = False)
    if top_n != -1:
      vals = vals.iloc[:top_n]
    print(vals)
    print('\n')

In [12]:
vm_show_value_counts(X_train, 10)

Top 10 values for request_id
8e091360305f9da031561b5eddb478ba    1
cc8e0cf2b4dcddbd267e56a0fb057869    1
681d3109d479b6f5c42740a023a29a24    1
c3c78a413a797c294e113d869c76c9df    1
80923f71390138f73dbfb0935fdcc8be    1
7d20b674317274f02ac631c991c1524d    1
020177fbeadad774bc454de6813f67c6    1
74e1e52117def9efd091f65dfe186c7c    1
90620d5cb166fd996092107a6b6043ea    1
c867f44d93b8872c89e58658b6bc2b58    1
Name: request_id, dtype: int64


Top 10 values for animal_presence
f    296908
t    831   
Name: animal_presence, dtype: int64


Top 10 values for answer_creation_date
2019-01-01 07:00:00    92
2019-02-06 09:00:00    71
2019-02-05 09:00:00    66
2019-03-24 21:00:00    64
2019-02-04 09:00:00    63
2019-02-26 09:00:00    62
2019-02-12 08:00:00    60
2019-01-07 09:00:00    60
2019-01-11 09:00:00    59
2019-02-12 09:00:00    57
Name: answer_creation_date, dtype: int64


Top 10 values for child_situation
-1     285079
 10    10139 
 30    1047  
 20    702   
 70    304   
 40    277   
 5

In [13]:
def vm_count_unique(df):
    """Return a Series with the number of distinct non-null values per column of ``df``."""
    return df.nunique()

In [14]:
vm_count_unique(X_train)

request_id                       297739
animal_presence                  2     
answer_creation_date             277562
child_situation                  8     
child_to_come                    2     
district                         102   
group_composition_id             13    
group_composition_label          12    
group_creation_date              61766 
group_id                         63140 
group_main_requester_id          60950 
group_type                       2     
housing_situation_id             22    
housing_situation_label          21    
long_term_housing_request        2     
number_of_underage               13    
request_backoffice_creator_id    1088  
request_creation_date            277725
requester_type                   3     
social_situation_id              63112 
town                             1168  
victim_of_violence               2     
victim_of_violence_type          6     
dtype: int64

# Preprocessing

## Choose features

In [15]:
# Hand-picked low-cardinality columns, all treated as categorical features.
# (Alternative that was considered: every object-dtype column, i.e.
#  X_train.columns[X_train.dtypes == 'object'].)
my_cat_cols = ['animal_presence', 'child_situation', 'child_to_come', 'group_composition_id', 'group_type', 'housing_situation_id',
               'long_term_housing_request', 'number_of_underage', 'requester_type', 'victim_of_violence', 'victim_of_violence_type']

## Fill missing values in categorical variables

In [16]:
# Keep only the selected categorical columns. The explicit .copy() makes each
# frame independent of the one it was sliced from, so assigning to its columns
# later neither triggers pandas' SettingWithCopyWarning nor touches the raw data.
X_train = X_train.loc[:, my_cat_cols].copy()
X_test = X_test.loc[:, my_cat_cols].copy()

In [17]:
X_train.dtypes

animal_presence              object
child_situation              int64 
child_to_come                object
group_composition_id         int64 
group_type                   object
housing_situation_id         int64 
long_term_housing_request    object
number_of_underage           int64 
requester_type               object
victim_of_violence           object
victim_of_violence_type      object
dtype: object

In [18]:
# Cast the int64 feature columns (as detected on X_train) to strings in both
# frames, so every selected column holds string categories.
int_coded_cols = [name for name in X_train.columns if X_train[name].dtype == 'int64']
for frame in (X_train, X_test):
    for name in int_coded_cols:
        frame[name] = frame[name].astype(str)

In [19]:
def vm_fill_missing(df, cols, fill_value = 'ZZmissing'):
  """Return a deep copy of ``df`` with missing values in ``cols`` replaced.

  Parameters
  ----------
  df : pandas.DataFrame
      Input frame; it is not modified.
  cols : iterable of column labels
      Columns whose missing values are filled.
  fill_value : scalar, default 'ZZmissing'
      Replacement written in place of missing values.

  Returns
  -------
  pandas.DataFrame
      The filled copy.
  """
  res = df.copy(deep = True)
  for col in cols:
    res[col] = res[col].fillna(fill_value)
  return res

In [20]:
# Replace missing values in every selected column of both frames.
X_train, X_test = [vm_fill_missing(frame, my_cat_cols) for frame in (X_train, X_test)]

## Prepare weights and OH encoding

In [21]:
# Per-row training weights, 10 ** label - the same weighting competition_scorer
# applies when it evaluates predictions.
original_train_weights = 10 ** y_train

In [22]:
from sklearn.preprocessing import OneHotEncoder
# Learn the category vocabulary on the training frame only, then encode both
# frames with it; categories unseen during fitting are ignored, not rejected.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

In [23]:
# Prepend the sample weights as column 0 of the dense one-hot matrix, so that
# weights and features go through the train/validation split together.
X_train_aug = np.hstack([np.asarray(original_train_weights).reshape(-1, 1), X_train_enc.toarray()])

## CV split

In [24]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_xgb_train_aug, X_xgb_cv_aug, y_xgb_train, y_xgb_cv = train_test_split(
    X_train_aug, y_train, test_size=0.2, random_state=42)

In [34]:
# Column 0 holds the sample weights; the remaining columns are the one-hot features.
train_weights, X_xgb_train = X_xgb_train_aug[:, 0], X_xgb_train_aug[:, 1:]
cv_weights, X_xgb_cv = X_xgb_cv_aug[:, 0], X_xgb_cv_aug[:, 1:]

# XGBoost

In [27]:
import xgboost as xgb

In [28]:
# Wrap the training and validation splits as DMatrix objects, attaching the
# labels and the per-row sample weights to each.
train_dmatrix = xgb.DMatrix(data = X_xgb_train, label = y_xgb_train, weight = train_weights)
cv_dmatrix = xgb.DMatrix(data = X_xgb_cv, label = y_xgb_cv, weight = cv_weights)

In [42]:
watchlist = [(train_dmatrix, 'train'), (cv_dmatrix, 'cv')]
def custom_eval(preds, dtrain):
    """Custom evaluation callback for ``xgb.train``.

    Reads the true labels from ``dtrain`` and returns the pair
    ``('competition_score', value)``, where ``value`` is
    ``competition_scorer(labels, preds)``.
    """
    metric_value = competition_scorer(dtrain.get_label(), preds)
    return 'competition_score', metric_value

I only had time to explore the most important hyperparameters. My best attempt in the short time remaining is the following.

In [79]:
# Booster parameters for a 4-class probability model.
# The number of boosting rounds is controlled by `num_round` (passed to
# xgb.train) together with early stopping. The former 'n_estimators' entry is a
# scikit-learn-wrapper argument that xgb.train does not use, so it was removed
# to avoid suggesting that 1000 rounds are trained.
param = {'num_class':4,
         'max_depth':4,
         'min_child_weight':1.2,
         # 'eta' is left unset; the commented-out value 0.3 matches XGBoost's documented default
         #'eta':0.3,
         'silent':1,
         'objective':'multi:softprob',
         'eval_metric':'mlogloss'}
# Upper bound on boosting rounds; early stopping may end training sooner.
num_round = 500

In [80]:
# Train for at most num_round rounds while evaluating both sets in `watchlist`.
# Training stops once the cv competition_score has not improved for 50 rounds
# (the training log states that this metric drives early stopping).
bst = xgb.train(param, train_dmatrix, num_round, watchlist, feval = custom_eval, early_stopping_rounds = 50)

[0]	train-mlogloss:1.2488	cv-mlogloss:1.2496	train-competition_score:1.2488	cv-competition_score:1.2496
Multiple eval metrics have been passed: 'cv-competition_score' will be used for early stopping.

Will train until cv-competition_score hasn't improved in 50 rounds.
[1]	train-mlogloss:1.16125	cv-mlogloss:1.16338	train-competition_score:1.16125	cv-competition_score:1.16338
[2]	train-mlogloss:1.10085	cv-mlogloss:1.1042	train-competition_score:1.10085	cv-competition_score:1.1042
[3]	train-mlogloss:1.05777	cv-mlogloss:1.0625	train-competition_score:1.05777	cv-competition_score:1.0625
[4]	train-mlogloss:1.02684	cv-mlogloss:1.03312	train-competition_score:1.02684	cv-competition_score:1.03312
[5]	train-mlogloss:1.00364	cv-mlogloss:1.01051	train-competition_score:1.00364	cv-competition_score:1.01051
[6]	train-mlogloss:0.985455	cv-mlogloss:0.9933	train-competition_score:0.985455	cv-competition_score:0.9933
[7]	train-mlogloss:0.97056	cv-mlogloss:0.979541	train-competition_score:0.97056	cv-comp

[72]	train-mlogloss:0.888326	cv-mlogloss:0.922833	train-competition_score:0.888325	cv-competition_score:0.922833
[73]	train-mlogloss:0.887885	cv-mlogloss:0.923071	train-competition_score:0.887885	cv-competition_score:0.923071
[74]	train-mlogloss:0.887773	cv-mlogloss:0.923124	train-competition_score:0.887773	cv-competition_score:0.923124
[75]	train-mlogloss:0.887552	cv-mlogloss:0.923233	train-competition_score:0.887552	cv-competition_score:0.923233
[76]	train-mlogloss:0.887432	cv-mlogloss:0.923374	train-competition_score:0.887432	cv-competition_score:0.923374
[77]	train-mlogloss:0.887193	cv-mlogloss:0.923237	train-competition_score:0.887193	cv-competition_score:0.923237
[78]	train-mlogloss:0.886756	cv-mlogloss:0.923467	train-competition_score:0.886756	cv-competition_score:0.923467
[79]	train-mlogloss:0.886591	cv-mlogloss:0.923486	train-competition_score:0.886591	cv-competition_score:0.923486
[80]	train-mlogloss:0.886275	cv-mlogloss:0.923402	train-competition_score:0.886275	cv-competitio

In [81]:
# Score the hold-out split with the competition metric.
# NOTE(review): predict() is called without restricting the booster to its best
# early-stopping iteration - confirm whether the best iteration should be used.
preds = bst.predict(cv_dmatrix)
competition_scorer(y_xgb_cv, preds)

0.9249671148676107

# Predict

In [82]:
# The test matrix carries the encoded features only; no label or weight is passed.
test_dmatrix = xgb.DMatrix(data = X_test_enc)

In [83]:
# Predicted probabilities for the test requests; columns 0-3 are read when the
# submission frame is built.
# NOTE(review): as for the validation score, prediction is not restricted to
# the best early-stopping iteration - confirm this is intended.
y_test_predicted_probs = bst.predict(test_dmatrix)

# Submit

In [84]:
# Submission layout: column '0' holds the request id, columns '1'-'4' hold the
# predicted probabilities taken from columns 0-3 of the prediction matrix.
submission_df = pd.DataFrame({#'product_id':np.arange(X_test.shape[0]),
                              '0':requests_test['request_id'],
                              '1':y_test_predicted_probs[:, 0],
                              '2':y_test_predicted_probs[:, 1],
                              '3':y_test_predicted_probs[:, 2],
                              '4':y_test_predicted_probs[:, 3]})

In [85]:
submission_df.head(10)

Unnamed: 0,0,1,2,3,4
0,7a2f7215846392375b7e5f4a416298aa,0.004191,0.100471,0.783671,0.111668
1,f9388302f1a12ff58a9cabad201b24c4,0.040379,0.131103,0.6015,0.227018
2,482f122fbc7333eda62ca9efee6fbeb2,0.004483,0.121098,0.56392,0.310499
3,0676b42f6d679cfaee0e2633fc793a00,0.062231,0.091025,0.450553,0.396191
4,c857c384143351e1baf335648508e8c5,0.004251,0.195008,0.54924,0.251502
5,bd3a8a9baefd74922d8519693cfce0fe,0.060439,0.052258,0.485847,0.401456
6,96bf8268d973ad87a572893ee8378be3,0.027383,0.117979,0.525287,0.329351
7,09da6eb09639b4d0b9e87761ca585d23,0.040379,0.131103,0.6015,0.227018
8,c7311d1d63491975467283c6d6dad36f,0.039004,0.151976,0.572005,0.237015
9,34dc3716f07301c6bf1f4a062a669bf5,0.101141,0.025885,0.26139,0.611584


In [86]:
import io, math, requests

def submit_prediction(df, sep=',', comment='', compression='gzip', **kwargs):
    """Write ``df`` to a temporary CSV file and upload it to the scoring API.

    Parameters
    ----------
    df : pandas.DataFrame
        Submission frame to upload.
    sep : str
        Field separator used when writing the CSV.
    comment : str
        Free-text comment sent along with the submission.
    compression : str
        Compression used for the temporary file and reported to the server.
    **kwargs
        Extra keyword arguments forwarded to ``DataFrame.to_csv``.

    Raises
    ------
    Exception
        If the server answers with status 429 (submissions too close together)
        or with any other status than 200.
    """
    # Imported locally under an alias: this notebook also binds the name
    # `requests` to the training DataFrame, so the module-level name is not
    # guaranteed to be the HTTP library when this function runs.
    import requests as http

    # NOTE(review): placeholder credential - supply the real token from outside
    # the notebook (e.g. an environment variable) rather than committing it.
    TOKEN='foo'
    URL='https://qscore.datascience-olympics.com/api/submissions'
    df.to_csv('temporary.dat', sep=sep, compression=compression, **kwargs)
    # Context manager so the upload file handle is closed even if the request fails.
    with open('temporary.dat', 'rb') as datafile:
        r = http.post(URL, headers={'Authorization': 'Bearer {}'.format(TOKEN)},files={'datafile': datafile},data={'comment':comment, 'compression': compression})
    if r.status_code == 429:
        raise Exception('Submissions are too close. Next submission is only allowed in {} seconds.'.format(int(math.ceil(int(r.headers['x-rate-limit-remaining']) / 1000.0))))
    if r.status_code != 200:
        raise Exception(r.text)

In [87]:
submit_prediction(submission_df, sep=',', index=False, comment='my submission')