In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
import ml_metrics as metrics

In [6]:
# Column -> dtype mapping used when reading data/train.csv. Built per dtype
# group: string columns first, then the 32-bit integer counts, then the two
# remaining one-off columns.
dtype = dict.fromkeys(
    ['srch_ci', 'srch_co', 'srch_destination_id',
     'user_location_country', 'user_location_region', 'user_location_city',
     'hotel_cluster', 'date_time', 'hotel_market'],
    np.str_)
dtype.update(dict.fromkeys(
    ['srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt'],
    np.int32))
dtype.update({'is_booking': bool,
              'orig_destination_distance': np.float64})

In [7]:
# Read only the columns named in `dtype` from the training CSV. `date_time`
# is parsed to datetimes so the `.dt` accessor can be used on it below.
df0 = pd.read_csv('data/train.csv', dtype=dtype, usecols=list(dtype),
                  parse_dates=['date_time'], sep=',')
df0['year'] = df0['date_time'].dt.year
# Keep only booking rows whose date_time falls in 2014. The explicit .copy()
# makes `train` an independent frame: later cells assign new columns to it and
# call fillna/drop in place, which on a bare query() slice of df0 triggers
# pandas' SettingWithCopyWarning.
train = df0.query('is_booking==True & year==2014').copy()

In [8]:
# Drop the name for the full training frame; the cells below only use `train`.
del df0

# Feature Engineering 

In [9]:
# Convert the two srch_* date strings to datetimes; values that cannot be
# parsed become NaT because of errors='coerce'.
for date_col in ('srch_ci', 'srch_co'):
    train[date_col] = pd.to_datetime(train[date_col],
                                     infer_datetime_format=True,
                                     errors='coerce')

# Dividing a timedelta column by one day yields a (fractional) number of days.
one_day = np.timedelta64(1, 'D')

train['month'] = train['date_time'].dt.month
# Days from date_time to srch_ci.
train['plan_time'] = ((train['srch_ci'] - train['date_time']) / one_day).astype(float)
# Days from srch_ci to srch_co.
train['hotel_nights'] = ((train['srch_co'] - train['srch_ci']) / one_day).astype(float)

In [10]:
# Mean distance over the training rows; `m` is kept as a notebook-level name
# because the test-set cell fills its missing distances with it as well.
m = train['orig_destination_distance'].mean()
lst_drop = ['date_time', 'srch_ci', 'srch_co']
train = (
    train
    .fillna({'orig_destination_distance': m})  # missing distance -> mean
    .fillna(-1)                                # every other gap -> -1
    .drop(lst_drop, axis=1)                    # raw date columns no longer needed
)

In [11]:
# Target: the hotel_cluster column.
y = train.loc[:, 'hotel_cluster']
# Features: everything except the target and the two columns that were only
# used to filter the rows (is_booking, year).
non_feature_cols = ['hotel_cluster', 'is_booking', 'year']
X = train.drop(non_feature_cols, axis=1)

In [12]:
# X and y have been extracted; drop the name for the combined training frame.
del train

# Prepare the test data and train the model with all training data

In [13]:
# Column -> dtype mapping used when reading data/test.csv. Built per dtype
# group: string columns, then the 32-bit integer counts, then the float
# distance column.
dtype1 = dict.fromkeys(
    ['srch_ci', 'srch_co', 'srch_destination_id',
     'user_location_country', 'user_location_region', 'user_location_city',
     'date_time', 'hotel_market'],
    np.str_)
dtype1.update(dict.fromkeys(
    ['srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt'],
    np.int32))
dtype1['orig_destination_distance'] = np.float64

In [14]:
# Read only the columns named in `dtype1` from the test CSV; `date_time` is
# parsed to datetimes so the `.dt` accessor can be used on it below.
test1 = pd.read_csv('data/test.csv', dtype=dtype1, usecols=list(dtype1),
                    parse_dates=['date_time'], sep=',')
# Same date handling as for the training frame: unparseable values become NaT.
test1['srch_ci'] = pd.to_datetime(test1['srch_ci'], infer_datetime_format=True, errors='coerce')
test1['srch_co'] = pd.to_datetime(test1['srch_co'], infer_datetime_format=True, errors='coerce')

test1['month'] = test1['date_time'].dt.month
test1['plan_time'] = ((test1['srch_ci'] - test1['date_time']) / np.timedelta64(1, 'D')).astype(float)
test1['hotel_nights'] = ((test1['srch_co'] - test1['srch_ci']) / np.timedelta64(1, 'D')).astype(float)

# Missing distances are filled with `m`, the mean computed on the TRAINING
# rows in the earlier imputation cell, so the test features are imputed with
# the same value the model saw during fitting. A separate test-set mean is
# deliberately not computed: it was never used.
test1['orig_destination_distance'] = test1.orig_destination_distance.fillna(m)
test1.fillna(-1, inplace=True)

In [15]:
# Remove the raw date columns from the test features, as was done for the
# training frame.
lst_drop = ['date_time', 'srch_ci', 'srch_co']
test1 = test1.drop(lst_drop, axis=1)

# Fixed hyper-parameters and seed for the forest fit on the full X / y.
rf_params = {'n_estimators': 31, 'max_depth': 10, 'random_state': 123}
rf_all = RandomForestClassifier(**rf_params)
rf_all.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=31, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [16]:
# The model is fitted; drop the names for the training features and target.
del X,y

# Predict on all test data

In [17]:
def predict_test_file(trained_model, df_feature_engineered_test_data,
                      batch_size=450000, top_k=5,
                      sample_submission_path='data/sample_submission.csv',
                      output_path='file_to_submit.csv'):
    """Predict the ``top_k`` most probable classes per row and write a submission CSV.

    The test frame is scored in consecutive row batches of ``batch_size`` so
    that only one batch of class probabilities is held in memory at a time.
    For every row the ``top_k`` classes with the highest predicted
    probability are kept, ordered from MOST to LEAST probable, translated to
    their labels via ``trained_model.classes_`` and joined with single
    spaces. The resulting strings overwrite the ``hotel_cluster`` column of
    the sample submission file, which is then saved to ``output_path``.

    Parameters
    ----------
    trained_model : object
        Fitted classifier exposing ``predict_proba(X)`` and ``classes_``.
    df_feature_engineered_test_data : pandas.DataFrame
        Feature rows to score; must support ``len()`` and ``.iloc`` slicing.
    batch_size : int, optional
        Number of rows passed to ``predict_proba`` per call (default 450000).
    top_k : int, optional
        Number of classes to keep per row (default 5).
    sample_submission_path : str, optional
        CSV whose first column is used as the index of the submission.
    output_path : str, optional
        Destination of the generated submission CSV.

    Raises
    ------
    ValueError
        If ``batch_size`` or ``top_k`` is smaller than 1, or if the number of
        predicted rows differs from the number of rows in the sample
        submission file.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if top_k < 1:
        raise ValueError('top_k must be a positive integer')

    n_rows = len(df_feature_engineered_test_data)
    # Ceiling division: every row is covered and no empty batch is scored.
    n_batches = (n_rows + batch_size - 1) // batch_size

    lst_y_predicted = []
    for i in range(n_batches):
        print('batch {} /{}'.format(i + 1, n_batches))
        batch = df_feature_engineered_test_data.iloc[i * batch_size:(i + 1) * batch_size, :]
        y_pred = trained_model.predict_proba(batch)
        # argsort sorts ascending, so reverse the columns BEFORE truncating:
        # column 0 then holds the most probable class. Rank-sensitive scores
        # such as MAP@k reward putting the best guess first.
        top_k_for_this_batch = np.argsort(y_pred, axis=1)[:, ::-1][:, :top_k]
        lst_y_predicted.append(top_k_for_this_batch)

    print('getting cluster names...')
    # Column j of predict_proba corresponds to classes_[j].
    cluster_names = np.asarray(trained_model.classes_)

    print('translating to hotel clusters...')
    if lst_y_predicted:
        top_indices = np.vstack(lst_y_predicted)
        # Fancy indexing maps the whole index matrix to labels at once.
        predict_class = [' '.join(map(str, row)) for row in cluster_names[top_indices]]
    else:
        predict_class = []

    print('creating submit file...')
    # read_csv(index_col=0) replaces the deprecated DataFrame.from_csv.
    df_submission_sample = pd.read_csv(sample_submission_path, index_col=0)
    if len(df_submission_sample) != len(predict_class):
        raise ValueError(
            'sample submission has {} rows but {} predictions were produced'.format(
                len(df_submission_sample), len(predict_class)))
    df_submission_sample['hotel_cluster'] = predict_class
    df_submission_sample.to_csv(output_path)
    print('job done. {} ready in your expedia folder.'.format(output_path))

### Generate the submission file

In [18]:
# Score the prepared test frame with the forest fitted on the full training
# set; the function writes the result to file_to_submit.csv.
predict_test_file(rf_all, test1)

batch 1 /7
batch 2 /7
batch 3 /7
batch 4 /7
batch 5 /7
batch 6 /7
getting cluster names...
translating to hotel clusters...
creating submit file...
job done. file_to_submit.csv ready in your expedia folder.
