## CS249 Project - Personalize Expedia Hotel Searches - ICDM 2013

### 1. Data Loading

In [1]:
#load data from csv files 
import pandas as pd
import numpy as np
import csv as csv
training_data = pd.DataFrame.from_csv('data/train.csv').reset_index()
testing_data = pd.DataFrame.from_csv('data/test.csv').reset_index()

In [2]:
print(training_data.columns.values)
training_data.shape

['srch_id' 'date_time' 'site_id' 'visitor_location_country_id'
 'visitor_hist_starrating' 'visitor_hist_adr_usd' 'prop_country_id'
 'prop_id' 'prop_starrating' 'prop_review_score' 'prop_brand_bool'
 'prop_location_score1' 'prop_location_score2' 'prop_log_historical_price'
 'position' 'price_usd' 'promotion_flag' 'srch_destination_id'
 'srch_length_of_stay' 'srch_booking_window' 'srch_adults_count'
 'srch_children_count' 'srch_room_count' 'srch_saturday_night_bool'
 'srch_query_affinity_score' 'orig_destination_distance' 'random_bool'
 'comp1_rate' 'comp1_inv' 'comp1_rate_percent_diff' 'comp2_rate'
 'comp2_inv' 'comp2_rate_percent_diff' 'comp3_rate' 'comp3_inv'
 'comp3_rate_percent_diff' 'comp4_rate' 'comp4_inv'
 'comp4_rate_percent_diff' 'comp5_rate' 'comp5_inv'
 'comp5_rate_percent_diff' 'comp6_rate' 'comp6_inv'
 'comp6_rate_percent_diff' 'comp7_rate' 'comp7_inv'
 'comp7_rate_percent_diff' 'comp8_rate' 'comp8_inv'
 'comp8_rate_percent_diff' 'click_bool' 'gross_bookings_usd' 'booking_b

(9917530, 54)

In [3]:
print(testing_data.columns.values)
testing_data.shape

['srch_id' 'date_time' 'site_id' 'visitor_location_country_id'
 'visitor_hist_starrating' 'visitor_hist_adr_usd' 'prop_country_id'
 'prop_id' 'prop_starrating' 'prop_review_score' 'prop_brand_bool'
 'prop_location_score1' 'prop_location_score2' 'prop_log_historical_price'
 'price_usd' 'promotion_flag' 'srch_destination_id' 'srch_length_of_stay'
 'srch_booking_window' 'srch_adults_count' 'srch_children_count'
 'srch_room_count' 'srch_saturday_night_bool' 'srch_query_affinity_score'
 'orig_destination_distance' 'random_bool' 'comp1_rate' 'comp1_inv'
 'comp1_rate_percent_diff' 'comp2_rate' 'comp2_inv'
 'comp2_rate_percent_diff' 'comp3_rate' 'comp3_inv'
 'comp3_rate_percent_diff' 'comp4_rate' 'comp4_inv'
 'comp4_rate_percent_diff' 'comp5_rate' 'comp5_inv'
 'comp5_rate_percent_diff' 'comp6_rate' 'comp6_inv'
 'comp6_rate_percent_diff' 'comp7_rate' 'comp7_inv'
 'comp7_rate_percent_diff' 'comp8_rate' 'comp8_inv'
 'comp8_rate_percent_diff']


(6622629, 50)

<b>Training data have 'position', 'click_bool', 'gross_bookings_usd', 'booking_bool' 4 more features.</b>

### 2. Feature Engineering

In [4]:
# create target : click_score [0,1,2]
def generate_target(training_data):
    training_data['click_score'] = pd.Series(0, index=training_data.index)
    training_data.loc[training_data.click_bool == 1, 'click_score'] = 1
    training_data.loc[training_data.booking_bool == 1, 'click_score'] = 2
    #pd.Series.unique(training_data['click_score'])
    return training_data

In [5]:
# generate new features
def generate_features(frame):
    print('Generating features...')
    # feature 1, property price difference again its historical mean
    # need to take care of missing data of prop_log_historical_price, which is 0
    # use median number to replace that data before extracing this feature
    frame.loc[ (frame.prop_log_historical_price==0), 'prop_log_historical_price'] = np.nan
    median_plhp = frame.prop_log_historical_price.dropna().median() 
    frame.loc[ (frame.prop_log_historical_price.isnull()), 'prop_log_historical_price'] = median_plhp
    frame['f_prop_price_diff'] = frame['prop_log_historical_price'].map(np.exp) - frame['price_usd']

    # feature 2, the price difference of user historical mean value and the property price
    # need to tackle null value of visitor_hist_adr_usd
    # assign median value for this case
    median_vhau = frame.visitor_hist_adr_usd.dropna().median()
    frame.loc[(frame.visitor_hist_adr_usd.isnull()),'visitor_hist_adr_usd'] = median_vhau
    frame['f_user_price_diff'] = frame['visitor_hist_adr_usd'] - frame['price_usd']

    # feature 3, starrating difference between user historical mean starrating and property historical mean starrating
    # need to tackle null value of visitor_hist_starrating
    # assign mean value for this case
    mean_vhsr = frame['visitor_hist_starrating'].dropna().mean()
    frame.loc[(frame.visitor_hist_starrating.isnull()),'visitor_hist_starrating'] = mean_vhsr
    frame['f_starrating_diff'] = frame['visitor_hist_starrating'] - frame['prop_starrating']

    # feature 4, fee per person
    frame['f_per_fee'] = frame.price_usd * frame.srch_room_count / (frame.srch_adults_count + frame.srch_children_count)

    # feature 5, total fees
    frame['f_total_fee'] = frame.price_usd * frame.srch_room_count * frame.srch_length_of_stay

    # feature 6, overall price advantage indicator between expedia and competitors
    # the larger the more advantages against competitors
    frame['f_comp_rate'] = (frame.comp1_rate.fillna(0) + frame.comp2_rate.fillna(0) + \
                         frame.comp3_rate.fillna(0) + frame.comp4_rate.fillna(0) + \
                         frame.comp5_rate.fillna(0) + frame.comp6_rate.fillna(0) + \
                         frame.comp7_rate.fillna(0) + frame.comp8_rate.fillna(0)).astype(int)

    # feature 7, overall availability advantage indicator between expedia and competitors
    # the larger the more advantages against competitors
    frame['f_comp_inv'] = (frame.comp1_inv.fillna(0) + frame.comp2_inv.fillna(0) + \
                        frame.comp3_inv.fillna(0) + frame.comp4_inv.fillna(0) + \
                        frame.comp5_inv.fillna(0) + frame.comp6_inv.fillna(0) + \
                        frame.comp7_inv.fillna(0) + frame.comp8_inv.fillna(0)).astype(int)

    # feature 8, prop_location_score2 * srch_query_affinity_score
    median_pls2 = frame.prop_location_score2.dropna().median()
    mean_sqas = frame.srch_query_affinity_score.mean()
    
    
    ## try
    #frame.loc[(frame.prop_location_score2.isnull()),'prop_location_score2'] = median_pls2
    frame.loc[(frame.prop_location_score2.isnull()),'prop_location_score2'] = 0
    ## try
    frame.loc[(frame.srch_query_affinity_score.isnull()),'srch_query_affinity_score'] = -31
    #frame.loc[(frame.srch_query_affinity_score.isnull()),'srch_query_affinity_score'] = mean_sqas
    frame['f_score2ma'] = frame.prop_location_score2 * frame.srch_query_affinity_score

    # feature 9, score1 devide score2
    frame['f_score1d2'] = frame.prop_location_score2.map(lambda x : x + 0.0001) / frame.prop_location_score1.map(lambda x : x + 0.0001)

    # Other features we can use
    #  
    # --int64 typed features--use directly--
    # prop_id, prop_starrating, prop_brand_bool, promotion_flag, 
    # srch_booking_window, srch_saturday_night_bool, random_bool, srch_destination_id
    #  //generated features: f_comp_rate, f_comp_inv
    #
    # --float typed features--need normalization
    # price_usd, prop_location_score1, prop_location_score2, prop_review_score, orig_destination_distance
    # //generated features: f_prop_price_diff, f_user_price_diff, f_starrating_diff, f_per_fee, f_total_fee, f_score2ma, f_score1d2
    # 
    #normalize data in rage (-1,1)
    fnormalize = lambda x : (x - x.mean()) / (x.max() - x.min())

    frame['f_prop_location_score1'] = (frame[['prop_location_score1']].apply(fnormalize)*10//1.0).astype(int)

    frame['f_prop_location_score2'] = (frame[['prop_location_score2']].apply(fnormalize)*10//1.0).astype(int)
    #try
    frame.f_hotel_quality_1.fillna(0, inplace=True)
    frame.f_hotel_quality_2.fillna(0, inplace=True)
    frame['f_hotel_quality_1'] = (frame[['f_hotel_quality_1']].apply(fnormalize)*10/1.0)
    frame['f_hotel_quality_2'] = (frame[['f_hotel_quality_2']].apply(fnormalize)*10/1.0)
    
    
    median_prs = frame.prop_review_score.dropna().median()
    #try
    #frame.loc[(frame.prop_review_score.isnull()), 'prop_review_score'] = median_prs
    frame.loc[(frame.prop_review_score.isnull()), 'prop_review_score'] = 3.0
    frame.loc[frame.prop_review_score == 0, 'prop_review_score'] = 2.5
    frame['f_prop_review_score'] = (frame[['prop_review_score']].apply(fnormalize)*10//1.0).astype(int)

    median_odd = frame.orig_destination_distance.dropna().median()
    frame.loc[(frame.orig_destination_distance.isnull()),'orig_destination_distance'] = median_odd
    frame['f_orig_destination_distance'] = (frame[['orig_destination_distance']].apply(fnormalize)*10//1.0).astype(int)

    frame['ff_starrating_diff'] = (frame[['f_starrating_diff']].apply(fnormalize)*10//1.0).astype(int)

    frame['ff_score2ma'] = (frame[['f_score2ma']].apply(fnormalize)*10//1.0).astype(int)

    frame['ff_score1d2'] = (frame[['f_score1d2']].apply(fnormalize)*10//1.0).astype(int)

    # for any of the price-related feature, we have to pay spacial attention to the outliers before we normalize the data
    # that is, some prop has too high price which would dis-form our normalization
    # a way to handle those feature is to bin them with a uppper limit other than to normalize them

    CEILING = 1000
    BRACKET_SIZE = 50
    NUM_BRACKET = CEILING // BRACKET_SIZE

    frame['f_price_usd'] = (frame.price_usd//BRACKET_SIZE).clip_upper(NUM_BRACKET-1).astype(np.int)

    frame['ff_prop_price_diff'] = (frame.f_prop_price_diff//BRACKET_SIZE).clip_upper(NUM_BRACKET-1).clip_lower(-NUM_BRACKET).astype(np.int)

    frame['ff_user_price_diff'] = (frame.f_user_price_diff//BRACKET_SIZE).clip_upper(NUM_BRACKET-1).clip_lower(-NUM_BRACKET).astype(np.int)

    frame['ff_total_fee'] = (frame.f_total_fee//BRACKET_SIZE).clip_upper(NUM_BRACKET-1).astype(np.int)

    CEILING = 500
    BRACKET_SIZE = 25
    frame['ff_per_fee'] = (frame.f_per_fee//BRACKET_SIZE).clip_upper(NUM_BRACKET-1).astype(np.int)
    
    print('Finished generating features')
    return frame

In [6]:
#feature_names = list(training_data_local.columns)
# feature_names.remove("click_bool")
# feature_names.remove("booking_bool")
# feature_names.remove("gross_bookings_usd")
# feature_names.remove("date_time")
# feature_names.remove("position")

# feature_names = ['srch_id', 'prop_country_id', 'prop_id', 'prop_starrating', 'prop_brand_bool', \
#                  'promotion_flag', 'srch_booking_window', 'srch_saturday_night_bool',\
#                 'random_bool', 'srch_destination_id', 'f_comp_rate', 'f_comp_inv', \
#                 'f_prop_location_score1', 'f_prop_location_score2', 'f_prop_review_score', \
#                 'f_orig_destination_distance', 'ff_starrating_diff', 'ff_score2ma', \
#                 'ff_score1d2', 'f_price_usd', 'ff_prop_price_diff', 'ff_user_price_diff',
#                 'ff_total_fee']
feature_names = ['srch_id','prop_country_id', 'prop_id', 'prop_starrating', 'prop_brand_bool', \
                 'promotion_flag', 'srch_booking_window', 'srch_saturday_night_bool',\
                'random_bool', 'srch_destination_id', 'f_comp_rate', 'f_comp_inv', \
                'f_prop_location_score1', 'f_prop_location_score2', 'f_prop_review_score', \
                'f_orig_destination_distance', 'ff_starrating_diff', \
                'f_price_usd', 'ff_prop_price_diff', 'ff_user_price_diff',
                'ff_total_fee']

#feature_names_ori = ['srch_length_of_stay','srch_adults_count','srch_children_count', 'srch_room_count']
feature_names_ori = list(training_data.columns[:27])
feature_names_hotel_quality = ['f_hotel_quality_1','f_hotel_quality_2'] 
feature_names = feature_names + feature_names_ori + feature_names_hotel_quality


In [7]:
feature_names = list(set(feature_names))
feature_names.remove("date_time")
feature_names.remove("position")

In [8]:
print(len(feature_names))
feature_names

38


['ff_total_fee',
 'srch_id',
 'prop_review_score',
 'prop_country_id',
 'f_prop_location_score2',
 'prop_brand_bool',
 'site_id',
 'f_orig_destination_distance',
 'f_comp_rate',
 'srch_destination_id',
 'prop_id',
 'srch_saturday_night_bool',
 'visitor_hist_adr_usd',
 'ff_prop_price_diff',
 'prop_log_historical_price',
 'srch_room_count',
 'promotion_flag',
 'srch_length_of_stay',
 'random_bool',
 'srch_adults_count',
 'prop_starrating',
 'ff_user_price_diff',
 'price_usd',
 'orig_destination_distance',
 'srch_booking_window',
 'f_hotel_quality_2',
 'f_prop_review_score',
 'f_comp_inv',
 'f_prop_location_score1',
 'f_price_usd',
 'prop_location_score1',
 'ff_starrating_diff',
 'f_hotel_quality_1',
 'visitor_location_country_id',
 'srch_query_affinity_score',
 'prop_location_score2',
 'srch_children_count',
 'visitor_hist_starrating']

In [9]:
# # hotel quality feature : f_hotel_quality_1 and f_hotel_quality_2
# counter = 0
# for p_id in pd.Series.unique(training_data.prop_id):
#     hotel_quality_1 = training_data.loc[training_data.prop_id == p_id, 'click_bool'].sum()/training_data.loc[training_data.prop_id == p_id, 'srch_id'].count()
#     hotel_quality_2 = training_data.loc[training_data.prop_id == p_id, 'booking_bool'].sum()/training_data.loc[training_data.prop_id == p_id, 'srch_id'].count()
#     training_data.loc[training_data.prop_id == p_id,'f_hotel_quality_1'] = hotel_quality_1
#     training_data.loc[training_data.prop_id == p_id,'f_hotel_quality_2'] = hotel_quality_2
#     testing_data.loc[testing_data.prop_id == p_id,'f_hotel_quality_1'] = hotel_quality_1
#     testing_data.loc[testing_data.prop_id == p_id,'f_hotel_quality_2'] = hotel_quality_2
#     counter = counter + 1
#     if(counter % 10000 == 0):
#         print("counter: ",counter ,"pid: ",p_id)
# mean_quality_1 = testing_data.f_hotel_quality_1.dropna().mean()
# testing_data.loc[testing_data.f_hotel_quality_1.isnull(), 'f_hotel_quality_1'] = mean_quality_1
# mean_quality_2 = testing_data.f_hotel_quality_2.dropna().mean()
# testing_data.loc[testing_data.f_hotel_quality_2.isnull(), 'f_hotel_quality_2'] = mean_quality_2
count_map_book = {}
count_map_click = {}
count_map = {}
training_data['f_hotel_quality_1'] = np.nan
training_data['f_hotel_quality_2'] = np.nan
testing_data['f_hotel_quality_1'] = np.nan
testing_data['f_hotel_quality_2'] = np.nan
#first scan
counter = 0
for row in training_data.itertuples():
    key = row.prop_id
    if(key not in count_map_book and row.booking_bool == 1):
        count_map_book[key] = 1
    elif(row.booking_bool == 1):
        count_map_book[key] += 1
    if(key not in count_map_click and row.click_bool == 1):
        count_map_click[key] = 1
    elif(row.click_bool == 1):
        count_map_click[key] += 1  
    if(key not in count_map):
        count_map[key] = 1
    else:
        count_map[key] += 1  
    counter += 1
    if(counter % 1000000 == 0):
        print("counter = ",counter)
print('map constructed')            
# for i in range(training_data.shape[0]):
#     key = training_data.iloc[i].prop_id
#     if(key not in count_map_book):
#         count_map_book[key] = 1
#     elif(training_data.iloc[i].booking_bool == 1):
#         count_map_book[key] += 1
#     if(key not in count_map_click):
#         count_map_click[key] = 1
#     elif(training_data.iloc[i].click_bool == 1):
#         count_map_click[key] += 1  
#     if(key not in count_map):
#         count_map[key] = 1
#     else:
#         count_map[key] += 1  
#     counter += 1
#     if(counter % 100000 == 0):
#         print("counter = ",counter)
counter = 0
for row in training_data.itertuples():
    key = row.prop_id
    index = row[0]
    if(key in count_map_click and key in count_map):
        training_data.set_value(index, 'f_hotel_quality_1' , count_map_click[key]/count_map[key])
    if(key in count_map_book and key in count_map):   
        training_data.set_value(index, 'f_hotel_quality_2' , count_map_book[key]/count_map[key])
    counter += 1
    if(counter % 1000000 == 0):
        print("counter = ",counter)
print('training data hotel quality feature created')
# for i in range(training_data.shape[0]):
#     key = training_data.iloc[i].prop_id
#     training_data.set_value(i, 'f_hotel_quality_1' , count_map_click[key]/count_map[key])
#     training_data.set_value(i, 'f_hotel_quality_2' , count_map_book[key]/count_map[key])
#     counter += 1
#     if(counter % 100000 == 0):
#         print("counter = ",counter)
# print('training data hotel quality feature created')
counter = 0
for row in testing_data.itertuples():
    key = row.prop_id
    index = row[0]
    if(key in count_map_click and key in count_map):
        testing_data.set_value(index, 'f_hotel_quality_1' , count_map_click[key]/count_map[key])
    if(key in count_map_book and key in count_map):   
        testing_data.set_value(index, 'f_hotel_quality_2' , count_map_book[key]/count_map[key])
    counter += 1
    if(counter % 1000000 == 0):
        print("counter = ",counter)
print('testing data hotel quality feature created')
# for i in range(testing_data.shape[0]):
#     key = testing_data.iloc[i].prop_id
#     testing_data.set_value(i, 'f_hotel_quality_1' , count_map_click[key]/count_map[key])
#     testing_data.set_value(i, 'f_hotel_quality_2' , count_map_book[key]/count_map[key])
#     counter += 1
#     if(counter % 100000 == 0):
#         print("counter = ",counter)
# print('testing data hotel quality feature created')


counter =  1000000
counter =  2000000
counter =  3000000
counter =  4000000
counter =  5000000
counter =  6000000
counter =  7000000
counter =  8000000
counter =  9000000
map constructed
counter =  1000000
counter =  2000000
counter =  3000000
counter =  4000000
counter =  5000000
counter =  6000000
counter =  7000000
counter =  8000000
counter =  9000000
training data hotel quality feature created
counter =  1000000
counter =  2000000
counter =  3000000
counter =  4000000
counter =  5000000
counter =  6000000
testing data hotel quality feature created


In [10]:
# # local testing starts here
# from sklearn.cross_validation import train_test_split
# training_data_local, testing_data_local = train_test_split(training_data, test_size = 0.1)
# training_data_local = generate_features(training_data_local)
# training_data_local = generate_target(training_data_local)
# testing_data_local = generate_features(testing_data_local)
# # only available for local dataset
# testing_data_local = generate_target(testing_data_local)
# For unknown testing dataset
testing_data = generate_features(testing_data)
training_data = generate_features(training_data)
training_data = generate_target(training_data)

Generating features...
Finished generating features
Generating features...
Finished generating features


In [11]:
# split data by country
# from sklearn.ensemble import RandomForestClassifier
# country_list = list(training_data.prop_country_id.unique())
# country_forest_dict = {country : RandomForestClassifier(n_estimators=100, n_jobs=-1) for country in country_list}
# country_train_dict = {country : training_data_local[training_data_local.prop_country_id == country] for country in country_list}
# country_test_local_dict = {country : testing_data_local[testing_data_local.prop_country_id == country] for country in country_list}
# country_test_dict = {country : testing_data[testing_data.prop_country_id == country] for country in country_list}

In [12]:
# country data balancing 
# balanced_country_train_dict = {country : (data[data.click_score==0].sample(len(data[data.click_score!=0])+1)).append(data[data.click_score!=0]) for country, data in country_train_dict.items()} 
# whole data balancing
# balanced_training_data_local = training_data_local[training_data_local.click_score==0].sample(len(training_data_local[training_data_local.click_score!=0])+1).append(training_data_local[training_data_local.click_score!=0]) 
# whole global data balancing
balanced_training_data = training_data[training_data.click_score==0].sample(len(training_data[training_data.click_score!=0])+1).append(training_data[training_data.click_score!=0]) 

In [13]:
# # local dataset feature and target extraction
# training_features_local = balanced_training_data_local.get(feature_names)
# testing_features_local = testing_data_local.get(feature_names)
# training_target_local = balanced_training_data_local.get(['click_score'])
# testing_target_local = testing_data_local.get(['click_score'])
# whole dataset feature and target extraction
training_features = balanced_training_data.get(feature_names)
testing_features = testing_data.get(feature_names)
training_target = balanced_training_data.get(['click_score'])

In [None]:
#training_features.isnull().sum()

### 3. RandomForest Classifier

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
clf = RandomForestClassifier(n_estimators=300, 
                                        verbose=1,
                                        n_jobs=4,
                                        min_samples_split=5,
                                        random_state=1)
# get lower score 0.39299 =.= gao mao
# params = {
#     'min_samples_split': [2,5,10],
#     'max_depth': [3,4,5]
# }
# clf = GridSearchCV(clf, params)

# can not use bulit-in cv method for this large dataset(entire dataset)
#scores = cross_val_score(clf, training_features, training_target.values, scoring = 'accuracy', cv=10)
#print(scores.min(), scores.mean(), scores.max())
#clf.fit(training_features_local, training_target_local.values.ravel())
clf.fit(training_features, training_target.values.ravel())
predict_values = clf.predict(testing_features)
print(feature_names)


In [None]:
predict_values_proba = clf.predict_proba(testing_features)

In [None]:
print(predict_values[1:10])
print(predict_values_proba[1:10])
predict_final = 4 * predict_values_proba[:,2] + predict_values_proba[:,1]
predict_final[1:10]

In [None]:
# sort and write to file
recommendations = zip(testing_features.srch_id.values, testing_features.prop_id.values, predict_final*(-1))
from operator import itemgetter
rows = [(srch_id, prop_id) for srch_id, prop_id, output in sorted(recommendations, key=itemgetter(0,2))]
print('write to csv')
writer = csv.writer(open('result_RF_proba.csv', "w"), lineterminator="\n")
writer.writerow(("SearchId", "PropertyId"))
writer.writerows(rows)
print('Finish writing')

In [None]:
print(feature_names)
clf.feature_importances_

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline   

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names_arr = np.array(feature_names)

plt.figure()
plt.title("Feature importances")
plt.bar(range(len(feature_names)), importances[indices])
plt.xticks(range(len(feature_names)), feature_names_arr[indices], rotation='vertical')
plt.xlim([-1, len(feature_names)])
plt.show()

In [None]:
#feature_names = feature_names_arr[indices][:20]

### 4. LogisticRegression

In [55]:
%%time 
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
clf_log = LogisticRegression(           verbose=2,
                                        n_jobs=4,
                                        random_state=1,
                                        solver = 'sag'
                            )
params = {
    'C': [1.0]
}

clf_log = GridSearchCV(clf_log, params, n_jobs=4)


#clf_log.fit(training_features_local, training_target_local.values.ravel())
clf_log.fit(training_features, training_target.values.ravel())
#predict_values_log = clf_log.predict(testing_features)
predict_values_log_proba = clf_log.predict_proba(testing_features)
predict_log_final = 4 * predict_values_log_proba[:,2] + predict_values_log_proba[:,1]

[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:   57.2s finished
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:   59.7s finished
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:   59.6s finished
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:   59.5s finished


max_iter reached after 54 seconds
max_iter reached after 54 seconds
max_iter reached after 57 seconds
CPU times: user 2min 50s, sys: 2.36 s, total: 2min 52smax_iter reached after 57 seconds
max_iter reached after 57 seconds
max_iter reached after 57 seconds
max_iter reached after 58 secondsmax_iter reached after 58 secondsmax_iter reached after 58 seconds


max_iter reached after 60 secondsmax_iter reached after 59 secondsmax_iter reached after 59 seconds



Wall time: 2min 6s


In [56]:
# sort and write to file
recommendations = zip(testing_features.srch_id.values, testing_features.prop_id.values, predict_log_final*(-1))
from operator import itemgetter
rows = [(srch_id, prop_id) for srch_id, prop_id, output in sorted(recommendations, key=itemgetter(0,2))]
print('write to csv')
writer = csv.writer(open('result_LR_proba.csv', "w"), lineterminator="\n")
writer.writerow(("SearchId", "PropertyId"))
writer.writerows(rows)
print('Finish writing')

write to csv
Finish writing


In [None]:
#predict_values_proba = clf.predict_proba(testing_features)
#predict_values_log_proba = clf_log.predict_proba(testing_features)
#print(clf.score(testing_features_local, testing_target_local.values.ravel(),sample_weight=None))
#print(clf_log.score(testing_features_local, testing_target_local.values.ravel(),sample_weight=None))

### 5. Gradient Boosting

In [53]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

clf_GB = GradientBoostingClassifier(n_estimators=1000, 
                                        verbose=0,
                                        #min_samples_split=10,
                                        random_state=1)
# params = {
#     'min_samples_split': [2,5,10],
#     'max_depth': [3,4,5]
# }

# clf_GB = GridSearchCV(clf_GB, params)
clf_GB.fit(training_features, training_target.values.ravel())
#predict_values_GB = clf_GB.predict(testing_features)
predict_values_GB_proba = clf_GB.predict_proba(testing_features)
predict_GB_final = 4 * predict_values_GB_proba[:,2] + predict_values_GB_proba[:,1]

In [54]:
# sort and write to file
recommendations = zip(testing_features.srch_id.values, testing_features.prop_id.values, predict_GB_final*(-1))
from operator import itemgetter
rows = [(srch_id, prop_id) for srch_id, prop_id, output in sorted(recommendations, key=itemgetter(0,2))]
print('write to csv')
writer = csv.writer(open('result_GB_proba.csv', "w"), lineterminator="\n")
writer.writerow(("SearchId", "PropertyId"))
writer.writerows(rows)
print('Finish writing')

write to csv
Finish writing


### 6. Extreme Randomized Trees

In [15]:
%%time
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
clf_ERF = ExtraTreesClassifier(n_estimators=200, 
                                        verbose=1,
                                        n_jobs=4,
                                        min_samples_split=2,
                                        random_state=1)

clf_ERF.fit(training_features, training_target.values.ravel())
predict_values = clf_ERF.predict(testing_features)
predict_values_ERF_proba = clf_ERF.predict_proba(testing_features)
predict_ERF_final = 4 * predict_values_ERF_proba[:,2] + predict_values_ERF_proba[:,1]

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   39.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   57.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  4.5min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   57.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  4.5min finished


CPU times: user 43min 49s, sys: 3min 46s, total: 47min 36s
Wall time: 14min 9s


In [16]:
# sort and write to file
recommendations = zip(testing_features.srch_id.values, testing_features.prop_id.values, predict_ERF_final*(-1))
from operator import itemgetter
rows = [(srch_id, prop_id) for srch_id, prop_id, output in sorted(recommendations, key=itemgetter(0,2))]
print('write to csv')
writer = csv.writer(open('result_ERF_proba.csv', "w"), lineterminator="\n")
writer.writerow(("SearchId", "PropertyId"))
writer.writerows(rows)
print('Finish writing')

write to csv
Finish writing


### 7. NDCG matrix

In [None]:
"""Information Retrieval metrics

Useful Resources:
http://www.cs.utexas.edu/~mooney/ir-course/slides/Evaluation.ppt
http://www.nii.ac.jp/TechReports/05-014E.pdf
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
http://hal.archives-ouvertes.fr/docs/00/72/67/60/PDF/07-busa-fekete.pdf
Learning to Rank for Information Retrieval (Tie-Yan Liu)
"""
import numpy as np


def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item

    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).

    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def r_precision(r):
    """Score is precision after all relevant documents have been retrieved

    Relevance is binary (nonzero is relevant).

    >>> r = [0, 0, 1]
    >>> r_precision(r)
    0.33333333333333331
    >>> r = [0, 1, 0]
    >>> r_precision(r)
    0.5
    >>> r = [1, 0, 0]
    >>> r_precision(r)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        R Precision
    """
    r = np.asarray(r) != 0
    z = r.nonzero()[0]
    if not z.size:
        return 0.
    return np.mean(r[:z[-1] + 1])


def precision_at_k(r, k):
    """Score is precision @ k

    Relevance is binary (nonzero is relevant).

    >>> r = [0, 0, 1]
    >>> precision_at_k(r, 1)
    0.0
    >>> precision_at_k(r, 2)
    0.0
    >>> precision_at_k(r, 3)
    0.33333333333333331
    >>> precision_at_k(r, 4)
    Traceback (most recent call last):
        File "<stdin>", line 1, in ?
    ValueError: Relevance score length < k


    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Precision @ k

    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)


def average_precision(r):
    """Score is average precision (area under PR curve)

    Relevance is binary (nonzero is relevant).

    >>> r = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1]
    >>> delta_r = 1. / sum(r)
    >>> sum([sum(r[:x + 1]) / (x + 1.) * delta_r for x, y in enumerate(r) if y])
    0.7833333333333333
    >>> average_precision(r)
    0.78333333333333333

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Average precision
    """
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)


def mean_average_precision(rs):
    """Score is mean average precision

    Relevance is binary (nonzero is relevant).

    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]]
    >>> mean_average_precision(rs)
    0.78333333333333333
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0]]
    >>> mean_average_precision(rs)
    0.39166666666666666

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Mean average precision
    """
    return np.mean([average_precision(r) for r in rs])


def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)

    Relevance is positive real values.  Can use binary
    as the previous methods.

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)

    Relevance is positive real values.  Can use binary
    as the previous methods.

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max


if __name__ == "__main__":
    import doctest
    doctest.testmod()

In [None]:
predict_values_proba