# Ridge regression (with normalized input)

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the full dataset and report how many rows and distinct search ids it holds.
# NOTE(review): the name `all` shadows the Python builtin; it is kept here because
# later cells refer to the dataframe by this name.
all = pd.read_csv('../expedia_data/all_data.csv')
n_records = len(all)
print('Dataset has %d record'%n_records)
n_search_ids = len(all['srch_id'].unique().tolist())
print('Dataset has %d search_id (accounts)'%n_search_ids)

# Feature columns: every column except the regression target (`price_usd`)
# and the `date_time` column.
excluded_columns = ['price_usd','date_time']
variables = [col for col in all.columns.unique().tolist() if col not in excluded_columns]

Dataset has 908750 record
Dataset has 36518 search_id (accounts)


In [2]:
# get sampled data
def get_sampled_data(size, data):
    '''
    Return a systematic (evenly spaced) sample of `size` rows from `data`.

    The rows are divided into `size` consecutive intervals of length
    len(data) // size, and the row at the midpoint of each interval is selected.

    Parameters
    ----------
    size : int
        Number of rows to sample; must satisfy 1 <= size <= len(data).
    data : pandas.DataFrame
        Dataset to sample from; rows are selected by position via `.iloc`.

    Returns
    -------
    pandas.DataFrame
        The `size` selected rows, in their original order.

    Raises
    ------
    ValueError
        If `size` is smaller than 1 or larger than len(data). Without this
        check a too-large `size` yields a zero-length interval and the first
        row would be returned `size` times.
    '''
    if not 1 <= size <= len(data):
        raise ValueError(
            'size must be between 1 and len(data) (%d), got %r' % (len(data), size))

    interval_range = len(data) // size
    # Midpoint of the i-th interval [interval_range * i, interval_range * (i + 1)).
    mid_idx_lst = [(interval_range * (2 * i + 1)) // 2 for i in range(size)]
    return data.iloc[mid_idx_lst]
    
# Work on 5000 evenly spaced rows of the full dataset instead of all records.
sampled = get_sampled_data(5000, all)

In [6]:
### remove the `date_time` column (it is not part of `variables`)
sampled = sampled.drop('date_time', axis=1)
sampled.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,new_comp5_rate_percent_diff,new_comp6_rate,new_comp6_inv,new_comp6_rate_percent_diff,new_comp7_rate,new_comp7_inv,new_comp7_rate_percent_diff,new_comp8_rate,new_comp8_inv,new_comp8_rate_percent_diff
90,7086,24,216,4.5,745.42,220,131958,4,4.0,1,...,0,0,0,0,0,0,0,0,0,0
271,307902,24,216,,,164,138356,4,4.5,0,...,0,0,0,0,0,0,0,0,0,0
452,349474,24,216,3.28,107.96,215,50818,4,4.0,1,...,0,0,0,0,1,1,0,0,0,0
633,26168,32,220,,,219,91206,2,3.5,1,...,0,0,0,0,0,0,0,1,1,0
814,435038,24,216,,,196,99532,4,3.0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
### impute missing values with the per-column median
# NOTE(review): the medians are computed on the whole sample, i.e. before the
# train/validation/test split below, so validation and test rows influence the
# imputed values.
column_medians = sampled.median()
sampled = sampled.fillna(column_medians)
sampled.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,new_comp5_rate_percent_diff,new_comp6_rate,new_comp6_inv,new_comp6_rate_percent_diff,new_comp7_rate,new_comp7_inv,new_comp7_rate_percent_diff,new_comp8_rate,new_comp8_inv,new_comp8_rate_percent_diff
90,7086,24,216,4.5,745.42,220,131958,4,4.0,1,...,0,0,0,0,0,0,0,0,0,0
271,307902,24,216,3.275,138.34,164,138356,4,4.5,0,...,0,0,0,0,0,0,0,0,0,0
452,349474,24,216,3.28,107.96,215,50818,4,4.0,1,...,0,0,0,0,1,1,0,0,0,0
633,26168,32,220,3.275,138.34,219,91206,2,3.5,1,...,0,0,0,0,0,0,0,1,1,0
814,435038,24,216,3.275,138.34,196,99532,4,3.0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def split_data(data):
    '''
    Split `data` into consecutive training, validation and test partitions.

    The first 80% of the rows form the train+validation portion; the last 20%
    of that portion becomes the validation set. All remaining rows form the
    test set, so every row lands in exactly one partition (sizing the test set
    with a separate truncation could silently drop rows at the end).
    Rows are taken in their existing order; nothing is shuffled.

    Parameters
    ----------
    data : sliceable sequence (e.g. pandas.DataFrame)
        Must support `len()` and positional slicing `data[a:b]`.

    Returns
    -------
    tuple
        (training_data, validation_data, test_data)

    Side effects
    ------------
    Prints the three partition sizes.
    '''
    training_size_large = int(len(data) * 0.8)
    validation_size = int(training_size_large * 0.2)
    training_size = training_size_large - validation_size
    # The test set is whatever is left after the train+validation portion.
    test_size = len(data) - training_size_large

    print('training size: %d'%training_size)
    print('validation size: %d'%validation_size)
    print('test size: %d'%test_size)

    # split data manually, by position
    validation_end = training_size + validation_size
    training_data = data[0:training_size]
    validation_data = data[training_size:validation_end]
    test_data = data[validation_end:]

    return training_data, validation_data, test_data
    
training, validation, test = split_data(sampled)

# For each partition, pull out the feature matrix (columns listed in `variables`)
# and the target vector (`price_usd`) as NumPy arrays.
X_train, y_train = training[variables].values, training['price_usd'].values
X_val, y_val = validation[variables].values, validation['price_usd'].values
X_test, y_test = test[variables].values, test['price_usd'].values

training size: 3200
validation size: 800
test size: 1000


In [24]:
# Rescale all three feature matrices with scikit-learn's Normalizer before fitting Ridge.
# NOTE(review): Normalizer rescales each *row* (sample) to unit norm and learns
# nothing in fit(); if per-feature scaling learned from the training set was
# intended, StandardScaler is the usual choice — confirm before relying on this.
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
X_train = normalizer.fit_transform(X_train)
X_val, X_test = normalizer.transform(X_val), normalizer.transform(X_test)

In [25]:
### fit Ridge regression
# Ridge is not imported by any earlier cell, so import it here; without this
# the cell fails with a NameError on a fresh kernel.
from sklearn.linear_model import Ridge

# Fit with scikit-learn's default Ridge settings on the training partition,
# then score on the validation partition.
model_ridge = Ridge()
model_ridge.fit(X_train,y_train)
y_pred = model_ridge.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
# %d truncates the RMSE to an integer for display.
print('validation RMSE: %d '%rmse_val)

validation RMSE: 189 


In [28]:
# Score the fitted model on the rows it was trained on, for comparison with the
# validation RMSE.
# NOTE(review): the recorded training RMSE is far larger than the validation
# RMSE, which is unusual; worth checking the training rows for extreme
# price_usd values.
y_pred = model_ridge.predict(X_train)
mse_train = mean_squared_error(y_train, y_pred)
rmse_train = np.sqrt(mse_train)
print('training RMSE: %d'%rmse_train)

training RMSE: 5989
