# Price Prediction for Holiday Listings.csv
## One copy of the code for Non-holiday Listings.csv

In [1]:
# For Analysis
import numpy as np
import pandas as pd

# For Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# For Calculations
from math import floor

#For Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression , Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
import os
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from catboost import CatBoostRegressor
from sklearn import preprocessing

# For Validation
from sklearn.metrics import mean_squared_error, accuracy_score

# For Storing Models
import pickle
%matplotlib inline

# For Warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Build the path from its components so it resolves on any OS. The original
# backslash literal contained the unrecognised escape sequences "\d" and "\l"
# and could only ever work on Windows.
holiday_listings = pd.read_csv(os.path.join("..", "dataset_filter", "listings_holiday.csv"))

# 1. Create Model Function: 
* train_set:test_set = 8:2
* Display R^2 and MSE for comparing different models

In [3]:
# Draw one boolean per row (True ~80% of the time) with a fixed seed so the
# train/test assignment is reproducible, then slice the frame with the mask.
np.random.seed(2018)
train = np.random.choice(
    [True, False],
    len(holiday_listings),
    replace=True,
    p=[0.8, 0.2],
)
listings_train = holiday_listings.iloc[train]
listings_test = holiday_listings.iloc[~train]

In [5]:
# list(listings_train)

In [None]:
def model_listing(regr,train_cols,target_col):
    """Fit ``regr`` on the train split and print train/test fit metrics.

    Reads the module-level ``listings_train`` / ``listings_test`` frames,
    fits a fresh ``regr()`` instance on the ``train_cols`` features, prints
    R^2, MSE and RMSE for both splits, prints a per-feature weight table
    (when the fitted model exposes one) and draws a regression plot of the
    test-set predictions against the true target values.

    Parameters
    ----------
    regr : callable
        Zero-argument factory (e.g. an estimator class such as
        ``LinearRegression``) returning an object with ``fit``, ``predict``
        and ``score`` methods.
    train_cols : list of str
        Names of the feature columns.
    target_col : str
        Name of the target column.

    Returns
    -------
    None
    """
    separator = " ------------------------------------------ "

    x_train = listings_train[train_cols].values
    x_test = listings_test[train_cols].values
    y_train = listings_train[target_col].values
    y_test = listings_test[target_col].values

    print("Shape of Train and Test data")
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print(separator)

    # Instantiate whichever regressor was requested and train it.
    rg = regr()
    rg.fit(x_train, y_train)

    # Same report for both splits; after the loop ``y_pred`` holds the
    # test-set predictions, which the plot below relies on.
    for label, x_part, y_part in (("Training Data", x_train, y_train),
                                  ("Test Data", x_test, y_test)):
        y_pred = rg.predict(x_part)
        mse = mean_squared_error(y_part, y_pred)
        print(label)
        print("R^2 value using score fn: %.3f" % rg.score(x_part, y_part))
        print("Mean Squared Error : %0.3f" % mse)
        print("Root Mean Squared Error : %0.3f" % (mse ** 0.5))
        print(separator)

    # Linear models expose ``coef_``; tree ensembles expose
    # ``feature_importances_``. Skip the table when neither is available
    # instead of failing with AttributeError.
    if hasattr(rg, "coef_"):
        weights, weight_label = np.ravel(rg.coef_), 'Coefficient'
    elif hasattr(rg, "feature_importances_"):
        weights, weight_label = rg.feature_importances_, 'Importance'
    else:
        weights, weight_label = None, None

    if weights is not None:
        weight_table = pd.DataFrame(list(zip(train_cols, weights)),
                                    columns=['Feature', weight_label])
        print(weight_table.sort_values(by=weight_label, ascending=False))
        print(separator)

    # Actual vs. predicted target on the test split. Current seaborn releases
    # require x/y to be passed by keyword.
    plt.figure(figsize=(10,3))
    sns.regplot(x=y_test, y=y_pred)
    plt.xlabel("Actual " + str(target_col))
    plt.ylabel("Predicted " + str(target_col))
    plt.title("Residuals for the model")

# Model 1: Linear Regression + Basic Features
## Basic Features are the ones analyzed in the EDA that have a high correlation with Price

In [None]:
# Feature columns for Model 1 (placeholder until the EDA picks them).
train_cols = [
    ## Need to wait for EDA part
    # TODO: while this list is empty the feature matrix built inside
    # model_listing has zero columns.

]

# Column holding the value to predict.
target_col = 'price_holiday'
model_listing(LinearRegression,train_cols,target_col)

# Model 2: Linear Regression + Basic Features + Amenities Features (TODO: confirm that linear regression can take Boolean features)
## Amenities are extracted using One-Hot Encoding

In [None]:
# Feature columns for Model 2 (basic + amenities); still a placeholder.
train_cols = [
    # TODO: fill in the basic and one-hot amenities column names.

]

# Column holding the value to predict.
target_col = 'price_holiday'

model_listing(LinearRegression,train_cols,target_col)

# Model 3: Linear Regression + Basic Features + Amenities Features + Seattle Score Features
## Seattle-Score Features are extracted from another dataset (TODO: name the .csv file)

In [None]:
# Feature columns for Model 3 (basic + amenities + Seattle score); placeholder.
# NOTE: the regularization sweep further down reuses this `train_cols`.
train_cols = [
    # TODO: fill in the basic, amenities and Seattle-score column names.

]

# Column holding the value to predict.
target_col = 'price_holiday'

model_listing(LinearRegression,train_cols,target_col)

# To reduce overfitting, regularization is applied

In [None]:
# Function to calculate regularized cost given alpha, mse and the model coefficients
def reg_cost(alpha, mse, coeffs, model = None):
    """Return ``mse`` plus the regularization penalty of the given model type.

    Parameters
    ----------
    alpha : float
        Regularization strength multiplying the penalty term.
    mse : float
        Unregularized error term.
    coeffs : array-like
        Model coefficients the penalty is computed from.
    model : str or None, optional
        ``"lasso"`` adds ``alpha * sum(|w|)`` (L1 penalty); ``"ridge"`` adds
        ``alpha * sum(w**2)`` (squared L2 penalty). Matching is
        case-insensitive. Any other value returns ``mse`` unchanged.

    Returns
    -------
    float
        The penalized cost.
    """
    penalty_kind = model.lower() if isinstance(model, str) else model

    if penalty_kind == "lasso":
        return mse + alpha * np.sum(np.abs(coeffs))
    if penalty_kind == "ridge":
        # Ridge penalizes the *squared* L2 norm; the plain norm used before
        # under-/over-states the penalty whenever ||w|| != 1.
        return mse + alpha * np.sum(np.square(coeffs))
    return mse

In [None]:
# Regularization strengths to sweep over.
alpha_levels = [0.001, 0.01, 0.1, 1, 10, 100]

# Uses whatever `train_cols` / `target_col` were set by the preceding model cell.
x_train = listings_train[train_cols].values
x_test = listings_test[train_cols].values
y_train = listings_train[target_col].values
y_test = listings_test[target_col].values

for alpha_level in alpha_levels:
    # %g keeps small alphas readable (0.001 used to be printed as "0.00").
    print("\n At alpha Level: %g " % alpha_level)

    lasso_lm = Lasso(alpha=alpha_level)

    # Fit the model on to the training data( Train the model ).
    lasso_lm.fit(x_train, y_train)

    # Predict on the held-out split and compute the error once for all reports.
    y_pred = lasso_lm.predict(x_test)
    test_mse = mean_squared_error(y_test, y_pred)

    print("Test Data")
    print("R^2 value using score fn: %.3f" % lasso_lm.score(x_test,y_test))
    print("Mean Squared Error : %0.3f" % test_mse)
    print("Root Mean Squared Error : %0.3f" % (test_mse ** 0.5))

    # Model complexity = bare L1 penalty (mse=0, alpha=1) via the user defined fn
    print("Model Complexity: %0.3f" % reg_cost(mse = 0, alpha = 1, coeffs= lasso_lm.coef_, model= "lasso"))

    # Regularized cost = test MSE + alpha * L1 penalty via the user defined fn
    print("Regularized Cost: %0.3f" % reg_cost(mse = test_mse, alpha = alpha_level, coeffs= lasso_lm.coef_, model= "lasso"))

# Linear Regression Finished

# Model 4: Random Forest Regressor
## Fit in all Columns

In [None]:
# Feature columns for the random forest (placeholder).
train_cols = [
    # TODO: list every feature column to feed the forest.
]

target_col = 'price_holiday'

x_train = listings_train[train_cols].values
x_test = listings_test[train_cols].values
y_train = listings_train[target_col].values
y_test = listings_test[target_col].values

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Create a random forest regressor. A fixed random_state (the same seed as the
# train/test split) makes the fitted forest identical on every re-run of this
# cell, not only on a fresh Run All.
clf = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=2018)

# Train the regressor
clf.fit(x_train, y_train)

# Plot variable importances for the top 10 predictors
importances = clf.feature_importances_
feat_names = train_cols
tree_result = pd.DataFrame({'feature': feat_names, 'importance': importances})
(
    tree_result
    .sort_values(by='importance', ascending=False)
    .head(10)
    .plot(x='feature', y='importance', kind='bar', color='blue')
);

In [None]:
# Evaluate the fitted forest on the training split.
y_pred = clf.predict(x_train)
train_mse = mean_squared_error(y_train, y_pred)

print("Training Data")
print("R^2 value using score fn: %.3f" % clf.score(x_train,y_train))
print("Mean Squared Error : %0.3f" % train_mse)
print("Root Mean Squared Error : %0.3f" % (train_mse ** 0.5))


print(" ------------------------------------------ ")

# Evaluate on the held-out split; `y_pred` now holds the test predictions,
# which the plot below uses.
y_pred = clf.predict(x_test)
test_mse = mean_squared_error(y_test, y_pred)

print("Test Data")
print("R^2 value using score fn: %.3f" % clf.score(x_test,y_test))
print("Mean Squared Error : %0.3f" % test_mse)
print("Root Mean Squared Error : %0.3f" % (test_mse ** 0.5))

print(" ----------------------------------- ")

# Actual vs. predicted target on the test split. Current seaborn releases
# require x/y to be passed by keyword.
fig = plt.figure(figsize=(10,3))

sns.regplot(x=y_test, y=y_pred)
plt.xlabel("Actual " + str(target_col))
plt.ylabel("Predicted " + str(target_col))
plt.title("Residuals for the model")

# Model 5: Catboost Regressor 
## Fill in all columns

In [None]:
# Partition the columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
categorical_data = [
    column
    for column in holiday_listings.columns
    if holiday_listings[column].dtype == "object"
]
numerical_data = [
    column
    for column in holiday_listings.columns
    if not holiday_listings[column].dtype == "object"
]

# CatBoost needs the positional indices of the categorical columns; those are
# prepared next.

In [None]:
def column_index(df, query_cols):
    """Return the positional index of each name in ``query_cols`` within ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column order defines the positions. Column labels are
        expected to be unique.
    query_cols : sequence of str
        Column names to look up.

    Returns
    -------
    numpy.ndarray
        Integer positions, in the same order as ``query_cols``.

    Raises
    ------
    KeyError
        If any requested name is not a column of ``df``.
    """
    # Look the names up in the frame that was actually passed in; the previous
    # version ignored `df` and read an undefined global instead.
    positions = df.columns.get_indexer(query_cols)

    # get_indexer marks unknown labels with -1; fail loudly rather than hand
    # back a bogus position.
    missing = [name for name, pos in zip(query_cols, positions) if pos < 0]
    if missing:
        raise KeyError("columns not found: %s" % missing)
    return positions

# NOTE(review): the original passed an undefined name `X`. The indices are now
# positions within holiday_listings.columns; they only line up with the matrix
# handed to CatBoost if that matrix uses the same columns in the same order —
# confirm against the feature list used for training.
categorical_feature_indices = column_index(holiday_listings, categorical_data)

In [None]:
# Bare expression: shows the computed indices as the cell output.
categorical_feature_indices

In [None]:
# Feature columns for CatBoost (placeholder).
train_cols = [
    # TODO: list every feature column to feed CatBoost.

]

target_col = 'price_holiday'
x_train = listings_train[train_cols].values
x_test = listings_test[train_cols].values
y_train = listings_train[target_col].values
y_test = listings_test[target_col].values

# CatBoost reads integer cat_features as column positions in the matrix passed
# to fit(), so the indices must be relative to `train_cols` (the columns of
# x_train), not to the full holiday_listings frame.
categorical_feature_indices = [
    position
    for position, column in enumerate(train_cols)
    if column in categorical_data
]

print("Shape of Train and Test data")
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(" ------------------------------------------ ")

In [2]:
# refinement
# Hyper-parameters gathered in one place so they are easy to scan and tune.
catboost_params = dict(
    iterations=700,
    learning_rate=0.01,
    depth=4,
    eval_metric='RMSE',
    random_seed=42,
    bagging_temperature=0.2,
    od_type='Iter',
    metric_period=75,
    od_wait=100,
)
model = CatBoostRegressor(**catboost_params)

# NOTE(review): the held-out test split is also the eval_set that drives
# use_best_model, so the test metrics reported later are presumably not fully
# independent of model selection — consider a separate validation split.
model.fit(
    x_train,
    y_train,
    eval_set=(x_test, y_test),
    cat_features=categorical_feature_indices,
    use_best_model=True,
    plot=True,
)

In [None]:
# Report R^2 and RMSE of the fitted CatBoost model on both splits. RMSE is
# computed from the model's own predictions, the same way as for the other
# models, so the numbers describe the model that was actually kept and stay
# comparable across models.
train_rmse = mean_squared_error(y_train, model.predict(x_train)) ** 0.5

print("Training Data")
print("R^2 value using score fn: %.3f" % model.score(x_train,y_train))
print("Root Mean Squared Error : %0.3f" % train_rmse)


print(" ------------------------------------------ ")

test_rmse = mean_squared_error(y_test, model.predict(x_test)) ** 0.5

print("Test Data")
print("R^2 value using score fn: %.3f" % model.score(x_test,y_test))
# The original line was missing a closing parenthesis (SyntaxError).
print("Root Mean Squared Error : %0.3f" % test_rmse)

print(" ----------------------------------- ")

In [None]:
# Pair each importance with the feature it belongs to. The model was fitted on
# the `train_cols` columns, so those are the matching labels;
# holiday_listings.columns also contains the target and would not line up.
fea_imp = pd.DataFrame({'imp': model.feature_importances_, 'col': train_cols})

# Ascending sort + last 30 rows = the 30 most important features, with the
# largest drawn at the top of the horizontal bar chart.
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 15), legend=False)
plt.title('CatBoost - Feature Importance')
plt.ylabel('Features')
plt.xlabel('Importance');

In [None]:
# Bare expression: shows the importance table built in the previous cell.
fea_imp

# Models Comparison: Select the best model by comparing R^2 and RMSE
*
*
*
*


# Model Results: Use Best Model to predict
*
*
*
*
