Predicting Customer Review Scores for E-commerce Orders

In [10]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import data_cleaning as dc
from sklearn import metrics
%matplotlib inline

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.model_selection import train_test_split

%matplotlib inline
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
cont_features = []

for colname, coltype in df.dtypes.iteritems():
    if coltype in [np.float64, np.int64]:
        cont_features.append(colname)

cat_features = []
for colname, coltype in df.dtypes.iteritems():
    if coltype in [np.object]:
        cat_features.append(colname)

In [8]:
df = dc.products_df()
df_cat = dc.cat_df()
df_con = dc.con_df()

In [9]:
# Build the design matrix X and the target y, then hold out a test split.
features = [col for col in df.columns if col != 'review_score']

# reindex instead of .loc: a label missing from df_con becomes an all-NaN
# column (the pre-1.0 pandas behaviour of .loc) rather than a KeyError.
# NOTE(review): X comes from df_con while y comes from df; this assumes both
# frames share the same rows in the same order -- confirm in data_cleaning.
X = df_con.reindex(columns=features)
y = df.loc[:, 'review_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

# Continuous subset of the training split (no test subset is built yet).
# cont_features is derived from df's dtypes, so it includes the numeric target
# 'review_score', which X deliberately excludes; keep only columns X_train has.
X_train_cont = X_train.loc[:, [col for col in cont_features if col in X_train.columns]]

In [10]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Fill missing continuous values with each column's training median.
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed in
# scikit-learn 0.22; with strategy='median' it behaves the same, including
# dropping columns that contain no observed values at fit time.
impute = SimpleImputer(strategy='median')
X_train_imputed = impute.fit_transform(X_train_cont)

# Baseline: ordinary least squares on the imputed continuous features.
# Only training metrics are reported here.
linreg = LinearRegression()
linreg.fit(X_train_imputed, y_train)

print('Training r^2:', linreg.score(X_train_imputed, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg.predict(X_train_imputed)))



Training r^2: 0.017510362913710775
Training MSE: 1.8097080914181378


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardize the imputed training features; the fitted scaler `ss` is kept
# so the same transformation can be applied to other data later.
ss = StandardScaler()
X_train_imputed_scaled = ss.fit_transform(X_train_imputed)

In [12]:
# Refit ordinary least squares on the standardized features.
# Rescaling the inputs does not change an OLS fit with an intercept, so the
# scores are expected to match the unscaled baseline.
linreg_norm = LinearRegression().fit(X_train_imputed_scaled, y_train)
train_pred_norm = linreg_norm.predict(X_train_imputed_scaled)

print('Training r^2:', linreg_norm.score(X_train_imputed_scaled, y_train))
print('Training MSE:', mean_squared_error(y_train, train_pred_norm))

Training r^2: 0.017510362913710775
Training MSE: 1.8097080914181378


In [13]:
# Categorical columns of the training split, with NaNs replaced by the
# explicit label 'missing' so absence is encoded as its own category.
X_train_cat = X_train.loc[:, cat_features].fillna(value='missing')

In [14]:
# NOTE(review): this whole cell is commented out and does nothing when run.
# As written it would fill X_train_cat / X_test_cat using cont_features, not
# cat_features -- fix that before re-enabling, or delete the cell.
# # Create X and y then split in train and test
# features = [col for col in df.columns if col != 'review_score']
# X = df_cat.loc[:, features]
# y = df.loc[:, 'review_score']

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

# #Create X test and X train from continuous variables
# X_train_cat = X_train.loc[:, cont_features]
# X_test_cat = X_test.loc[:, cont_features]

In [15]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical training columns (sparse result).
ohe = OneHotEncoder()
X_train_ohe = ohe.fit_transform(X_train_cat)

# Column labels for the encoded matrix. get_feature_names was superseded by
# get_feature_names_out in scikit-learn 1.0 and removed in 1.2, so use the
# new method when the installed version has it and fall back otherwise.
if hasattr(ohe, 'get_feature_names_out'):
    columns = ohe.get_feature_names_out(X_train_cat.columns)
else:
    columns = ohe.get_feature_names(input_features=X_train_cat.columns)

# toarray() gives a plain ndarray; todense() returns the legacy np.matrix type.
# NOTE(review): this densifies the encoding -- high-cardinality columns
# (e.g. id-like strings) would make the frame very large.
cat_train_df = pd.DataFrame(X_train_ohe.toarray(), columns=columns)

In [16]:
# Place the scaled continuous features next to the one-hot columns.
# Both frames are built from raw arrays and so carry a fresh RangeIndex,
# which makes the column-wise concat pair rows up by position.
# The continuous block has no labels of its own; prefix its positional names
# ('cont_0', 'cont_1', ...) so every column label is a string -- scikit-learn
# >= 1.2 refuses to fit on a frame that mixes int and str column names.
X_train_all = pd.concat(
    [pd.DataFrame(X_train_imputed_scaled).add_prefix('cont_'), cat_train_df],
    axis=1,
)

In [17]:
# Ordinary least squares on the combined continuous + one-hot feature frame.
# NOTE(review): the recorded training r^2 for this fit equals the
# continuous-only baseline to every printed digit, which suggests the one-hot
# columns add no information -- verify that X_train really carries the
# categorical values before trusting this comparison.
linreg_all = LinearRegression().fit(X_train_all, y_train)
train_pred_all = linreg_all.predict(X_train_all)

print('Training r^2:', linreg_all.score(X_train_all, y_train))
print('Training MSE:', mean_squared_error(y_train, train_pred_all))

Training r^2: 0.017510362913710775
Training MSE: 1.8097080914181378


Next Steps:
- Data Cleaning / Proper Join
- Ridge / Lasso / Elastic
- Lambda / Alpha
- Feature re-selection if low R2 after cleaning
- Cross-validation
- Test Data!

In [18]:
alphas = [0.05,0.5,0.1,0.01]

from sklearn.linear_model import Ridge

def ridge(list):
    """Fit one Ridge model per penalty value and print its training fit.

    For every value in ``list`` (used as the Ridge ``alpha``), a model is
    fitted on the notebook-level ``X_train_all`` / ``y_train`` and two lines
    are printed: training r^2, then training MSE. Output appears in the same
    order as the values in ``list``. Nothing is returned.
    """
    for penalty in list:
        # NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in
        # 1.2; it is kept so the numbers match the recorded output.
        model = Ridge(alpha=penalty, normalize=True).fit(X_train_all, y_train)
        fitted = model.predict(X_train_all)
        print('Training r^2:',model.score(X_train_all, y_train))
        print('Training MSE:',mean_squared_error(y_train,fitted))

ridge(alphas)

Training r^2: 0.0171554290678132
Training MSE: 1.8103618658994076
Training r^2: 0.011318376429541921
Training MSE: 1.8211134921667749
Training r^2: 0.016449626696593778
Training MSE: 1.8116619266979377
Training r^2: 0.017491777213240467
Training MSE: 1.8097423255629634


In [19]:
# Preview the first rows of df to sanity-check the loaded columns.
df.head()

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_delivered_customer_date,order_estimated_delivery_date,payment_type,payment_value,review_id,review_score,...,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
0,0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-10 21:25:13,2017-10-18 00:00:00,credit_card,18.12,a54f0611adc9ed256b57ede6b6eb5114,4.0,...,29.99,8.72,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares
1,1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-10 21:25:13,2017-10-18 00:00:00,voucher,18.59,a54f0611adc9ed256b57ede6b6eb5114,4.0,...,29.99,8.72,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares
2,2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-10 21:25:13,2017-10-18 00:00:00,voucher,2.0,a54f0611adc9ed256b57ede6b6eb5114,4.0,...,29.99,8.72,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares
3,3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,2018-07-24 20:41:37,2018-08-07 15:27:45,2018-08-13 00:00:00,boleto,141.46,8d5266042046a06655c8db133d120ba5,4.0,...,118.7,22.76,29.0,178.0,1.0,400.0,19.0,13.0,19.0,perfumery
4,4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,2018-08-08 08:38:49,2018-08-17 18:06:29,2018-09-04 00:00:00,credit_card,179.12,e73b67b67587f7644d5bd1a52deb1b01,5.0,...,159.9,19.22,46.0,232.0,1.0,420.0,24.0,19.0,21.0,auto


In [20]:
# Pairwise correlations between df's numeric columns.
# Restrict to numeric/bool columns explicitly: on pandas >= 2.0 a bare
# df.corr() raises when the frame also holds string columns, whereas older
# versions dropped them silently. The result is the same table either way.
df.select_dtypes(include=[np.number, 'bool']).corr()

Unnamed: 0.1,Unnamed: 0,payment_value,review_score,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
Unnamed: 0,1.0,-0.00221,-0.007489,-0.005671,0.000419,0.005376,-0.000921,-0.002211,-0.001028,-0.005077,0.002089,-0.003421
payment_value,-0.00221,1.0,-0.083847,0.736308,0.37242,0.004442,0.157024,0.010345,0.305908,0.138351,0.216861,0.148583
review_score,-0.007489,-0.083847,1.0,0.001919,-0.034327,-0.01291,0.01381,0.021511,-0.027431,-0.020876,-0.023722,-0.012353
price,-0.005671,0.736308,0.001919,1.0,0.415072,0.019803,0.200807,0.053276,0.340069,0.142927,0.223053,0.172982
freight_value,0.000419,0.37242,-0.034327,0.415072,1.0,0.025206,0.095025,0.02294,0.61225,0.303786,0.39157,0.322453
product_name_lenght,0.005376,0.004442,-0.01291,0.019803,0.025206,1.0,0.090435,0.145093,0.022394,0.062401,-0.030337,0.063369
product_description_lenght,-0.000921,0.157024,0.01381,0.200807,0.095025,0.090435,1.0,0.119175,0.059621,0.003377,0.073767,-0.069414
product_photos_qty,-0.002211,0.010345,0.021511,0.053276,0.02294,0.145093,0.119175,1.0,0.022569,0.046313,-0.033781,0.007771
product_weight_g,-0.001028,0.305908,-0.027431,0.340069,0.61225,0.022394,0.059621,0.022569,1.0,0.459272,0.582536,0.506058
product_length_cm,-0.005077,0.138351,-0.020876,0.142927,0.303786,0.062401,0.003377,0.046313,0.459272,1.0,0.188414,0.534786


In [21]:
from sklearn.linear_model import Lasso

# L1-penalised regression on the combined feature frame.
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2; it
# is kept so the numbers match the recorded output.
# NOTE(review): the recorded run reports a training r^2 of 0.0 for this
# alpha -- the penalty looks too strong for the model to learn anything.
lasso = Lasso(alpha=0.9, normalize=True).fit(X_train_all, y_train)
y_predict_lasso = lasso.predict(X_train_all)

print('Training MSE:',mean_squared_error(y_train,y_predict_lasso))
print('Training r^2:',lasso.score(X_train_all, y_train))

Training MSE: 1.8419615058588108
Training r^2: 0.0


In [22]:
from sklearn.linear_model import ElasticNet

# Elastic net (equal L1/L2 mix) on the combined feature frame.
# The explicit normalize=False was dropped: False was the default, so the fit
# is unchanged, and the argument no longer exists in scikit-learn >= 1.2
# (passing it there raises a TypeError).
elastic = ElasticNet(alpha=0.05, l1_ratio=0.5)
elastic.fit(X_train_all,y_train)
y_predict_elastic = elastic.predict(X_train_all)

# Report training error and fit.
print('Training MSE:', mean_squared_error(y_train, y_predict_elastic))
print('Training r^2:', elastic.score(X_train_all,y_train))

Training MSE: 1.8180722870165749
Training r^2: 0.01296944521709631
