Predicting Rental Prices in New York

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import data_cleaning as dc
from sklearn import metrics
%matplotlib inline

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.model_selection import train_test_split

In [39]:
#importing dataframes and cleaning
df = pd.read_csv('data/products_full.csv')
df.drop(columns = ['Unnamed: 0', 'review_comment_message', ], inplace=True)


  interactivity=interactivity, compiler=compiler, result=result)


In [43]:
df.drop(columns= [df.columns[23]], inplace=True)

In [38]:
df['product_length_cm'].astype('int64')

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [44]:
#create list of continuous variables
cont_features = []

for colname, coltype in df.dtypes.iteritems():
    if coltype in [np.float64, np.int64]:
        cont_features.append(colname)
        
cont_features

['payment_installments',
 'payment_value',
 'review_score',
 'customer_zip_code_prefix',
 'price',
 'freight_value',
 'product_name_lenght',
 'product_description_lenght',
 'product_photos_qty',
 'product_weight_g',
 'product_length_cm',
 'product_height_cm',
 'product_width_cm',
 'seller_zip_code_prefix']

In [45]:
#create list of categorical variables
cat_features = []

for colname, coltype in df.dtypes.iteritems():
    if coltype in [np.object]:
        cat_features.append(colname)
        
cat_features

['order_id',
 'customer_id',
 'order_status',
 'order_purchase_timestamp',
 'order_approved_at',
 'order_delivered_carrier_date',
 'order_delivered_customer_date',
 'order_estimated_delivery_date',
 'payment_sequential',
 'payment_type',
 'review_id',
 'review_comment_title',
 'review_creation_date',
 'review_answer_timestamp',
 'customer_unique_id',
 'customer_city',
 'customer_state',
 'order_item_id',
 'product_id',
 'shipping_limit_date',
 'product_category_name',
 'product_category_name_english',
 'seller_city',
 'seller_state']

In [46]:
# Create X and y then split in train and test
features = [col for col in df.columns if col != 'review_score']
X = df.loc[:, features]
y = df.loc[:, 'review_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

#Create X test and X train from continuous variables
X_train_cont = X_train.loc[:, cont_features]
X_test_cont = X_test.loc[:, cont_features]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [47]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer

# Impute missing values with median using Imputer from sklearn.preprocessing
impute = Imputer(strategy='median')
impute.fit(X_train_cont)

X_train_imputed = impute.transform(X_train_cont)

# Fit the model and print R2 and MSE for train and test
linreg = LinearRegression()
linreg.fit(X_train_imputed, y_train)

print('Training r^2:', linreg.score(X_train_imputed, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg.predict(X_train_imputed)))



Training r^2: 0.016350603889926374
Training MSE: 1.9850140164952528


In [48]:
from sklearn.preprocessing import StandardScaler

# Scale the train and test data
ss = StandardScaler()
ss.fit(X_train_imputed)

X_train_imputed_scaled = ss.transform(X_train_imputed)

In [49]:
linreg_norm = LinearRegression()
linreg_norm.fit(X_train_imputed_scaled, y_train)

print('Training r^2:', linreg_norm.score(X_train_imputed_scaled, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg_norm.predict(X_train_imputed_scaled)))

Training r^2: 0.016350603889926374
Training MSE: 1.9850140164952528


In [50]:
# Create X_cat which contains only the categorical variables
X_train_cat = X_train.loc[:, cat_features]

#Fill nans with a value indicating that that it is missing
X_train_cat.fillna(value='missing', inplace=True)

In [51]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncode Categorical variables
ohe = OneHotEncoder()
X_train_ohe = ohe.fit_transform(X_train_cat)

# columns = ohe.get_feature_names(input_features=X_train_cat.columns)
# cat_train_df = pd.DataFrame(X_train_ohe.todense(), columns=columns)
# cat_test_df = pd.DataFrame(X_test_ohe.todense(), columns=columns)


TypeError: argument must be a string or number

In [None]:
X_train_all = pd.concat([pd.DataFrame(X_train_imputed_scaled), cat_train_df], axis = 1)
X_test_all = pd.concat([pd.DataFrame(X_test_imputed_scaled), cat_test_df], axis = 1)

In [None]:
linreg_all = LinearRegression()
linreg_all.fit(X_train_all, y_train)

print('Training r^2:', linreg_all.score(X_train_all, y_train))
print('Testing r^2:', linreg_all.score(X_test_all, y_test))
print('Training MSE:', mean_squared_error(y_train, linreg_all.predict(X_train_all)))
print('Testing MSE:', mean_squared_error(y_test, linreg_all.predict(X_test_all)))

Next Steps:
- Data Cleaning / Proper Join
- Ridge / Lasso / Elastic
- Lamdba / Alpha
- Feature re-selection if low R2 after cleaning
- Cross-validation
- Test Data!