Predicting Rental Prices in New York

In [23]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import data_cleaning as dc
from sklearn import metrics
%matplotlib inline

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.model_selection import train_test_split

%matplotlib inline
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
df = pd.read_csv('data/products_new.csv')

In [48]:
cont_features = []

for colname, coltype in df.dtypes.iteritems():
    if coltype in [np.float64, np.int64]:
        cont_features.append(colname)

cat_features = []
for colname, coltype in df.dtypes.iteritems():
    if coltype in [np.object]:
        cat_features.append(colname)

In [49]:
# Create X and y then split in train and test
features = [col for col in df.columns if col != 'review_score']
X = df_con.loc[:, features]
y = df.loc[:, 'review_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

#Create X test and X train from continuous variables
X_train_cont = X_train.loc[:, cont_features]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


ValueError: Found input variables with inconsistent numbers of samples: [114062, 116584]

In [None]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer

# Impute missing values with median using Imputer from sklearn.preprocessing
impute = Imputer(strategy='median')
impute.fit(X_train_cont)

X_train_imputed = impute.transform(X_train_cont)

# Fit the model and print R2 and MSE for train and test
linreg = LinearRegression()
linreg.fit(X_train_imputed, y_train)

print('Training r^2:', linreg.score(X_train_imputed, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg.predict(X_train_imputed)))

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale the train and test data
ss = StandardScaler()
ss.fit(X_train_imputed)

X_train_imputed_scaled = ss.transform(X_train_imputed)

In [50]:
linreg_norm = LinearRegression()
linreg_norm.fit(X_train_imputed_scaled, y_train)

print('Training r^2:', linreg_norm.score(X_train_imputed_scaled, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg_norm.predict(X_train_imputed_scaled)))

Training r^2: 0.017510362913710775
Training MSE: 1.8097080914181378


In [41]:
# Create X_cat which contains only the categorical variables
X_train_cat = X_train.loc[:, cat_features]

#Fill nans with a value indicating that that it is missing
X_train_cat.fillna(value='missing', inplace=True)

In [42]:
# # Create X and y then split in train and test
# features = [col for col in df.columns if col != 'review_score']
# X = df_cat.loc[:, features]
# y = df.loc[:, 'review_score']

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

# #Create X test and X train from continuous variables
# X_train_cat = X_train.loc[:, cont_features]
# X_test_cat = X_test.loc[:, cont_features]

In [43]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncode Categorical variables
ohe = OneHotEncoder()
X_train_ohe = ohe.fit_transform(X_train_cat)

columns = ohe.get_feature_names(input_features=X_train_cat.columns)
cat_train_df = pd.DataFrame(X_train_ohe.todense(), columns=columns)



In [46]:
len(X_train_imputed_scaled)

85546

In [33]:
X_train_all = pd.concat([pd.DataFrame(X_train_imputed_scaled), cat_train_df], axis = 1)

In [34]:
linreg_all = LinearRegression()
linreg_all.fit(X_train_all, y_train)

print('Training r^2:', linreg_all.score(X_train_all, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg_all.predict(X_train_all)))

Training r^2: 0.017510362913710775
Training MSE: 1.8097080914181378


Next Steps:
- Data Cleaning / Proper Join
- Ridge / Lasso / Elastic
- Lamdba / Alpha
- Feature re-selection if low R2 after cleaning
- Cross-validation
- Test Data!

In [18]:
alphas = [0.05,0.5,0.1,0.01]

from sklearn.linear_model import Ridge

def ridge(list):
    for x in list:
        # Train model setting alpha (lambda) to 0.05
        ridge = Ridge(alpha=x, normalize=True)
        #Fit Ridge model to training data
        ridge.fit(X_train_all, y_train)
        y_predict_ridge = ridge.predict(X_train_all)
        # Calculate R^2 and mse
        print('Training r^2:',ridge.score(X_train_all, y_train))
        print('Training MSE:',mean_squared_error(y_train,y_predict_ridge))
        
ridge(alphas)

Training r^2: 0.0171554290678132
Training MSE: 1.8103618658994076
Training r^2: 0.011318376429541921
Training MSE: 1.8211134921667749
Training r^2: 0.016449626696593778
Training MSE: 1.8116619266979377
Training r^2: 0.017491777213240467
Training MSE: 1.8097423255629634


In [21]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.9, normalize=True)
lasso.fit(X_train_all, y_train)
y_predict_lasso = lasso.predict(X_train_all)
# calculating mse
print('Training MSE:',mean_squared_error(y_train,y_predict_lasso))
print('Training r^2:',lasso.score(X_train_all, y_train))

Training MSE: 1.8419615058588108
Training r^2: 0.0


In [22]:
from sklearn.linear_model import ElasticNet

elastic = ElasticNet(alpha=0.05, l1_ratio=0.5, normalize=False)
elastic.fit(X_train_all,y_train)
y_predict_elastic = elastic.predict(X_train_all)
#calculating mse
print('Training MSE:', mean_squared_error(y_train, y_predict_elastic))
print('Training r^2:', elastic.score(X_train_all,y_train))

Training MSE: 1.8180722870165749
Training r^2: 0.01296944521709631
