Predicting Rental Prices in New York

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import data_cleaning as dc
from sklearn import metrics
%matplotlib inline

from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.model_selection import train_test_split

%matplotlib inline
%load_ext autoreload
%autoreload

In [2]:
df = pd.read_csv('data/products_new.csv')
df.dropna(inplace=True)
df.drop(columns='Unnamed: 0', inplace=True)

In [18]:
cont_features = []

for colname, coltype in df.dtypes.iteritems():
    if coltype in [np.float64, np.int64]:
        cont_features.append(colname)

In [19]:
# Create X and y then split in train and test
features = [col for col in df.columns if col != 'price']
X = df.loc[:, features]
y = df.loc[:, 'price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

#Create X test and X train from continuous variables
X_train_cont = X_train.loc[:, cont_features]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [20]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer

# Impute missing values with median using Imputer from sklearn.preprocessing
impute = Imputer(strategy='median')
impute.fit(X_train_cont)

X_train_imputed = impute.transform(X_train_cont)

# Fit the model and print R2 and MSE for train and test
linreg = LinearRegression()
linreg.fit(X_train_imputed, y_train)

print('Training r^2:', linreg.score(X_train_imputed, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg.predict(X_train_imputed)))



Training r^2: 0.5736398650621055
Training MSE: 14276.482958054907


In [21]:
from sklearn.preprocessing import StandardScaler

# Scale the train and test data
ss = StandardScaler()
ss.fit(X_train_imputed)

X_train_imputed_scaled = ss.transform(X_train_imputed)

In [22]:
linreg_norm = LinearRegression()
linreg_norm.fit(X_train_imputed_scaled, y_train)

print('Training r^2:', linreg_norm.score(X_train_imputed_scaled, y_train))
print('Training MSE:', mean_squared_error(y_train, linreg_norm.predict(X_train_imputed_scaled)))

Training r^2: 0.5736398650621057
Training MSE: 14276.482958054901


In [23]:
alphas = [1,2,3]

from sklearn.linear_model import Ridge

def ridge(list):
    for x in list:
        # Train model setting alpha (lambda) to 0.05
        ridge = Ridge(alpha=x, normalize=True)
        #Fit Ridge model to training data
        ridge.fit(X_train_imputed_scaled, y_train)
        y_predict_ridge = ridge.predict(X_train_imputed_scaled)
        # Calculate R^2 and mse
        print('Training r^2:',ridge.score(X_train_imputed_scaled, y_train))
        print('Training MSE:',mean_squared_error(y_train,y_predict_ridge))
        
ridge(alphas)

Training r^2: 0.4627161397342039
Training MSE: 17990.715468367875
Training r^2: 0.37139593247614733
Training MSE: 21048.532735537083
Training r^2: 0.31018554079001426
Training MSE: 23098.13597503844


In [24]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.05, normalize=True)
lasso.fit(X_train_imputed_scaled, y_train)
y_predict_lasso = lasso.predict(X_train_imputed_scaled)
# calculating mse
print('Training MSE:',mean_squared_error(y_train,y_predict_lasso))
print('Training r^2:',lasso.score(X_train_imputed_scaled, y_train))

Training MSE: 14969.627770339375
Training r^2: 0.552939436492591


In [25]:
from sklearn.linear_model import ElasticNet

elastic = ElasticNet(alpha=0.05, l1_ratio=0.5, normalize=False)
elastic.fit(X_train_imputed_scaled,y_train)
y_predict_elastic = elastic.predict(X_train_imputed_scaled)
#calculating mse
print('Training MSE:', mean_squared_error(y_train, y_predict_elastic))
print('Training r^2:', elastic.score(X_train_imputed_scaled,y_train))

Training MSE: 14286.258427592744
Training r^2: 0.5733479254770215


In [26]:
len(lasso.coef_), len(X_train_cont.columns)

(10, 11)

ValueError: Grouper and axis must be same length

In [None]:
X.columns

In [None]:
df.head()