In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import r2_score

%matplotlib inline

In [3]:
# Load the training data (presumably the dummy-encoded Ames set, judging by the
# file and variable names). The path is relative to the notebook's working
# directory; the recorded run below failed with FileNotFoundError, so the CSV
# must exist at this location before any later cell can run.
ames_df = pd.read_csv('../data/dummied_train.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/dummied_train.csv'

In [None]:
# Preview the first rows to sanity-check the load.
ames_df.head()

In [None]:
# Summarize the frame's columns (dtypes and non-null counts) before modelling.
ames_df.info()

In [None]:
# Inspect the column that is later used as the regression target.
ames_df['saleprice']

### Model Prep: Create feature matrix (`X`) and target vector (`y`)

In [None]:
# Flag every missing cell, reduce to one boolean per row, then show only the
# rows that contain at least one missing value.
is_NaN = ames_df.isna()
row_has_NaN = is_NaN.any(axis="columns")
rows_with_NaN = ames_df.loc[row_has_NaN]
rows_with_NaN

In [None]:
# Feature matrix: every column of the data set except the target.
features = ames_df.drop(columns='saleprice')
X = features
# Show a short rich-display preview instead of print()-ing the whole frame as text.
X.head()

In [None]:
# Target vector: the sale price column.
y = ames_df['saleprice']

### Model Prep: Train/test split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Model Prep: Scaling

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# split only and the same fitted transform is then applied to the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

### Model Prep: Instantiate model

In [None]:
# Unregularized baseline model.
lr = LinearRegression()

In [None]:
# Lasso with built-in cross-validation over 200 candidate alphas.
lasso = LassoCV(n_alphas=200)

In [None]:
# Ridge with built-in cross-validation over 100 evenly spaced alphas in [0.1, 10].
ridge_alphas = np.linspace(0.1, 10, num=100)
ridge = RidgeCV(alphas=ridge_alphas)

### Cross validation

In [None]:
# 5-fold cross-validation of the linear baseline on the training split,
# summarized as the mean fold score.
lr_scores = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5)
np.mean(lr_scores)

In [None]:
# 5-fold cross-validation of the lasso model on the training split,
# summarized as the mean fold score.
lasso_scores = cross_val_score(estimator=lasso, X=X_train, y=y_train, cv=5)
np.mean(lasso_scores)

In [None]:
# 5-fold cross-validation of the ridge model on the training split,
# summarized as the mean fold score.
ridge_scores = cross_val_score(estimator=ridge, X=X_train, y=y_train, cv=5)
np.mean(ridge_scores)

### Model Fitting and Evaluation

In [None]:
# Fit the ridge model on the full (scaled) training split.
ridge.fit(X=X_train, y=y_train)

In [None]:
# Score of the fitted model on the data it was trained on.
ridge.score(X=X_train, y=y_train)

In [None]:
# Score on the held-out test split; compare with the training score to gauge overfitting.
ridge.score(X=X_test, y=y_test)

In [None]:
# Mean cross-validation score again, shown next to the train/test scores for comparison.
ridge_scores.mean()

In [None]:
# Predicted sale prices for the (scaled) test split.
y_pred = ridge.predict(X=X_test)

In [None]:
# R^2 computed directly from the test targets and the predictions.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Bar chart of the fitted ridge coefficients, one bar per feature.
# `features` is the feature DataFrame, so the Series index must be its column
# labels (one label per coefficient), not the frame itself.
pd.Series(ridge.coef_, index=features.columns).plot.bar(figsize=(15, 7));

In [None]:
# Residual = actual minus predicted, so positive values are under-predictions.
residuals = y_test - y_pred

In [None]:
# Residuals against predictions; labelled so the figure can be read on its own.
plt.scatter(y_pred, residuals)
plt.xlabel('Predicted sale price')
plt.ylabel('Residual (actual - predicted)')
plt.title('Ridge residuals on the test split');

In [None]:
# Mean squared error of the test-split predictions.
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean of the training targets, for comparison with the test and prediction means.
y_train.mean()

In [None]:
# Mean of the actual test targets.
y_test.mean()

In [None]:
# Mean of the predicted test values.
y_pred.mean()

In [None]:
# Shape of the prediction array.
y_pred.shape

In [None]:
# Predictions above 500,000, to inspect the high end of the predicted range.
y_pred[y_pred > 500000]

In [None]:
# Shape of the test targets.
y_test.shape

In [None]:
# Mean squared error by hand: total squared residual divided by the number of test rows.
squared_errors = (y_test - y_pred) ** 2
mse = squared_errors.sum() / len(y_test)

In [None]:
# Display the hand-computed mean squared error.
mse

In [None]:
# Histogram of the actual 'saleprice' values in y_test. Labelled, and ended with
# ';' like the other plotting cells so only the figure is shown.
plt.hist(y_test)
plt.xlabel('Sale price')
plt.ylabel('Count')
plt.title('Actual sale prices (test split)');

In [None]:
# Histogram of the predicted 'saleprice' values in y_pred. Labelled, and ended
# with ';' like the other plotting cells so only the figure is shown.
plt.hist(y_pred)
plt.xlabel('Sale price')
plt.ylabel('Count')
plt.title('Predicted sale prices (test split)');

In [None]:
# Rebuild X and y from the full data set: every column except 'saleprice' is a
# feature. The column list comes from the public `columns` attribute (not the
# private `_get_numeric_data()` helper), so it matches, in order, the columns
# the feature matrix was built from when the scaler and model were fitted.
features = [col for col in ames_df.columns if col != 'saleprice']
X = ames_df[features]
y = ames_df['saleprice']

In [None]:
# Predict 'saleprice' for every row of X. The ridge model was fitted on
# standardized features, so X must pass through the same fitted scaler first;
# feeding raw values would give predictions on the wrong scale.
y_train_pred = ridge.predict(ss.transform(X))

In [None]:
# RMSE between the actual sale prices (y) and the predictions for the same rows.
# The previous call computed mean absolute error, which did not match the stated
# intent of reporting RMSE.
np.sqrt(metrics.mean_squared_error(y, y_train_pred))