# Project 2 Kaggle Submission Process

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.dummy import DummyRegressor

## Imports and Data Cleaning

In [4]:
train = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

train_ids = train['Id']
test_ids = test['Id']

In [5]:
train.isnull().sum()[train.isnull().sum() == 0]

Id                 0
PID                0
MS SubClass        0
MS Zoning          0
Lot Area           0
Street             0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
Land Slope         0
Neighborhood       0
Condition 1        0
Condition 2        0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Exter Qual         0
Exter Cond         0
Foundation         0
Heating            0
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area        0
Full Bath          0
Half Bath          0
Bedroom AbvGr      0
Kitchen AbvGr      0
Kitchen Qual       0
TotRms AbvGrd      0
Functional         0
Fireplaces         0
Paved Drive        0
Wood Deck SF       0
Open Porch SF      0
Enclosed Porch     0
3Ssn Porch   

In [6]:
test.isnull().sum()[['Overall Qual','Year Built']]

Overall Qual    0
Year Built      0
dtype: int64

## Creating features

In [7]:
X = train[['Overall Qual', 'Year Built']]
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Before Modeling: Baseline model
aka dummy model, aka null model.

First, manually:

In [8]:
# manual baseline model
# predict only the mean of y_train
dummy_train_preds = [np.mean(y_train)] * len(y_train) #list multiplication to get proper length
#dummy_train_preds is list the length of y_train, all the same value
dummy_test_preds = [np.mean(y_train)] * len(y_test) #mean is always of data we've SEEN (TRAIN)

In [9]:
r2_score(y_train, dummy_train_preds), r2_score(y_test, dummy_test_preds)
#train r2 is 0 because there IS no variation in predictions (we set it to mean)
#model is explaining NONE of the variance, ergo r2=0

(0.0, -0.009664316355580294)

In [11]:
mean_squared_error(y_train, dummy_train_preds, squared=False),mean_squared_error(y_test, dummy_test_preds, squared=False) #squared = False = RMSE

(77098.90381716218, 85437.63661235073)

Now with DummyRegressor tool

In [21]:
dummy_model=DummyRegressor()

In [22]:
dummy_model.fit(X_train, y_train)

DummyRegressor()

In [23]:
#dummy_model.predict(y_test)   #same as what we did manually
#interested in .predict bc now we can get r2score
r2_score(y_train, dummy_model.predict(y_train)), r2_score(y_test, dummy_model.predict(y_test))

(0.0, -0.009664316355580294)

In [25]:
mean_squared_error(y_train, dummy_model.predict(y_train),squared=False), mean_squared_error(y_test, dummy_model.predict(y_test), squared=False)
#same output as manual method
#easy to compare to real model outputs

(77098.90381716218, 85437.63661235073)

## Modeling

In [12]:
lr = LinearRegression()

lr.fit(X_train,y_train)

print(lr.score(X_train,y_train), lr.score(X_test, y_test))

# test_preds = lr.predict(X_test) DON'T DO THIS

test_preds = lr.predict(test[['Overall Qual','Year Built']])

0.6415417471759908 0.6760323480189618


In [15]:
#rmse for lr.  #way better than dummy.
mean_squared_error(y_train, lr.predict(X_train), squared=False), mean_squared_error(y_test, lr.predict(X_test), squared=False)

(46160.180159985066, 48396.2316298934)

In [7]:
kaggle_submission = {
    'Id': test_ids,
    'SalePrice': test_preds
}

kaggle_submission = pd.DataFrame(kaggle_submission)

In [8]:
kaggle_submission.head(20)

Unnamed: 0,Id,SalePrice
0,2658,152198.422648
1,2718,138665.603557
2,2414,230113.455011
3,1989,117336.526768
4,625,173132.516534
5,333,96694.015414
6,1327,91164.254765
7,858,138665.603557
8,95,229323.489204
9,1568,178662.277183


In [9]:
kaggle_submission.to_csv('./datasets/kaggle_submission_01.csv', index=False)

## Demo of why DummyRegressor is Convenient

In [28]:
lr = LinearRegression()
dummy_model = DummyRegressor()

models = [lr, dummy_model]
model_names = ['LinReg', 'Dummy']

results = {}

for model, model_name in zip(models, model_names):
    model = model.fit(X_train, y_train)
    results[model_name] = {model_name:model}
    
    results[model_name]['test_r2'] = [r2_score(y_test, model.predict(X_test))]
    results[model_name]['test_rmse'] = [mean_squared_error(y_test, model.predict(X_test), squared=False)]

In [29]:
pd.DataFrame(results['LinReg'])

Unnamed: 0,LinReg,test_r2,test_rmse
0,LinearRegression(),0.676032,48396.23163


In [31]:
results = pd.concat([pd.DataFrame(results['LinReg']), pd.DataFrame(results['Dummy'])])

In [32]:
results

Unnamed: 0,LinReg,test_r2,test_rmse,Dummy
0,LinearRegression(),0.676032,48396.23163,
0,,-0.009664,85437.636612,DummyRegressor()


In [37]:
plt.bar(results['model_names'], results['test_rmse'])

KeyError: 'model_names'