# Methodology

### 1. House
- MSSubClass (cat)
- Utilities (cat)
- BldgType (cat)
- HouseStyle (cat)
- OverallQual
- OverallCond
- YearBuilt
- YearRemodAdd
- RoofStyle (cat)
- Exterior1st (cat)
- Exterior2nd (cat)
- ExterQual (cat)
- ExterCond (cat)
- Foundation (cat)

### 2. Location
- MSZoning (cat): Identifies the general zoning classification of the sale
- Neighborhood (cat)
- Condition1 (cat)
- Condition2 (cat)

### 3. Lot
- LotArea: Lot size in square feet
- Street (cat): Type of road access to property
- LandContour (cat)
- LotConfig (cat)
- LandSlope (cat)

In [201]:
import os
import datetime

import matplotlib as mpl
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)


from scipy.stats import pearsonr
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [218]:
class Analysis(object):
    """Top-level driver for the housing-price workflow.

    Construction reads the Kaggle input files from a ``kaggle`` directory
    located under the current working directory.  The workflow is then run
    in explicit steps: ``clean()`` -> ``predict()`` -> ``wrap_up()``.
    """
    def __init__(self):
        """Load the raw input files and initialise empty result slots.

        Errors raised by ``open`` / ``pd.read_csv`` (e.g. a missing file)
        are propagated unchanged.
        """
        cwd = os.getcwd()
        kaggle = os.path.join(cwd,'kaggle')

        with open(os.path.join(kaggle,'data_description.txt'),'r') as f:
            self.data_description = f.read()
        self.df_input_train = pd.read_csv(os.path.join(kaggle,'train.csv'),header=0,index_col=0)
        # The test file gets an all-NaN SalePrice column so that it has the
        # same columns as the training frame.
        self.df_input_test = pd.read_csv(os.path.join(kaggle,'test.csv'),header=0,index_col=0).assign(SalePrice=np.nan)
        self.df_output_sample = pd.read_csv(os.path.join(kaggle,'sample_submission.csv'),header=0,index_col=0)

        # Filled in by clean().
        self.df_train = None
        self.df_test = None
        self.prediction_data = {'train': None,'test': None}

        # Filled in by predict(); written to disk by wrap_up().
        self.df_output = None
        self.output_file = os.path.join(cwd,'housing_analysis-tsj7ww-TrevorJordan.csv')

    def clean(self):
        """Run a ``Cleaner`` over the train and test inputs.

        Stores each cleaner's ``df`` in ``df_train`` / ``df_test`` and its
        ``prediction_data`` under the ``'train'`` / ``'test'`` keys of
        ``self.prediction_data``.
        """
        cleaner = Cleaner(self.df_input_train)
        self.df_train = cleaner.df
        self.prediction_data['train'] = cleaner.prediction_data

        cleaner = Cleaner(self.df_input_test,test=True)
        self.df_test = cleaner.df
        self.prediction_data['test'] = cleaner.prediction_data

    def predict(self):
        """Build a ``Predictor`` from the prepared data and keep its ``df_output``."""
        predictor = Predictor(self.prediction_data)
        self.df_output = predictor.df_output

    def wrap_up(self):
        """Write ``df_output`` to ``output_file`` as CSV, including a header row.

        Raises:
            RuntimeError: if ``df_output`` has not been populated yet.
        """
        if self.df_output is None:
            raise RuntimeError('No predictions to write; run predict() first.')
        # Must go through self: there is no module-level ``output_file``.
        self.df_output.to_csv(self.output_file,header=True)
        

In [222]:
class Cleaner(object):
    """Turn one raw input frame into model-ready pieces.

    After construction:

    * ``df`` is the cleaned frame (currently the input frame, unchanged);
    * ``prediction_data['X']`` is that frame without ``SalePrice``;
    * ``prediction_data['y']`` is the ``SalePrice`` column;
    * for training data (``test=False``) ``prediction_data`` additionally
      carries empty ``'fit'`` and ``'score'`` dicts.

    No column filtering, encoding or imputation is performed here, so
    ``X`` keeps every other column of the input as-is.
    """
    def __init__(self,df,test=False):
        self.df_input = df
        self.test = test
        self.df = None
        self.prediction_data = dict.fromkeys(('X','y'))

        # Both steps run eagerly so the instance is ready to use right away.
        self.clean()
        self.wrangle()

    def clean(self):
        """Expose the input frame as ``self.df`` (pass-through for now)."""
        self.df = self.df_input

    def wrangle(self):
        """Separate the ``SalePrice`` target from the remaining columns."""
        frame = self.df
        if not self.test:
            # Empty containers for the fit / hold-out subsets of the
            # training data.
            for bucket in ('fit','score'):
                self.prediction_data[bucket] = {}
        target = frame.SalePrice
        features = frame.drop(columns=['SalePrice'])
        self.prediction_data['y'] = target
        self.prediction_data['X'] = features

In [223]:
class Predictor(object):
    """Fit a gradient-boosting regressor and predict sale prices.

    ``prediction_data`` is a nested dict.  ``prediction_data['train']`` must
    provide ``'X'`` and ``'y'`` plus ``'fit'`` and ``'score'`` dicts, which
    ``split()`` fills with the fitting and hold-out subsets.
    ``prediction_data['test']`` is optional; when it provides an ``'X'``,
    predictions are produced for it and exposed as ``df_output``.

    The whole pipeline runs on construction:
    split -> normalize -> train -> assess -> predict.
    """
    def __init__(self,prediction_data):
        self.prediction_data = prediction_data

        self.scaler = None
        self.model = None
        self.df_output = None

        self.split()
        self.normalize()
        self.train()
        self.assess()
        # Only predict when test features were actually supplied, so a
        # train-only ``prediction_data`` still constructs cleanly.
        test_set = self.prediction_data.get('test')
        if test_set is not None and test_set.get('X') is not None:
            self.predict()

    def split(self):
        """Split the training data 80/20 into fit and hold-out (score) sets."""
        (self.prediction_data['train']['fit']['X'],
        self.prediction_data['train']['score']['X'],
        self.prediction_data['train']['fit']['y'],
        self.prediction_data['train']['score']['y']) = train_test_split(
            self.prediction_data['train']['X'],
            self.prediction_data['train']['y'],
            test_size=0.2, random_state=42
        )

    def normalize(self):
        """Standardise the features using statistics of the fit set only."""
        fit_set = self.prediction_data['train']['fit']
        score_set = self.prediction_data['train']['score']

        sc = StandardScaler()
        fit_set['X'] = sc.fit_transform(fit_set['X'])
        # Re-use the fitted statistics: re-fitting on the hold-out set
        # would put it on a different scale than the model was trained on.
        score_set['X'] = sc.transform(score_set['X'])

        # Kept so predict() can apply the same scaling to the test features.
        self.scaler = sc

    def train(self):
        """Fit the gradient-boosting model on the fit set."""
        gbr_params = {
            'n_estimators': 1000,
            'max_depth': 3,
            'min_samples_split': 5,
            'learning_rate': 0.01,
            'loss': 'squared_error',
        }
        model = GradientBoostingRegressor(**gbr_params)
        # The 'fit' dict holds exactly the keyword arguments X and y.
        model.fit(**self.prediction_data['train']['fit'])

        self.model = model

    def assess(self):
        """Print the model's ``score`` and the MSE on the hold-out set."""
        score_set = self.prediction_data['train']['score']
        r2 = self.model.score(**score_set)
        print('Model Accuracy:',round(r2,4))
        mse = mean_squared_error(score_set['y'], self.model.predict(score_set['X']))
        print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

    def predict(self):
        """Predict ``SalePrice`` for the test features.

        The result is a one-column DataFrame that keeps the index of the
        test features (when they have one).  It is stored both in
        ``prediction_data['test']['y']`` and in ``df_output``.
        """
        test_set = self.prediction_data['test']
        X_test = test_set['X']
        index = getattr(X_test,'index',None)
        if self.scaler is not None:
            # The model was trained on scaled features; apply the same
            # scaling here.  The stored test features are left untouched.
            X_test = self.scaler.transform(X_test)
        pred = self.model.predict(X_test)
        df_pred = pd.DataFrame(pred,columns=['SalePrice'],index=index)
        test_set['y'] = df_pred
        self.df_output = df_pred

In [224]:
# Run the workflow: load the Kaggle inputs, prepare the train/test
# feature sets, then fit the model.
analysis = Analysis()

analysis.clean()
analysis.predict()
# Writing the output CSV is currently disabled.
# analysis.wrap_up()

ValueError: could not convert string to float: 'RL'

In [194]:
class Analyzer(object):
    """Small exploratory helper wrapped around a single DataFrame."""
    def __init__(self,df):
        self.df = df

    def eda(self):
        """Print a quick overview of every column.

        Prints the column index first.  Then, for each column, prints its
        name, the value in the first row, that value's type and the number
        of distinct values; columns with fewer than 20 distinct values also
        get their value counts printed.
        """
        frame = self.df
        print(frame.columns)
        first_row = frame.iloc[0,:].to_dict()
        for cname in first_row:
            value = first_row[cname]
            column = frame.loc[:,cname]
            n_unique = len(column.unique())
            print(cname,': ',value,type(value),n_unique)
            if n_unique < 20:
                print(dict(column.value_counts()))

    def correlate(self):
        """Placeholder: not implemented yet."""
        pass

    def plot(self):
        """Placeholder: not implemented yet."""
        pass

# Explore the raw training frame exactly as it was read from train.csv.
analyzer = Analyzer(analysis.df_input_train)

In [None]:
# correlate() is currently an empty stub, so this call does nothing.
analyzer.correlate()

In [None]:
# plot() is currently an empty stub, so this call does nothing.
analyzer.plot()

In [195]:
# Print the column index and a per-column summary (first value, its type,
# number of distinct values, and value counts for low-cardinality columns).
analyzer.eda()

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo