In [163]:
# System
import os
import datetime

# Data Analysis
import re
import matplotlib as mpl
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [219]:
class Analysis(object):
    """
    """
    def __init__(self):
        cwd = os.getcwd()
        kaggle = os.path.join(cwd,'kaggle')
        
        with open(os.path.join(kaggle,'data_description.txt'),'r') as f:
            self.data_description = f.read()
        self.df_input_train = pd.read_csv(os.path.join(kaggle,'train.csv'),header=0,index_col=0)
        self.df_input_test = pd.read_csv(os.path.join(kaggle,'test.csv'),header=0,index_col=0).assign(SalePrice=np.nan)
        self.df_output_sample = pd.read_csv(os.path.join(kaggle,'sample_submission.csv'),header=0,index_col=0)
        
        self.train_cleaner = None
        self.test_cleaner = None
        self.predictor = None
        
        self.df_train = None
        self.df_test = None
        self.prediction_data = {'train': None,'test': None}
        
        self.df_output = None
        self.output_file = os.path.join(cwd,'housing_analysis-tsj7ww-TrevorJordan.csv')
        
        with open(os.path.join(cwd,'features.md'),'r') as f:
            self.features_md = f.read()
        self.features = None
        self.topics = None
    
        self._get_features()
    
    def _get_features(self):
        topics = {'House':[],'Lot':[],'Location':[],'Sale':[],'Target':[]}
        features = {'all':[],'cat':[],'int':[],'target':[]}
        
        for line in self.features_md.split('\n'):
            if line[:3] == '###':
                s = line.replace('### ','')
                topic = re.sub(r'\W+', '', s)
            if line[:2] == '- ':
                s = line.replace('- ','')
                dtype = s.split('(')[1].split(')')[0]
                feature = re.sub(r'\W+', '', s.split('(')[0].strip())

                topics[topic].append((topic,dtype,feature))
                features['all'].append(feature)
                features[dtype].append(feature)
        
        self.features = features
        self.topics = topics
    
    def clean(self):
        cleaner = Cleaner(self.df_input_train,self.features)
        self.train_cleaner = cleaner
        cleaner.clean()
        cleaner.wrangle()
        self.df_train = cleaner.df
        self.prediction_data['train'] = cleaner.prediction_data
        self.train_cleaner = cleaner
        self.features = cleaner.features
                                         
        cleaner = Cleaner(self.df_input_test,self.features,test=True)
        self.test_cleaner = cleaner
        cleaner.clean()
        cleaner.wrangle()
        self.df_test = cleaner.df
        self.prediction_data['test'] = cleaner.prediction_data
        self.test_cleaner = cleaner
    
    def predict(self):
        predictor = Predictor(self.prediction_data,self.features)
        self.predictor = predictor  
        predictor.split()
        predictor.normalize()
        predictor.gbr()
        predictor.knn()
        predictor.lr()
        predictor.assess(predictor.model_gbr)
        # predictor.predict()
        predictor.df_output = predictor.df_output
        self.predictor = predictor  
    
    def wrap_up(self):
        self.df_output.to_csv(output_file,header=True)

In [220]:
class Cleaner(object):
    """
    """
    def __init__(self,df,features,test=False):
        self.df_input = df
        self.features = features
        self.test = test
        self.df = None
        self.prediction_data = {
            'X': None,
            'y': None,
        }
        
    def clean(self):
        df = self.df_input[self.features['all']]
        
        dummies = pd.get_dummies(df[self.features['cat']])
        df = dummies.join(df[self.features['int']+self.features['target']],how='inner')
        
        self.features['dummies'] = list(dummies.columns)
        self.df = df
    
    def wrangle(self):
        df = self.df
        if not self.test:
            self.prediction_data['fit'] = {}
            self.prediction_data['score'] = {}
        self.prediction_data['y'] = df.SalePrice
        df = df.drop(['SalePrice'],axis=1)
        self.prediction_data['X'] = df

In [221]:
class Predictor(object):
    """
    """
    def __init__(self,prediction_data,features):
        self.prediction_data = prediction_data
        self.features = features
        
        self.ensemble = None
        self.model_gbr = None
        self.model_knn = None
        self.model_lr = None
        self.df_output = None
    
    def split(self):
        (self.prediction_data['train']['fit']['X'],
        self.prediction_data['train']['score']['X'],
        self.prediction_data['train']['fit']['y'],
        self.prediction_data['train']['score']['y']) = train_test_split(
            self.prediction_data['train']['X'],
            self.prediction_data['train']['y'],
            test_size=0.2, random_state=42
        )
    
    def normalize(self):
        sc = StandardScaler()
        self.prediction_data['train']['fit']['X'] = (
            self.prediction_data['train']['fit']['X'][self.features['dummies']].join(
                pd.DataFrame(sc.fit_transform(
                    self.prediction_data['train']['fit']['X'][self.features['int']]
                ),columns=self.features['int'],index=self.prediction_data['train']['fit']['X'].index
                ),how='inner'
            )
        )
        self.prediction_data['train']['score']['X'] = (
            self.prediction_data['train']['score']['X'][self.features['dummies']].join(
                pd.DataFrame(sc.fit_transform(
                    self.prediction_data['train']['score']['X'][self.features['int']]
                ),columns=self.features['int'],index=self.prediction_data['train']['score']['X'].index
                ),how='inner'
            )
        )
        
    def gbr(self):
        gbr_params = {
            'n_estimators': 1000,
            'max_depth': 3,
            'min_samples_split': 5,
            'learning_rate': 0.01,
            'loss': 'squared_error',
        }
        model_gbr = GradientBoostingRegressor(**gbr_params)
        model_gbr.fit(**self.prediction_data['train']['fit'])
        self.model_gbr = model_gbr
        
    def knn(self):
        model_knn = None
        self.model_knn = model_knn
    
    def lr(self):
        model_lr = None
        self.model_lr = model_lr
        
    def assess(self,model):
        corr = model.score(**self.prediction_data['train']['score'])
        print('GBR Model Accuracy:',round(corr,4))
        mse = mean_squared_error(self.prediction_data['train']['score']['y'], model.predict(self.prediction_data['train']['score']['X']))
        print("The GBR model MSE on test set:",round(mse,4))
        
    def predict(self):
        pred = self.model.predict(self.prediction_data['test']['X'])
        self.prediction_data['test']['y'] = pd.DataFrame(pred,columns=['SalePrice'])

In [222]:
analysis = Analysis()

analysis.clean()
analysis.predict()
# analysis.wrap_up()

GBR Model Accuracy: 0.8913
The GBR model MSE on test set: 833808039.7465


In [None]:
class Analyzer(object):
    def __init__(self,df):
        self.df = df
    
    def eda(self):
        print(self.df.columns)
       #print(self.df.head())
        for cname,value in self.df.iloc[0,:].to_dict().items():
            column = self.df.loc[:,cname]
            print(cname,': ',value,type(value),len(column.unique()))
            if len(column.unique()) < 20:
                print(dict(column.value_counts()))
        
    def correlate(self):
        None
        
    def plot(self):
        None

analyzer = Analyzer(analysis.df_input_train)

In [None]:
analyzer.correlate()

In [None]:
analyzer.plot()

In [195]:
analyzer.eda()

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo