In [34]:
# System
import os
import datetime

# Data Analysis
import re
import matplotlib as mpl
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Machine Learning
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [71]:
class Analysis(object):
    """
    """
    def __init__(self):
        cwd = os.getcwd()
        kaggle = os.path.join(cwd,'kaggle')
        
        with open(os.path.join(kaggle,'data_description.txt'),'r') as f:
            self.data_description = f.read()
        self.df_input_train = pd.read_csv(os.path.join(kaggle,'train.csv'),header=0,index_col=0)
        self.df_input_test = pd.read_csv(os.path.join(kaggle,'test.csv'),header=0,index_col=0).assign(SalePrice=0)
        self.df_output_sample = pd.read_csv(os.path.join(kaggle,'sample_submission.csv'),header=0,index_col=0)
        
        self.preprocessor = None
        self.predictor = None
        
        self.input_data = {'train':self.df_input_train,'test':self.df_input_test}
        self.prediction_data = {'train': None,'test': None}
        
        self.df_output = None
        self.output_file = os.path.join(cwd,'housing_analysis-tsj7ww-TrevorJordan.csv')
        
        self.topics = None
        self.input_features = None
        self.features = None
        with open(os.path.join(cwd,'features.md'),'r') as f:
            self.features_md = f.read()
            f.close()
    
        self._get_features()
    
    def _get_features(self):
        topics = {'House':[],'Lot':[],'Location':[],'Sale':[],'Target':[]}
        features = {'all':[],'cat':[],'int':[],'target':[]}
        
        for line in self.features_md.split('\n'):
            if line[:3] == '###':
                s = line.replace('### ','')
                topic = re.sub(r'\W+', '', s)
            if line[:2] == '- ':
                s = line.replace('- ','')
                dtype = s.split('(')[1].split(')')[0]
                feature = re.sub(r'\W+', '', s.split('(')[0].strip())

                topics[topic].append((topic,dtype,feature))
                features['all'].append(feature)
                features[dtype].append(feature)
        
        self.input_features = features
        self.topics = topics
    
    def preprocess(self):
        preprocessor = Preprocessor(self.input_data,self.input_features,self.topics)
        self.preprocessor = preprocessor
        preprocessor.combine()
        preprocessor.clean()
        preprocessor.dummy()
        preprocessor.select()
        preprocessor.normalize()
        preprocessor.wrangle()
        self.prediction_data = preprocessor.prediction_data
        self.preprocessor = preprocessor
        self.features = preprocessor.features
    
    def predict(self):
        predictor = Predictor(self.prediction_data,self.features)
        self.predictor = predictor
        predictor.gbr()
        predictor.knn()
        predictor.ridge()
        predictor.assess(predictor.model_gbr)
        predictor.assess(predictor.model_knn)
        predictor.assess(predictor.model_ridge)
        # predictor.predict([predictor.model_gbr,predictor.model_knn,predictor.model_ridge])
        predictor.df_output = predictor.df_output
        self.predictor = predictor  
    
    def wrap_up(self):
        self.df_output.to_csv(output_file,header=True)

In [80]:
class Preprocessor(object):
    """
    """
    def __init__(self,data,features,topics):
        self.input_data = data
        self.features = features
        self.topics = topics
        
        self.data = None
        self.prediction_data = {
            'X': None,
            'y': None,
        }
    
    def combine(self):
        dtypes = {}
        for topic,columns in self.topics.items():
            for col in columns:
                if col[2]=='SalePrice':
                    dtypes[col[2]] = int
                elif col[1]=='int':
                    dtypes[col[2]] = np.float32
                elif col[1]=='cat':
                    dtypes[col[2]] = object
                else:
                    object
        df = pd.concat([
            self.input_data['train'].assign(Segment='train'),
            self.input_data['test'].assign(Segment='test')
        ]).reset_index(drop=True).fillna(0).astype(dtype=dtypes)
        
        data = {}
        data['y'] = df[['SalePrice','Segment']]
        df = df.drop(['SalePrice'],axis=1)
        data['X'] = df
        self.data = data
    
    def clean(self):
        None
        
    def dummy(self):
        df = self.data['X']

        dummies = pd.get_dummies(df[self.features['cat']])
        df = (dummies
              .join(df[
                  ['Segment']+self.features['int']
              ],how='inner')
         )
        self.data['dummied'] = df
        self.features['dummies'] = list(dummies.columns)
    
    def select(self):
        kbest = SelectKBest(score_func=f_regression, k=20)
        X = self.data['dummied'].loc[self.data['dummied'].Segment=='train'].drop(['Segment'],axis=1)
        y = self.data['y'].loc[self.data['y'].Segment=='train'].SalePrice
        segment = self.data['dummied'].Segment
        fit = kbest.fit(X,y)
        selected = kbest.get_feature_names_out(kbest.feature_names_in_)
        
        self.data['selected'] = pd.DataFrame(
            fit.transform(self.data['dummied'].drop(['Segment'],axis=1)),
            columns=selected,
            index=self.data['dummied'].index
        ).join(segment,how='inner')
        
        self.features['selected'] = selected
        for dtype,cols in self.features.items():
            self.features[dtype] = [i for i in cols if i in selected]
    
    def normalize(self):
        sc = StandardScaler()
        normalized = pd.DataFrame(
            sc.fit_transform(self.data['selected'][self.features['int']]),
            columns=self.features['int'],index=self.data['selected'].index
        )
        self.data['normalized'] = normalized.join(self.data['selected'][['Segment']+self.features['dummies']],how='inner')
    
    def wrangle(self):
        test = {
            'X': self.data['normalized'].loc[self.data['normalized'].Segment=='test'].drop(['Segment'],axis=1),
            'y': self.data['y'].loc[self.data['y'].Segment=='test'].drop(['Segment'],axis=1)
        }
        _ = {'X':None,'y':None}
        train = {
            'fit': _,
            'score': _,
        }
        
        (train['fit']['X'],train['score']['X'],
        train['fit']['y'],train['score']['y']) = train_test_split(
            self.data['normalized'].loc[self.data['normalized'].Segment=='train'].drop(['Segment'],axis=1),
            self.data['y'].loc[self.data['y'].Segment=='train'].SalePrice,
            test_size=0.2, random_state=42
        )
        
        prediction_data = {'train':train,'test':test}
        self.prediction_data = prediction_data

In [81]:
class Predictor(object):
    """
    """
    def __init__(self,prediction_data,features):
        self.prediction_data = prediction_data
        self.features = features
        
        self.model_gbr = None
        self.model_knn = None
        self.model_ridge = None
        
        self.df_output = None
    
    def gbr(self):
        gbr_params = {
            'n_estimators': 1000,
            'max_depth': 3,
            'min_samples_split': 5,
            'learning_rate': 0.01,
            'loss': 'squared_error',
        }
        model_gbr = GradientBoostingRegressor(**gbr_params)
        model_gbr.fit(**self.prediction_data['train']['fit'])
        self.model_gbr = model_gbr
        
    def knn(self):
        best = {'n':0,'corr':0,'mse':0,'model':None}
        for n in range(20):
            n+=1
            model_knn = KNeighborsRegressor(n_neighbors=n)
            model_knn.fit(**self.prediction_data['train']['fit'])
            corr,mse = self.assess(model_knn,tune=True)
            if best['corr'] < corr:
                best = {'n':n,'corr':corr,'mse':mse,'model':model_knn}
        print(best['n'])
        self.model_knn = best['model']
    
    def ridge(self):
        best = {'a':0,'corr':0,'mse':0,'model':None}
        a = 0.3
        for i in range(20):
            a+=0.1
            model_ridge = Ridge(alpha=a)
            model_ridge.fit(**self.prediction_data['train']['fit'])
            corr,mse = self.assess(model_ridge,tune=True)
            if best['corr'] < corr:
                best = {'a':a,'corr':corr,'mse':mse,'model':model_ridge}
        print(best['a'])
        self.model_ridge = model_ridge
        
    def assess(self,model,tune=False):
        corr = model.score(**self.prediction_data['train']['score'])
        mse = mean_squared_error(self.prediction_data['train']['score']['y'], model.predict(self.prediction_data['train']['score']['X']))
        if tune:
            return (corr,mse)
        else:
            print('Model Accuracy:',round(corr,4))
            print("Model MSE on test set:",round(mse,4))
        
    def predict(self,models):
        preds = []
        for model in models:
            preds.append(model.predict(self.prediction_data['test']['X']))
        pred = (sum(preds)/len(preds)).round(2)
        print(pred)
        self.prediction_data['test']['y'] = pred
        self.df_output = pd.DataFrame(pred,columns=['SalePrice'])

In [82]:
analysis = Analysis()

analysis.preprocess()
analysis.predict()
# analysis.wrap_up()

  uniques = Index(uniques)
  correlation_coefficient /= X_norms


1
0.4
Model Accuracy: 0.9627
Model MSE on test set: 286105214.4248
Model Accuracy: 0.9999
Model MSE on test set: 514417.8082
Model Accuracy: 0.8333
Model MSE on test set: 1279016364.7886


In [None]:
class Analyzer(object):
    def __init__(self,df):
        self.df = df
    
    def eda(self):
        print(self.df.columns)
       #print(self.df.head())
        for cname,value in self.df.iloc[0,:].to_dict().items():
            column = self.df.loc[:,cname]
            print(cname,': ',value,type(value),len(column.unique()))
            if len(column.unique()) < 20:
                print(dict(column.value_counts()))
        
    def correlate(self):
        None
        
    def plot(self):
        None

analyzer = Analyzer(analysis.df_input_train)