In [1]:
import pandas as pd
import numpy as np
import warnings
import types

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt 

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

df = pd.read_csv('General_Files/train.csv')
df = df.drop(['Id'],axis=1)


In [2]:
class Execution_Pipeline:
    method_list = []
    
    def add(self,method:types.FunctionType):
        already_exists = False
        for f in self.method_list:
            already_exists = f.__name__ == method.__name__
        
        if not already_exists:
            self.method_list.append(method)
    
    def exec_pipe(self,parameter:pd.DataFrame):
        try:
            for f in self.method_list:
                f(parameter)
                
        except Exception as e:
            return e
        
pipe = Execution_Pipeline()

### Extracting the Indexes of Columns to Action Groups of we will work

In [3]:
cols_std=['YearBuilt','YearRemodAdd','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','HalfBath','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea']
cols_norm=['LotArea','BsmtFinSF1','BsmtFullBath','BsmtHalfBath','FullBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','MiscVal','MoSold','YrSold','OverallQual', 'OverallCond']
cols_cat=['MSSubClass','MSZoning','LotShape','LotConfig','RoofStyle','MasVnrType','ExterQual','BsmtExposure','HeatingQC','KitchenQual','GarageType','Neighborhood','HouseStyle','Exterior1st','Exterior2nd','Foundation','BsmtQual','BsmtFinType1','FireplaceQu','GarageFinish']
cols_NA = ['BsmtExposure','GarageType','BsmtQual','BsmtFinType1','FireplaceQu','GarageFinish']

all_cols = cols_std + cols_norm + cols_cat
aux = df[all_cols].copy()

nums_idx, cat_idx, std_idx, norm_idx, na_idx = [],[],[],[],[]

for c in aux.columns:
    if df[c].dtype == 'O':
        cat_idx+=[aux.columns.get_loc(c)]
    else:
        nums_idx+=[aux.columns.get_loc(c)]
        if c in cols_std:
            std_idx+=[aux.columns.get_loc(c)]
        else:
            norm_idx+=[aux.columns.get_loc(c)]
    if c in cols_NA:
        na_idx+=[aux.columns.get_loc(c)]

x = df[all_cols].copy().values
y = df['SalePrice'].copy().values


### Separando conjunto de teste e treino


In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

### Transforming MSSubClass to categorical data

In [5]:
before = x_train[:, cat_idx[0]].dtype

def procedure_01(dataframe):
    dataframe[ :, cat_idx[0] ] = dataframe[:, cat_idx[0]].astype(str)
   
pipe.add(procedure_01)

procedure_01(x_train)
print('Type before convertion is {}. Now MSSubClass data type is {}'.format(before, x_train[:, cat_idx[0]].dtype))

Type before convertion is object. Now MSSubClass data type is object


### Transforming nan values to 'NA' data

In [6]:
      
def procedure_02(dataframe):
    dataframe[:,na_idx] = pd.DataFrame(dataframe[:,na_idx]).fillna('NA').values

pipe.add(procedure_02)
procedure_02(x_train)

print("\nThese is columns with 'NA' values after convertion:")
for col in na_idx:
    print('\tidx{}: {}'.format(col, np.unique(x_train[:,col])))



These is columns with 'NA' values after convertion:
	idx39: ['Av' 'Gd' 'Mn' 'NA' 'No']
	idx42: ['2Types' 'Attchd' 'Basment' 'BuiltIn' 'CarPort' 'Detchd' 'NA']
	idx48: ['Ex' 'Fa' 'Gd' 'NA' 'TA']
	idx49: ['ALQ' 'BLQ' 'GLQ' 'LwQ' 'NA' 'Rec' 'Unf']
	idx50: ['Ex' 'Fa' 'Gd' 'NA' 'Po' 'TA']
	idx51: ['Fin' 'NA' 'RFn' 'Unf']


### Applying procedures of standardization, normalization, missing values treatment and one hot encoding

In [7]:
class Encoding_Steps:
    encoders={}
    imputers={}
    
    def __init__(self):
        self.encoders['std'] = StandardScaler()
        self.encoders['norm'] = MinMaxScaler()
        self.encoders['hot'] = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'),cat_idx)], remainder='passthrough')
        self.imputers['numerical'] = SimpleImputer(missing_values=np.nan,strategy='median')
        self.imputers['categorical'] = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
        
    
    def procedure_03_train(self,dataframe):
        dataframe[:,nums_idx] = self.imputers['numerical'].fit_transform(dataframe[:,nums_idx])
        dataframe[:,cat_idx] = self.imputers['categorical'].fit_transform(dataframe[:,cat_idx])
    
    def procedure_03_test(self,dataframe):
        dataframe[:,nums_idx] = self.imputers['numerical'].transform(dataframe[:,nums_idx])
        dataframe[:,cat_idx] = self.imputers['categorical'].transform(dataframe[:,cat_idx])
        
    def procedure_04_train(self,dataframe):
        dataframe[:,std_idx]= self.encoders['std'].fit_transform(dataframe[:,std_idx])
        dataframe[:,norm_idx] = self.encoders['norm'].fit_transform(dataframe[:,norm_idx])
    
    def procedure_04_test(self,dataframe):
        dataframe[:,std_idx]= self.encoders['std'].transform(dataframe[:,std_idx])
        dataframe[:,norm_idx] = self.encoders['norm'].transform(dataframe[:,norm_idx])
    
    def procedure_05_train(self,dataframe):
        transformed_data = np.array(self.encoders['hot'].fit_transform(dataframe))
        np.copyto(dataframe, transformed_data, casting='unsafe', where=False)
        
    def procedure_05_test(self,dataframe):
        transformed_data = np.array(self.encoders['hot'].transform(dataframe))
        np.copyto(dataframe, transformed_data, casting='unsafe', where=False)
        
encoding_steps = Encoding_Steps()


In [8]:
# step 3: missing values treatment -> median and most_frequent imputers
encoding_steps.procedure_03_train(x_train)
pipe.add(encoding_steps.procedure_03_test)
pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51
0,2007.0,2007.0,0.0,1774.0,1822.0,1828.0,0.0,0.0,1828.0,0.0,774.0,0.0,108.0,0.0,0.0,260.0,0.0,11694.0,48.0,0.0,0.0,2.0,3.0,1.0,9.0,1.0,3.0,0.0,7.0,2007.0,9.0,5.0,20.0,RL,Reg,Inside,Hip,BrkFace,Ex,Av,Ex,Gd,Attchd,NridgHt,1Story,CemntBd,CmentBd,PConc,Ex,GLQ,Gd,Unf
1,1962.0,1962.0,0.0,894.0,894.0,894.0,0.0,0.0,894.0,0.0,308.0,0.0,0.0,0.0,0.0,0.0,0.0,6600.0,0.0,0.0,0.0,1.0,2.0,1.0,5.0,0.0,1.0,0.0,8.0,2009.0,5.0,5.0,20.0,RL,Reg,Inside,Hip,,TA,No,Gd,TA,Detchd,NAmes,1Story,MetalSd,MetalSd,CBlock,TA,Unf,,Unf
2,1921.0,2006.0,0.0,163.0,876.0,964.0,0.0,0.0,964.0,0.0,432.0,0.0,0.0,44.0,0.0,0.0,0.0,13360.0,713.0,1.0,0.0,1.0,2.0,1.0,5.0,0.0,2.0,0.0,8.0,2009.0,5.0,7.0,30.0,RL,IR1,Inside,Gable,,TA,No,Ex,TA,Detchd,Crawfor,1Story,Wd Sdng,Wd Sdng,BrkTil,Gd,ALQ,,Unf
3,2002.0,2002.0,0.0,350.0,1568.0,1689.0,0.0,0.0,1689.0,0.0,857.0,150.0,59.0,0.0,0.0,0.0,0.0,13265.0,1218.0,1.0,0.0,2.0,3.0,1.0,7.0,2.0,3.0,0.0,7.0,2008.0,8.0,5.0,20.0,RL,IR1,CulDSac,Hip,BrkFace,Gd,No,Ex,Gd,Attchd,Mitchel,1Story,CemntBd,CmentBd,PConc,Gd,GLQ,Gd,RFn
4,2001.0,2002.0,0.0,1541.0,1541.0,1541.0,0.0,0.0,1541.0,0.0,843.0,468.0,81.0,0.0,0.0,0.0,0.0,13704.0,0.0,0.0,0.0,2.0,3.0,1.0,6.0,1.0,3.0,0.0,1.0,2006.0,7.0,5.0,20.0,RL,IR1,Corner,Gable,BrkFace,Gd,No,Ex,Gd,Attchd,CollgCr,1Story,VinylSd,VinylSd,PConc,Gd,Unf,TA,RFn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,1999.0,1999.0,0.0,89.0,1252.0,1268.0,1097.0,0.0,2365.0,1.0,856.0,0.0,128.0,0.0,0.0,180.0,0.0,9430.0,1163.0,1.0,0.0,2.0,3.0,1.0,8.0,1.0,3.0,0.0,7.0,2009.0,8.0,5.0,60.0,RL,Reg,Inside,Gable,BrkFace,Gd,Mn,Ex,Gd,Attchd,NoRidge,2Story,VinylSd,VinylSd,PConc,Gd,GLQ,Gd,RFn
1164,1950.0,1995.0,0.0,625.0,1067.0,1067.0,0.0,0.0,1067.0,0.0,436.0,290.0,0.0,0.0,0.0,0.0,0.0,9600.0,442.0,0.0,0.0,2.0,2.0,1.0,4.0,0.0,2.0,0.0,2.0,2010.0,4.0,7.0,20.0,RL,Reg,Inside,Gable,,TA,No,TA,Gd,Attchd,Sawyer,1Story,VinylSd,HdBoard,CBlock,Gd,BLQ,,Unf
1165,1978.0,1978.0,0.0,0.0,0.0,1318.0,584.0,0.0,1902.0,0.0,539.0,0.0,0.0,0.0,0.0,0.0,0.0,8930.0,0.0,0.0,0.0,2.0,4.0,2.0,8.0,0.0,2.0,0.0,4.0,2010.0,6.0,5.0,90.0,RM,Reg,Inside,Gable,,TA,,TA,TA,Attchd,Sawyer,1.5Fin,VinylSd,VinylSd,Slab,,,,Unf
1166,2003.0,2004.0,0.0,1374.0,1374.0,1557.0,0.0,0.0,1557.0,0.0,420.0,143.0,20.0,0.0,0.0,0.0,0.0,3196.0,0.0,0.0,0.0,2.0,2.0,1.0,7.0,1.0,2.0,0.0,10.0,2006.0,7.0,5.0,120.0,RL,Reg,Inside,Gable,BrkFace,Gd,Gd,Ex,Gd,Attchd,Blmngtn,1Story,VinylSd,VinylSd,PConc,Gd,Unf,TA,Fin


In [9]:
# step4: standardize and normalize process
encoding_steps.procedure_04_train(x_train)
pipe.add(encoding_steps.procedure_04_test)
pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51
0,1.188032,1.078914,-0.291823,2.757198,1.865729,1.78757,-0.802555,-0.125918,0.601886,-0.773624,1.434743,-0.742575,0.874116,-0.364703,-0.115333,4.546911,-0.058085,0.048583,0.021239,0.0,0.0,0.666667,0.375,0.333333,0.583333,0.333333,0.75,0.0,0.545455,0.25,0.888889,0.5,0.0,RL,Reg,Inside,Hip,BrkFace,Ex,Av,Ex,Gd,Attchd,NridgHt,1Story,CemntBd,CmentBd,PConc,Ex,GLQ,Gd,Unf
1,-0.292501,-1.097548,-0.291823,0.745229,-0.387262,-0.71541,-0.802555,-0.125918,-1.216718,-0.773624,-0.791166,-0.742575,-0.700461,-0.364703,-0.115333,-0.269109,-0.058085,0.024773,0.0,0.0,0.0,0.333333,0.25,0.333333,0.25,0.0,0.25,0.0,0.636364,0.75,0.444444,0.5,0.0,RL,Reg,Inside,Hip,,TA,No,Gd,TA,Detchd,NAmes,1Story,MetalSd,MetalSd,CBlock,TA,Unf,,Unf
2,-1.641431,1.030548,-0.291823,-0.926076,-0.430962,-0.52782,-0.802555,-0.125918,-1.08042,-0.773624,-0.198864,-0.742575,-0.700461,0.332315,-0.115333,-0.269109,-0.058085,0.05637,0.315487,0.333333,0.0,0.333333,0.25,0.333333,0.25,0.0,0.5,0.0,0.636364,0.75,0.444444,0.75,0.058824,RL,IR1,Inside,Gable,,TA,No,Ex,TA,Detchd,Crawfor,1Story,Wd Sdng,Wd Sdng,BrkTil,Gd,ALQ,,Unf
3,1.023528,0.837085,-0.291823,-0.498533,1.24907,1.415071,-0.802555,-0.125918,0.331238,-0.773624,1.831204,0.436137,0.159725,-0.364703,-0.115333,-0.269109,-0.058085,0.055926,0.538938,0.333333,0.0,0.666667,0.375,0.333333,0.416667,0.666667,0.75,0.0,0.545455,0.5,0.777778,0.5,0.0,RL,IR1,CulDSac,Hip,BrkFace,Gd,No,Ex,Gd,Attchd,Mitchel,1Story,CemntBd,CmentBd,PConc,Gd,GLQ,Gd,RFn
4,0.990627,0.837085,-0.291823,2.224484,1.183519,1.018453,-0.802555,-0.125918,0.043065,-0.773624,1.764331,2.935007,0.480472,-0.364703,-0.115333,-0.269109,-0.058085,0.057978,0.0,0.0,0.0,0.666667,0.375,0.333333,0.333333,0.333333,0.75,0.0,0.0,0.0,0.666667,0.5,0.0,RL,IR1,Corner,Gable,BrkFace,Gd,No,Ex,Gd,Attchd,CollgCr,1Story,VinylSd,VinylSd,PConc,Gd,Unf,TA,RFn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0.924826,0.691987,-0.291823,-1.095265,0.481888,0.286854,1.702295,-0.125918,1.647487,1.238832,1.826427,-0.742575,1.165705,-0.364703,-0.115333,3.065059,-0.058085,0.038,0.514602,0.333333,0.0,0.666667,0.375,0.333333,0.5,0.333333,0.75,0.0,0.545455,0.75,0.777778,0.5,0.235294,RL,Reg,Inside,Gable,BrkFace,Gd,Mn,Ex,Gd,Attchd,NoRidge,2Story,VinylSd,VinylSd,PConc,Gd,GLQ,Gd,RFn
1164,-0.68731,0.498524,-0.291823,0.130207,0.032746,-0.251796,-0.802555,-0.125918,-0.879867,-0.773624,-0.179757,1.536269,-0.700461,-0.364703,-0.115333,-0.269109,-0.058085,0.038795,0.195575,0.0,0.0,0.666667,0.25,0.333333,0.166667,0.0,0.5,0.0,0.090909,1.0,0.333333,0.75,0.0,RL,Reg,Inside,Gable,,TA,No,TA,Gd,Attchd,Sawyer,1Story,VinylSd,HdBoard,CBlock,Gd,BLQ,,Unf
1165,0.233911,-0.323695,-0.291823,-1.298748,-2.557708,0.420847,0.530929,-0.125918,0.745973,-0.773624,0.312235,-0.742575,-0.700461,-0.364703,-0.115333,-0.269109,-0.058085,0.035663,0.0,0.0,0.0,0.666667,0.5,0.666667,0.5,0.0,0.5,0.0,0.272727,1.0,0.555556,0.5,0.411765,RM,Reg,Inside,Gable,,TA,,TA,TA,Attchd,Sawyer,1.5Fin,VinylSd,VinylSd,Slab,,,,Unf
1166,1.056429,0.933817,-0.291823,1.842667,0.778078,1.061331,-0.802555,-0.125918,0.074219,-0.773624,-0.256184,0.38113,-0.408873,-0.364703,-0.115333,-0.269109,-0.058085,0.008862,0.0,0.0,0.0,0.666667,0.25,0.333333,0.416667,0.333333,0.5,0.0,0.818182,0.0,0.666667,0.5,0.588235,RL,Reg,Inside,Gable,BrkFace,Gd,Gd,Ex,Gd,Attchd,Blmngtn,1Story,VinylSd,VinylSd,PConc,Gd,Unf,TA,Fin


In [11]:
# step5: one hot encoding

#x_train = encoding_steps.procedure_05_train(x_train)
#pipe.add(encoding_steps.procedure_05_test)

# we must fix some error on procedure_05_train; dataframe return is not equal sparse matrix.

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'),cat_idx)], remainder='passthrough')
x_train = np.array(ct.fit_transform(x_train))
x_train


In [None]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)

plt.scatter(x_train, y_train, color = 'red')
plt.plot(x_train, y_pred, color = 'blue')

plt.title('Multiple Linear Regression')
plt.xlabel(' x ')
plt.ylabel('SalePrice')
plt.show()
