In [1]:
import pandas as pd
import numpy as np
import warnings
import types

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt 

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

df = pd.read_csv('General_Files/train.csv')
df = df.sort_values('SalePrice',ascending=True)
df = df.drop(['Id'],axis=1)


In [2]:
class Execution_Pipeline:
    method_list = []
    
    def add(self, method:types.FunctionType):
        already_exists = False
        for f in self.method_list:
            already_exists = f.__name__ == method.__name__
        
        if not already_exists:
            self.method_list.append(method)
    
    def exec_pipe(self,parameter:pd.DataFrame):
        try:
            for f in self.method_list:
                f(parameter)
                
        except Exception as e:
            return e
        
pipe = Execution_Pipeline()

### Extracting the Indexes of Columns to Action Groups of we will work

In [3]:
cols_std=['YearBuilt','YearRemodAdd','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','HalfBath','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea']
cols_norm=['LotArea','BsmtFinSF1','BsmtFullBath','BsmtHalfBath','FullBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','MiscVal','MoSold','YrSold','OverallQual', 'OverallCond']
cols_cat=['MSSubClass','MSZoning','LotShape','LotConfig','RoofStyle','MasVnrType','ExterQual','BsmtExposure','HeatingQC','KitchenQual','GarageType','Neighborhood','HouseStyle','Exterior1st','Exterior2nd','Foundation','BsmtQual','BsmtFinType1','FireplaceQu','GarageFinish']
cols_NA = ['BsmtExposure','GarageType','BsmtQual','BsmtFinType1','FireplaceQu','GarageFinish']

all_cols = cols_std + cols_norm + cols_cat
aux = df[all_cols].copy()

nums_idx, cat_idx, std_idx, norm_idx, na_idx = [],[],[],[],[]

for c in aux.columns:
    if df[c].dtype == 'O':
        cat_idx+=[aux.columns.get_loc(c)]
    else:
        nums_idx+=[aux.columns.get_loc(c)]
        if c in cols_std:
            std_idx+=[aux.columns.get_loc(c)]
        else:
            norm_idx+=[aux.columns.get_loc(c)]
    if c in cols_NA:
        na_idx+=[aux.columns.get_loc(c)]

x = df[all_cols].copy().values
y = df['SalePrice'].copy().values


### Separando conjunto de teste e treino


In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

### Transforming MSSubClass to categorical data

In [5]:
before = x_train[:, cat_idx[0]].dtype

def procedure_01(dataframe):
    dataframe[ :, cat_idx[0] ] = dataframe[:, cat_idx[0]].astype(str)
   
pipe.add(procedure_01)

procedure_01(x_train)
print('Type before convertion is {}. Now MSSubClass data type is {}'.format(before, x_train[:, cat_idx[0]].dtype))

Type before convertion is object. Now MSSubClass data type is object


### Transforming nan values to 'NA' data

In [6]:
      
def procedure_02(dataframe):
    dataframe[:,na_idx] = pd.DataFrame(dataframe[:,na_idx]).fillna('NA').values

pipe.add(procedure_02)
procedure_02(x_train)

print("\nThese is columns with 'NA' values after convertion:")
for col in na_idx:
    print('\tidx{}: {}'.format(col, np.unique(x_train[:,col])))



These is columns with 'NA' values after convertion:
	idx39: ['Av' 'Gd' 'Mn' 'NA' 'No']
	idx42: ['2Types' 'Attchd' 'Basment' 'BuiltIn' 'CarPort' 'Detchd' 'NA']
	idx48: ['Ex' 'Fa' 'Gd' 'NA' 'TA']
	idx49: ['ALQ' 'BLQ' 'GLQ' 'LwQ' 'NA' 'Rec' 'Unf']
	idx50: ['Ex' 'Fa' 'Gd' 'NA' 'Po' 'TA']
	idx51: ['Fin' 'NA' 'RFn' 'Unf']


### Applying procedures of standardization, normalization, missing values treatment and one hot encoding

In [7]:
class Encoding_Steps:
    encoders={}
    imputers={}
    
    def __init__(self):
        self.encoders['std'] = StandardScaler()
        self.encoders['norm'] = MinMaxScaler()
        self.encoders['hot'] = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'),cat_idx)], remainder='passthrough')
        self.imputers['numerical'] = SimpleImputer(missing_values=np.nan,strategy='median')
        self.imputers['categorical'] = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
        
    
    def procedure_03_train(self,dataframe):
        dataframe[:,nums_idx] = self.imputers['numerical'].fit_transform(dataframe[:,nums_idx])
        dataframe[:,cat_idx] = self.imputers['categorical'].fit_transform(dataframe[:,cat_idx])
    
    def procedure_03_test(self,dataframe):
        dataframe[:,nums_idx] = self.imputers['numerical'].transform(dataframe[:,nums_idx])
        dataframe[:,cat_idx] = self.imputers['categorical'].transform(dataframe[:,cat_idx])
        
    def procedure_04_train(self,dataframe):
        dataframe[:,std_idx]= self.encoders['std'].fit_transform(dataframe[:,std_idx])
        dataframe[:,norm_idx] = self.encoders['norm'].fit_transform(dataframe[:,norm_idx])
    
    def procedure_04_test(self,dataframe):
        dataframe[:,std_idx]= self.encoders['std'].transform(dataframe[:,std_idx])
        dataframe[:,norm_idx] = self.encoders['norm'].transform(dataframe[:,norm_idx])
    
    def procedure_05_train(self,dataframe):
        return self.encoders['hot'].fit_transform(dataframe).toarray()
        
    def procedure_05_test(self,dataframe):
        return self.encoders['hot'].transform(dataframe).toarray()
        
encoding_steps = Encoding_Steps()


In [8]:
# step 3: missing values treatment -> median and most_frequent imputers
encoding_steps.procedure_03_train(x_train)
pipe.add(encoding_steps.procedure_03_test)

In [9]:
# step4: standardize and normalize process
encoding_steps.procedure_04_train(x_train)
pipe.add(encoding_steps.procedure_04_test)

In [10]:
# step5: one hot encoding
x_train = encoding_steps.procedure_05_train(x_train)
pipe.add(encoding_steps.procedure_05_test)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

poly_reg = PolynomialFeatures(degree = 3)
x_poly = poly_reg.fit_transform(x_train)
lin_reg = LinearRegression()
lin_reg.fit(x_poly, y_train)

plt.scatter(x_train, y_train, color = 'red')
plt.plot(x_train, lin_reg_2.predict(poly_reg.fit_transform(x_train)), color = 'blue')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
