<a href="https://colab.research.google.com/github/terrence-brian/ML-Task-2/blob/main/ML_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [192]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.decomposition import PCA

# Importing Dataset to Colab

In [108]:
from google.colab import files
uploaded = files.upload()

Saving modified_data.csv to modified_data (2).csv


In [109]:
data = pd.read_csv('modified_data.csv')

In [110]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


# Preparing Dataset for Training

In [111]:
#Drop Id column
data.drop('Id',inplace=True,axis=1)

In [112]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


Size of the dataset

In [113]:
data.shape

(1460, 76)

In [114]:
len(data)

1460

In [115]:
#Checking for any missing values in existing columns
#data.isna().any()
data.isnull().any()

MSSubClass       False
MSZoning         False
LotFrontage       True
LotArea          False
Street           False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 76, dtype: bool

In [116]:
data.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 76, dtype: int64

In [117]:
original_features = list(data.columns)
original_features
len(original_features)

76

In [118]:
features_with_missing = list(data.columns[data.isna().any()])
features_with_missing
len(features_with_missing)

15

In [119]:
data[features_with_missing].isna().sum()

LotFrontage     259
MasVnrType        8
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
FireplaceQu     690
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64

In [120]:
#Checking percentage of missing values in a column
for feature in features_with_missing:
  print(feature,' ',(data[feature].isna().sum())/len(data)*100)

LotFrontage   17.73972602739726
MasVnrType   0.547945205479452
MasVnrArea   0.547945205479452
BsmtQual   2.5342465753424657
BsmtCond   2.5342465753424657
BsmtExposure   2.6027397260273974
BsmtFinType1   2.5342465753424657
BsmtFinType2   2.6027397260273974
Electrical   0.0684931506849315
FireplaceQu   47.26027397260274
GarageType   5.5479452054794525
GarageYrBlt   5.5479452054794525
GarageFinish   5.5479452054794525
GarageQual   5.5479452054794525
GarageCond   5.5479452054794525


# Filling in missing values

In [121]:
#mean_fill to fill missing values with mean
#bfill to fill missing value by moving first observed non-value backward
#ffill to fill missing value by moving the last observed non-value forward
mean_fill=['LotFrontage','MasVnrArea']
bfill=['FireplaceQu','Electrical','BsmtCond']
ffill=list(set(features_with_missing)-set(mean_fill)-set(bfill))

In [122]:
mean_fill

['LotFrontage', 'MasVnrArea']

In [123]:
for feature in mean_fill:
  data[feature].fillna(data[feature].mean(),inplace=True)

In [124]:
for feature in ffill:
  data[feature].fillna(method='ffill',inplace=True)

In [125]:
for feature in bfill:
  data[feature].fillna(method='bfill',inplace=True)

In [126]:
data.isnull().any()

MSSubClass       False
MSZoning         False
LotFrontage      False
LotArea          False
Street           False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 76, dtype: bool

# Encoding the Dataset

In [127]:
#To check the datatypes of the values
data.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 76, dtype: object

In [128]:
categorical_data = data.select_dtypes(include=['object']).copy()
categorical_data.head(5)

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,TA,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal


In [129]:
#Listing categorical data for computational purposes
categorical_features = list(categorical_data.columns)
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [130]:
#Separating the features into nominal, ordinal and numerical features
nominal=['MSZoning','LandContour','LotConfig','Neighborhood']
ordinal=list(set(categorical_features)-set(nominal))
numerical=list(set(original_features)-set(categorical_features))
target=['SalePrice']

In [131]:
data[target]

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125


In [132]:
for feature in ordinal:
  data[feature]=(data[feature].astype('category')).cat.codes

In [133]:
#Converts categorical data into dummy data
df_nominal=pd.get_dummies(data[nominal])

In [134]:
#Converts ordinal data
df_ordinal=data[ordinal]

In [135]:
#Converts numerical data
df_numerical=data[numerical]

In [136]:
# Combining nominal, ordinal and numerical data to create new dataset
encoded_data=pd.concat([df_numerical,df_nominal,df_ordinal],axis=1)

In [137]:
encoded_data

Unnamed: 0,BsmtUnfSF,LowQualFinSF,1stFlrSF,GarageArea,Fireplaces,GarageYrBlt,MiscVal,MoSold,3SsnPorch,2ndFlrSF,...,BsmtQual,SaleCondition,Foundation,BsmtCond,RoofStyle,FireplaceQu,RoofMatl,BldgType,Condition1,Functional
0,150,0,856,548,0,2003.0,0,2,0,854,...,2,4,2,3,1,4,1,0,2,6
1,284,0,1262,460,1,1976.0,0,5,0,0,...,2,4,1,3,1,4,1,0,1,6
2,434,0,920,608,1,2001.0,0,9,0,866,...,2,4,2,3,1,4,1,0,2,6
3,540,0,961,642,1,1998.0,0,2,0,756,...,3,0,0,1,1,2,1,0,2,6
4,490,0,1145,836,1,2000.0,0,12,0,1053,...,2,4,2,3,1,4,1,0,2,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,953,0,953,460,1,1999.0,0,8,0,694,...,2,4,2,3,1,4,1,0,2,6
1456,589,0,2073,500,2,1978.0,0,2,0,0,...,2,4,1,3,1,4,1,0,2,2
1457,877,0,1188,252,2,1941.0,2500,5,0,1152,...,3,4,4,1,1,2,1,0,2,6
1458,0,0,1078,240,0,1950.0,0,4,0,0,...,3,4,1,3,3,-1,1,0,2,6


# Standardize the Dataset

In [138]:
encoded_data[target]

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125


In [139]:
#Drop the target feature
X = encoded_data.drop('SalePrice',axis=1)

In [140]:
Y = data[target]
Y

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125


In [141]:
#Convert values into numpy
X = X.to_numpy()
Y = Y.to_numpy()

In [142]:
X.shape

(1460, 110)

In [143]:
Y.shape

(1460, 1)

In [213]:
#Standardize the target features
X_std = StandardScaler().fit_transform(X)
Y_std = StandardScaler().fit_transform(Y)

In [214]:
X.shape

(1460, 110)

In [215]:
Y.shape

(1460, 1)

# Performing feature extraction using PCA

In [147]:
#Extracting 3 components
pca = PCA(3)

In [216]:
#Fitting and transforming the target feature
X_pca = pca.fit_transform(X_std)
X_pca

array([[ 2.4085146 , -1.32715441],
       [ 0.43237772,  2.62476273],
       [ 2.8286253 , -0.85223939],
       ...,
       [ 1.25764877, -0.87317658],
       [-2.8278745 ,  3.18415189],
       [-0.75647481,  2.68675338]])

In [149]:
#Ratio of variance
pca.explained_variance_ratio_

array([0.10206872, 0.04345772, 0.04062114])

# Performing Feature Selection

In [187]:
reg = LassoCV(cv = 5)

In [188]:
reg.fit(X_pca,Y)

  y = column_or_1d(y, warn=True)


LassoCV(cv=5)

In [189]:
X_pca

array([[ 2.4085181 , -1.32642131, -1.04359333],
       [ 0.43237024,  2.62514079, -1.50113527],
       [ 2.82862977, -0.85193373, -0.89036497],
       ...,
       [ 1.25763915, -0.87367448,  2.71413074],
       [-2.82786855,  3.18201349, -1.44883239],
       [-0.75647364,  2.68495343, -1.35276183]])

In [190]:
Y

array([[ 0.34727322],
       [ 0.00728832],
       [ 0.53615372],
       ...,
       [ 1.07761115],
       [-0.48852299],
       [-0.42084081]])

In [191]:
#Getting the model score
reg.score(X_pca, Y)

0.7744157740285007

In [202]:
#Find L1 coefficients using regressor
pca_data = pd.DataFrame(X_pca, columns=['PC1','PC2','PC3'])
target_data = pd.DataFrame(Y, columns=['SalePrice'])
final_data = pd.concat([pca_data,target_data],axis=1)
final_data

Unnamed: 0,PC1,PC2,PC3,SalePrice
0,2.408518,-1.326421,-1.043593,0.347273
1,0.432370,2.625141,-1.501135,0.007288
2,2.828630,-0.851934,-0.890365,0.536154
3,-1.366039,0.545870,2.357711,-0.515281
4,5.312250,-0.226227,1.414212,0.869843
...,...,...,...,...
1455,1.306454,-2.289425,-0.043019,-0.074560
1456,1.095921,3.174607,0.428643,0.366161
1457,1.257639,-0.873674,2.714131,1.077611
1458,-2.827869,3.182013,-1.448832,-0.488523


In [201]:
# Get the regressor cofficients for each column
coef  = pd.Series(reg.coef_, index = pca_data.columns)
num_of_selected_features = sum(coef != 0)
num_of_selected_features

3

# Training the regression model

In [203]:
reg = LassoCV()

In [238]:
#Getting PCA with 2 components
pca = PCA(2)
X_pca = pca.fit_transform(X_std)
X_pca_data = pd.DataFrame(X_pca)

In [239]:
#Reshaping the target feature
reg.fit(X_pca_data,Y)

  y = column_or_1d(y, warn=True)


LassoCV()

In [236]:
reg.coef_

array([0.25352345, 0.0172657 ])

In [237]:
reg.intercept_

1.2409822014911733e-16