# Part-1: Exploratory Data Analysis
learning ref: https://www.youtube.com/watch?v=ioN1jcWxbv8; \
https://www.kaggle.com/biphili/feature-slection-lasso-ridge-linear-models \
https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_diabetes.html#sphx-glr-auto-examples-feature-selection-plot-select-from-model-diabetes-py \
https://www.kaggle.com/nitinkrsingh/advance-house-price-prediction/notebook; \
https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499; \
https://www.kaggle.com/markct/housing-price-prediction/notebook; \
https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

### Step-1. View and Explore the data

In [324]:
import pandas as pd
import numpy as np

dataset=pd.read_csv('/Users/suravi.mandal/train.csv')
 
# print shape of dataset with rows and columns
print(dataset.shape)

categorical_features=[feature for feature in dataset.columns if dataset[feature].dtypes=='O']
numerical_features = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
year_feature = [feature for feature in numerical_features if 'Yr' in feature or 'Year' in feature]
discrete_feature=[feature for feature in numerical_features if len(dataset[feature].unique())<25 and feature not in year_feature+['Id']]
continuous_feature=[feature for feature in numerical_features if feature not in discrete_feature+year_feature+['Id']]


(1460, 81)


# Part-2: Data Cleaning and Data Preparation

### step1-split the data into test data set and train data set
The advantages of splitting the data into two sets is Data Training and Data Transformation can be held on an independent set of records, this will prevent data leakage and data anomaly.

In [325]:
##split the data into test data set and train data set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(dataset,dataset['SalePrice'],test_size=0.1,random_state=0)
#splitted the data into 10% as test data and 90% as train data
print(X_train.shape) 
print(X_test.shape)

(1314, 81)
(146, 81)


## Step-2: Handle the Missing values in the feature data

### Handling the Categorical Features

In [326]:
##########For Categorical Features - replace the missing/null fields  with the label "Missing"
#view all the categorical features having missing values
features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtypes=='O']
for feature in features_nan:
    print("{}: {}% missing values".format(feature,np.round(dataset[feature].isnull().mean()*100,4)))

Alley: 93.7671% missing values
MasVnrType: 0.5479% missing values
BsmtQual: 2.5342% missing values
BsmtCond: 2.5342% missing values
BsmtExposure: 2.6027% missing values
BsmtFinType1: 2.5342% missing values
BsmtFinType2: 2.6027% missing values
FireplaceQu: 47.2603% missing values
GarageType: 5.5479% missing values
GarageFinish: 5.5479% missing values
GarageQual: 5.5479% missing values
GarageCond: 5.5479% missing values
PoolQC: 99.5205% missing values
Fence: 80.7534% missing values
MiscFeature: 96.3014% missing values


In [327]:
## Replace missing value with a new label
def replace_cat_feature(dataset,features_nan):
    data=dataset.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data
dataset=replace_cat_feature(dataset,features_nan)
#after replaceing, verify no missing values are there
print(dataset[features_nan].isnull().sum())

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64


### Handling the Categorical Features

In [328]:
#view all the categorical features having missing values
numerical_with_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtypes!='O']
for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature,np.around(dataset[feature].isnull().mean()*100,4)))

LotFrontage: 17.7397% missing value
MasVnrArea: 0.5479% missing value
GarageYrBlt: 5.5479% missing value


In [329]:
## Replacing the numerical Missing Values

for feature in numerical_with_nan:
    ## We will replace by using median since there are outliers
    median_value=dataset[feature].median()
    
    ## create a new feature to capture nan values
    dataset[feature+'nan']=np.where(dataset[feature].isnull(),1,0)
    dataset[feature].fillna(median_value,inplace=True)

# Handling the Temporal features

In [330]:
## Temporal Variables (Date Time Variables)
## It is more important to learn how many year since the House was sold, so we convert from year stamp to mention how many year 
## by simply subtracting these dated with YearSold


#for feature in year_feature:
for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
       
    dataset[feature]=dataset['YrSold']-dataset[feature]

# Handle Skewed Numerical Features

In [333]:
import numpy as np
num_features= ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

for feature in continuous_feature:
    dataset[feature]=np.log(dataset[feature])

In [334]:
##remove categorical variables that are present less than 1% of the observations
for feature in categorical_features:
    temp=dataset.groupby(feature)['SalePrice'].count()/len(dataset)
    temp_df=temp[temp>0.01].index
    dataset[feature]=np.where(dataset[feature].isin(temp_df),dataset[feature],'Rare_var')

In [335]:
for feature in categorical_features:
    labels_ordered=dataset.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    dataset[feature]=dataset[feature].map(labels_ordered)

In [336]:
#drop columns which have more than 90% missing values.
dataset.drop(['Alley','PoolQC','MiscFeature'], axis=1, inplace=True)

# Feature Scaling
Feature scaling is a method used to normalize the range of independent variables or features of data. In data processing, it is also known as data normalization and is generally performed during the data preprocessing step

In [337]:
scaling_feature=[feature for feature in dataset.columns if feature not in ['Id','SalePerice'] ]
#len(scaling_feature)
#to check how many independednt features are there to be scaled
#print('total number of scale features' , len(scaling_feature))
#print(scaling_feature)

In [338]:
feature_scale=[feature for feature in dataset.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[feature_scale])

MinMaxScaler()

In [339]:
scaler.transform(dataset[feature_scale])

array([[0.23529412, 0.75      , 0.55289564, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.62690092, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.75      , 0.56949989, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29411765, 0.75      , 0.55854802, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.56949989, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.60454218, ..., 0.        , 0.        ,
        0.        ]])

# Data Transform

In [340]:
# transform the train and test dataset into normalised form, and add on the Id and SalePrice variables
data = pd.concat([dataset[['Id', 'SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(dataset[feature_scale]), columns=feature_scale)],
                    axis=1)
#data.head(5)

### Export the tranformed data into X_train data set

In [341]:
data.to_csv('/Users/suravi.mandal/X_train.csv',index=False)

In [342]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

## for feature slection

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [343]:
transformed_dataset=pd.read_csv('/Users/suravi.mandal/PycharmProjects/pythonProject_test-house-price/X_train.csv')
print(transformed_dataset.shape)
transformed_dataset.head()

(1460, 84)


Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontageNaN,MasVnrAreaNaN,GarageYrBltNaN
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.5,0.036765,0.098361,0.0,0.0,1.0,1.0,0.5,0.1225,0.666667,1.0,1.0,0.75,0.75,0.25,1.0,0.125089,0.833333,0.0,0.064212,0.140098,1.0,1.0,1.0,1.0,0.356155,0.413559,0.0,0.577712,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.5,1.0,0.0,0.2,0.8,0.046729,0.666667,0.5,0.38646,0.666667,1.0,1.0,0.0,0.111517,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,1.0,0.5,0.0,0.5,0.2,1.0,0.75,0.6,0.555556,0.875,0.227941,0.52459,0.0,0.0,0.4,0.3,0.25,0.0,0.333333,1.0,0.5,0.75,0.75,1.0,0.666667,0.173281,0.833333,0.0,0.121575,0.206547,1.0,1.0,1.0,1.0,0.503056,0.0,0.0,0.470245,0.0,0.5,0.666667,0.0,0.375,0.333333,0.333333,0.333333,1.0,0.333333,0.6,0.8,0.28972,0.666667,0.5,0.324401,0.666667,1.0,1.0,0.347725,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,1.0,0.0,0.0,0.636364,0.4,1.0,0.75,1.0,0.666667,0.5,0.051471,0.114754,0.0,0.0,1.0,1.0,0.5,0.10125,0.666667,1.0,1.0,0.75,0.75,0.5,1.0,0.086109,0.833333,0.0,0.185788,0.150573,1.0,1.0,1.0,1.0,0.383441,0.41937,0.0,0.593095,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.333333,1.0,0.333333,0.6,0.8,0.065421,0.666667,0.5,0.428773,0.666667,1.0,1.0,0.0,0.076782,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,1.0,0.25,0.0,0.727273,0.4,1.0,0.75,1.0,0.666667,0.5,0.669118,0.606557,0.0,0.0,0.2,0.4,0.25,0.0,0.333333,1.0,0.25,0.5,1.0,0.25,0.666667,0.038271,0.833333,0.0,0.231164,0.123732,1.0,0.75,1.0,1.0,0.399941,0.366102,0.0,0.579157,0.333333,0.0,0.333333,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.333333,0.8,0.4,0.074766,0.333333,0.75,0.45275,0.666667,1.0,1.0,0.0,0.063985,0.492754,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,1.0,0.5,0.0,1.0,0.4,1.0,0.75,1.0,0.777778,0.5,0.058824,0.147541,0.0,0.0,1.0,1.0,0.5,0.21875,0.666667,1.0,1.0,0.75,0.75,0.75,1.0,0.116052,0.833333,0.0,0.20976,0.187398,1.0,1.0,1.0,1.0,0.466237,0.509927,0.0,0.666523,0.333333,0.0,0.666667,0.5,0.5,0.333333,0.666667,0.583333,1.0,0.333333,0.6,0.8,0.074766,0.666667,0.75,0.589563,0.666667,1.0,1.0,0.224037,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0


## Export the tranformed data into Y_train data set

In [344]:
## Capture the dependent feature in y_train dataset
y_train=transformed_dataset[['SalePrice']]

In [345]:
## drop dependent feature from X_train dataset
X_train=transformed_dataset.drop(['Id','SalePrice'],axis=1)

In [None]:
#y_train.shape, X_train.shape

# Part-3 Feature Selection

In [346]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

## for feature slection

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel


### Apply Feature Selection with the Lasso Regression model, to select a suitable alpha (equivalent of penalty),he bigger the alpha the less features that will be selected.
### Use the selectFromModel object from sklearn, which will select the features for which coefficients are non-zero


feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # to apply the same random state to other filed
feature_sel_model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [347]:
#Visualising features that were kept by the lasso regularisation
feature_sel_model.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [348]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler


# make a list of the selected features
selected_feat = X_train.columns[(feature_sel_model.get_support())]
#print('test123',sel_.estimator_)

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))


total features: 82
selected features: 21


In [349]:
selected_feat

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'YearRemodAdd',
       'RoofStyle', 'BsmtQual', 'BsmtExposure', 'HeatingQC', 'CentralAir',
       '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive',
       'SaleCondition'],
      dtype='object')

In [350]:
X_train=X_train[selected_feat]

In [351]:
print('total number of selected features',X_train.shape)

total number of selected features (1460, 21)


In [352]:
import csv
import json
csvFilePath = '/Users/suravi.mandal/selected.csv'
jsonFilePath = '/Users/suravi.mandal/'
# Function to convert a CSV to JSON
# Takes the file paths as arguments
def make_json(csvFilePath, jsonFilePath):
	
	# create a dictionary
	data = {}
	
	# Open a csv reader called DictReader
	with open(csvFilePath, encoding='utf-8') as csvf:
		csvReader = csv.DictReader(csvf)
		
		# Convert each row into a dictionary
		# and add it to data
		for rows in csvReader:
			
			# Assuming a column named 'No' to
			# be the primary key
			key = rows['Id']
			data[key] = rows

	# Open a json writer, and use the json.dumps()
	# function to dump data
	with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
		jsonf.write(json.dumps(data, indent=4))
		
# Driver Code

# Decide the two file paths according to your
# computer system
csvFilePath = '/Users/suravi.mandal/selected.csv'
jsonFilePath = '/Users/suravi.mandal/selectedJsonObject.json'

# Call the make_json function
make_json(csvFilePath, jsonFilePath)
