# Project - House Prices: Advanced Regression Techniques

The main aim of this project is to predict the house price based on various features.

## The Lifecycle of a Data Science Project
1. Data Analysis
2. Feature Engineering
3. Feature Selection
4. Model Building
5. Model Deployment

In [None]:
# Import Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
# Load Data

def read_data(data_dir):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_dir : str, path or file-like
        Location of the CSV; it is handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(data_dir)

In [None]:
# Load the training data; the relative path is resolved against the current working directory.
dataset = read_data('train.csv')

In [None]:
# Inspect Dataset
# dataset.shape is a (number of rows, number of columns) tuple.
print("Dataset's Shape is: ", dataset.shape)

In [None]:
# Inspect the top 5 rows
# (the value of the last expression in a cell is what the notebook renders)
dataset.head()

## Step 1: Data Analysis

### Steps to consider during a Data Analysis Project
1. Missing Values
2. All The Numerical Variables
3. Distribution of the Numerical Variables
4. Categorical Variables
5. Cardinality of Categorical Variables
6. Outliers
7. Relationship between the independent features and the dependent feature (SalePrice)

#### 1. Missing Values

In [None]:
# Percentage of missing values per column (only columns that contain NA values)
# Step 1: collect every column that has at least one null value
columns_with_na = [column for column in dataset.columns if dataset[column].isna().any()]
# Step 2: report the share of nulls.
# isna().mean() is the null fraction in [0, 1]; it is multiplied by 100 so the
# printed number really is the percentage that the "%" label promises.
for col in columns_with_na:
    print(col, round(dataset[col].isna().mean() * 100, 2), " % NULL Values")

### Step 1: Missing Values - Map the relationship between NULL values and the target variable SalePrice

In [None]:
# For each column with missing data, compare the mean SalePrice of the rows
# where the value is missing (flag 1) with the rows where it is present (flag 0).
for column in columns_with_na:

    # Work on a copy so the 0/1 flag never overwrites the real column.
    data = dataset.copy()
    data[column] = data[column].isna().astype(int)

    # Mean SalePrice per flag value, drawn as a bar chart
    mean_price_by_flag = data.groupby(column)['SalePrice'].mean()
    mean_price_by_flag.plot.bar()
    plt.title(column)
    plt.show()

### Step 2. Numerical Values

In [None]:
# numerical cols is a pandas Dataframe
# It holds only the columns of `dataset` whose dtype is numeric.

numerical_cols = dataset.select_dtypes(include = 'number')

# Preview the first rows of the numeric subset
numerical_cols.head()

In [None]:
# date_cols is a plain list holding the names of the numeric columns that
# contain 'Yr' or 'Year' in their name.

date_cols = [
    name
    for name in numerical_cols.columns
    if any(token in name for token in ('Yr', 'Year'))
]

date_cols

In [None]:
# Line plot of the mean SalePrice for every value of each year column
for col in date_cols:

    yearly_mean = dataset.groupby(col)['SalePrice'].mean()
    axes = yearly_mean.plot()
    axes.set_xlabel(col)
    axes.set_ylabel('SalePrice')
    plt.show()

### 2.a. Numerical Values - Discrete Variables

In [None]:
# A numeric, non-date column is treated as discrete when it has fewer than
# 25 distinct values (a missing value counts as one distinct value).
numerical_disc_col = [
    col
    for col in numerical_cols.columns
    if col not in date_cols and dataset[col].nunique(dropna=False) < 25
]
print("The number of discrete numerical columns is: ", len(numerical_disc_col))

In [None]:
# Display the names of the discrete numerical columns
numerical_disc_col

In [None]:
# Relationship between Discrete Numerical Cols and SalePrice:
# bar chart of the mean SalePrice for every value of each discrete column

for col in numerical_disc_col:
    # groupby/mean only reads the frame, so no per-iteration copy is needed
    dataset.groupby(col)['SalePrice'].mean().plot.bar()
    plt.xlabel(col)
    plt.ylabel('SalePrice')
    plt.title(col)
    plt.show()

### 2.b. Numerical Values - Continuous Variables

In [None]:
# A numeric, non-date column is treated as continuous when it has at least
# 25 distinct values (a missing value counts as one distinct value).
numerical_cont_col = [
    col
    for col in numerical_cols.columns
    if col not in date_cols and dataset[col].nunique(dropna=False) >= 25
]
print("The number of continuous numerical columns is: ", len(numerical_cont_col))

In [None]:
# Display the names of the continuous numerical columns
numerical_cont_col

In [None]:
# Distribution of each continuous numerical column (25-bin histogram).
# These plots show value counts only; SalePrice is not involved here.

for col in numerical_cont_col:
    # hist() only reads the column, so no per-iteration copy is needed
    dataset[col].hist(bins=25)
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.title(col)
    plt.show()

### Step 3. Categorical Values

In [None]:
# Names of all columns stored with the object ('O') dtype
categorical_cols = [name for name, dtype in dataset.dtypes.items() if dtype == 'O']
categorical_cols

In [None]:
# Cardinality: number of distinct labels in each categorical column
for col in categorical_cols:
    label_count = dataset[col].unique().size
    print("The number of categorical values for the column: ", col, " is", label_count)

In [None]:
# Bar chart of the mean SalePrice for every label of each categorical column
for col in categorical_cols:

    # groupby/mean only reads the frame, so no per-iteration copy is needed
    dataset.groupby(col)['SalePrice'].mean().plot.bar()
    plt.xlabel(col)
    plt.ylabel('SalePrice')
    plt.title(col)
    plt.show()

### Step 4. Logarithmic Transformation

In [None]:
## Log-log scatter plots: each continuous feature against SalePrice,
## both on a natural-log scale.

# The target is log-transformed once, outside the loop. This also guarantees
# it is logged exactly once when the loop reaches the SalePrice column itself.
log_sale_price = np.log(dataset['SalePrice'])

for col in numerical_cont_col:
    # log(0) is -inf, so columns that contain a zero are skipped
    if (dataset[col] == 0).any():
        continue

    plt.scatter(np.log(dataset[col]), log_sale_price)
    plt.xlabel(col)
    plt.ylabel('SalePrice')
    plt.title(col)
    plt.show()
        

### Step 5. Outliers

In [None]:
# Box plot of every zero-free continuous column on a natural-log scale
for col in numerical_cont_col:
    data = dataset.copy()

    # Columns containing a zero are skipped (their log would be -inf)
    if 0 in data[col].unique():
        continue

    data[col] = np.log(data[col])
    data.boxplot(column=col)
    plt.ylabel(col)
    plt.title(col)
    plt.show()
        
    

## Feature Engineering

We will be performing all the below steps in Feature Engineering

1. Missing values
2. Temporal variables
3. Categorical variables: remove rare labels
4. Standardise the values of the variables to the same range

In [None]:
# Data leakage is possible, so the data is split first and feature engineering
# should be applied afterwards.
#
# The target column is removed from the feature frames; otherwise SalePrice
# would be present both as a feature and as the label.
# NOTE(review): the following cells transform `dataset` rather than
# X_train / X_test - confirm which frame the feature engineering should use.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.drop(columns=['SalePrice']),
    dataset['SalePrice'],
    test_size=0.1,
    random_state=0,
)

In [None]:
# (rows, columns) of the training and test feature frames
X_train.shape, X_test.shape

### Missing Values - Categorical Variables

In [None]:
# Categorical columns that contain at least one missing value.
# (any() also catches columns with exactly one null, which `> 1` would skip.)
categorical_features_nan = [feature for feature in categorical_cols if dataset[feature].isnull().any()]

In [None]:
# Share of missing values per categorical column.
# isnull().mean() is a fraction in [0, 1]; it is multiplied by 100 so the
# number printed in front of "%" is a real percentage.
for feature in categorical_features_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 2)))

In [None]:
def replace_cat_feature(dataset,features_nan):
    """Return a copy of ``dataset`` with nulls in ``features_nan`` set to 'Missing'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is not modified.
    features_nan : list
        Names of the columns whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        A copy in which every null of the listed columns is the string
        'Missing'; all other columns are unchanged.
    """
    filled_columns = dataset[features_nan].fillna('Missing')
    result = dataset.copy()
    result[features_nan] = filled_columns
    return result

# Rebind `dataset` to the copy in which missing categorical values are filled
dataset=replace_cat_feature(dataset,categorical_features_nan)

# Sanity check: remaining null count per filled column
dataset[categorical_features_nan].isnull().sum()

In [None]:
# Preview the first rows after the categorical fill
dataset.head()

### Missing Values - Numerical Variables

In [None]:
# Numeric, non-date columns that contain at least one missing value.
# - the dtype filter keeps object columns out, because these columns are later
#   summarised with a median;
# - any() also catches columns with exactly one null, which `> 1` would skip.
num_features_nan = [
    feature
    for feature in dataset.columns
    if dataset[feature].dtype != 'O'
    and feature not in date_cols
    and dataset[feature].isnull().any()
]

# isnull().mean() is a fraction in [0, 1]; it is multiplied by 100 so the
# number printed in front of "%" is a real percentage.
for feature in num_features_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 2)))

In [None]:
def replace_num_feature(dataframe,features_nan):
    """Return a copy of ``dataframe`` with nulls in ``features_nan`` median-filled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    features_nan : list
        Names of the numeric columns whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        A copy in which each listed column has its nulls replaced by that
        column's own median; all other columns are unchanged.
    """
    # Operate on the frame that was passed in, not on a module-level global.
    data = dataframe.copy()
    # One median per listed column; fillna aligns them by column name.
    median_data_features_nan = data[features_nan].median()
    data[features_nan] = data[features_nan].fillna(median_data_features_nan)
    return data

# Rebind `dataset` to the frame returned by the median fill
dataset = replace_num_feature(dataset, num_features_nan)
# Sanity check: remaining null count per filled column
dataset[num_features_nan].isnull().sum()

### Numerical Variables - Log Transformations

In [None]:
# Non-object columns that contain no zero value (safe inputs for a log transform)
num_wout_zeros = [
    feature
    for feature in dataset.columns
    if dataset[feature].dtype != 'O' and not dataset[feature].isin([0]).any()
]

In [None]:
# Display the names of the zero-free numeric columns
num_wout_zeros

In [None]:
# Replace every zero-free numeric column with its natural logarithm (in place on `dataset`)
dataset[num_wout_zeros] = np.log(dataset[num_wout_zeros])

## Feature Selection

In [None]:
# Target: the double brackets keep SalePrice as a one-column DataFrame
y_train=dataset[['SalePrice']]

In [None]:
## Feature frame: everything except the row identifier and the dependent feature
X_train = dataset.drop(columns=['Id', 'SalePrice'])

In [None]:
### Apply Feature Selection
# A Lasso regression serves as the selector's estimator; alpha is its penalty
# strength. The larger the alpha, the fewer features are selected.
#
# SelectFromModel then keeps the features whose coefficients are non-zero.

# random_state is fixed so that the fit is reproducible
lasso_estimator = Lasso(alpha=0.005, random_state=0)
feature_sel_model = SelectFromModel(lasso_estimator)
feature_sel_model.fit(X_train, y_train)

In [None]:
# Selection mask from the fitted selector, one entry per column of X_train
feature_sel_model.get_support()

In [None]:
# Report how many features the selector kept

# get_support() is a mask over X_train's columns; indexing the column index
# with it yields the names of the selected features
selected_feat = X_train.columns[feature_sel_model.get_support()]

# Summary statistics. The fitted Lasso is reached through
# feature_sel_model.estimator_ (the selector object fitted above).
print('total features: {}'.format(X_train.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(feature_sel_model.estimator_.coef_ == 0)))

In [None]:
# Display the names of the selected features
selected_feat

In [None]:
# Keep only the columns chosen by the feature selector
X_train=X_train[selected_feat]

In [None]:
# Preview the first rows of the reduced feature frame
X_train.head()