In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
import torchvision
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import torch.optim as optim
import matplotlib.pyplot as plt
import time

Reading and observing the dataset at hand

In [None]:
ls

In [None]:
# Read the CSV and discard the 'Id' column up front so it never enters the feature set.
df = pd.read_csv('house prices.csv').drop(columns='Id')

In [None]:
df.head(3)

In [None]:
df.info()

In [None]:
def show_nans(frame=None):
    """Count missing values per column, most-affected columns first.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        so existing ``show_nans()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``frame`` with a single ``'nan-count'`` column,
        sorted in descending order of that count.
    """
    if frame is None:
        frame = df  # fall back to the notebook's working frame
    nan_df = frame.isna().sum(axis=0).to_frame('nan-count')
    return nan_df.sort_values('nan-count', ascending=False)

Handling missing values: where a column has its own explicit "not available" category, replace NaN with the `'NA'` label; for the numeric `LotFrontage` column, fill NaN with the column mean.

In [None]:
# Columns whose missing entries are relabelled with the literal string 'NA'.
na_category_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
    'BsmtFinType2', 'BsmtFinType1', 'BsmtExposure', 'BsmtCond', 'BsmtQual',
]
# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: the in-place form on `df[col]` is chained assignment, which
# pandas deprecates and which does not modify `df` under Copy-on-Write.
df[na_category_columns] = df[na_category_columns].fillna('NA')
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())  # should be experimented
show_nans()

In [None]:
df.drop('GarageYrBlt', axis=1, inplace=True)

In [None]:
# Discard rows that still contain NaNs, then exact duplicate rows, and renumber the index from 0.
df = df.dropna().drop_duplicates().reset_index(drop=True)

In [None]:
df.shape

In [None]:
number_df = df.select_dtypes('number')
object_df = df.select_dtypes('object')

In [None]:
number_df.shape, object_df.shape

In [None]:
object_df.head(3)

In [None]:
number_df.head(3)

In [None]:
# Swap every object-typed column for the integer codes returned by pd.factorize.
for column in list(object_df.columns):
    codes, _uniques = pd.factorize(object_df[column])
    object_df[column] = codes

In [None]:
df = pd.concat([number_df, object_df], axis=1)

In [None]:
sns.histplot(np.log(df['SalePrice']))

In [None]:
def scaler_features(df, log_target=False, target_col='SalePrice'):
    """Min-max scale every feature column and re-attach the (optionally logged) target.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric frame containing the features and ``target_col``.
        The input frame is not modified.
    log_target : bool, default False
        When True, the target is transformed with ``np.log1p``.
    target_col : str, default 'SalePrice'
        Name of the column that is excluded from scaling and appended last.

    Returns
    -------
    pandas.DataFrame
        Scaled features followed by the target column, on a fresh 0..n-1 index.

    Notes
    -----
    The scaler is fit on every row passed in. Calling this before the
    train/test split means the test rows influence the scaling ranges.
    """
    feature_frame = df.drop(columns=target_col)
    target = df[target_col].reset_index(drop=True)
    if log_target:
        target = np.log1p(target)
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(feature_frame),
        columns=feature_frame.columns,
    )
    # Both pieces carry a default RangeIndex, so the column-wise concat lines up row by row.
    return pd.concat([scaled, target], axis=1)

### First Part

In [None]:
# Compute the correlation matrix once and reuse it for both the data and the mask.
corr = df.corr()
# Boolean upper-triangle mask (diagonal included): hides the redundant half of the
# symmetric matrix regardless of the correlation values stored there.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(15, 10))
sns.heatmap(corr, mask=upper_triangle, cmap='Blues')

In [None]:
d = scaler_features(df, log_target=True)

In [None]:
y_df = d.pop('SalePrice')
X_df = d

In [None]:
X_df

In [None]:
y_df

Linear regression model

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_df, y_df)

In [None]:
intercept = regressor.intercept_
features = pd.DataFrame(regressor.coef_, X_df.columns, columns=['coefficient'])
features.sort_values('coefficient', ascending=False).head()

In [None]:
# Importance of a feature = |regression coefficient| * standard deviation of that feature.
features['coefficient'] = features['coefficient'].abs()
# X_df.std() is indexed by feature name, so the assignment aligns on the index of
# `features`; no manual loop or reshape is required.
features['stdev'] = X_df.std()
features['importance'] = features['coefficient'] * features['stdev']

In [None]:
features['importance_normalized'] = 100*features['importance'] / features['importance'].max()

In [None]:
features = features.sort_values('importance_normalized', ascending=False).head(10)

In [None]:
plt.figure(figsize=(15, 10))
plt.bar(features.index, features.importance_normalized)

Decision Tree model

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so repeated runs build the same tree; 707 matches the seed used
# for the train/test sampling later in this notebook.
dtr = DecisionTreeRegressor(max_depth=5, random_state=707)
dtr.fit(X_df, y_df)

In [None]:
# Read the estimator's public attribute instead of calling into the low-level tree object.
feature_importance = pd.Series(dtr.feature_importances_, index=X_df.columns)

In [None]:
feature_importance = feature_importance.sort_values(ascending=False).head(10)

In [None]:
plt.figure(figsize=(15, 10))
plt.bar(feature_importance.index, feature_importance.values)

---

### Backward Elimination

In [None]:
# Constant column of ones so the OLS fit in the next cells has an intercept term.
# assign() does not depend on index alignment and overwrites an existing 'bias'
# column, so re-running this cell does not append a duplicate.
X_df = X_df.assign(bias=1.0)

In [None]:
import statsmodels.api as sm

In [None]:
# Backward elimination: refit OLS, drop the column with the largest p-value above
# the threshold, and repeat until every remaining column is significant.
start_time = time.time()
X_dff = X_df.copy()
p_value_threshold = 0.05
non_usefull_feature = []
while X_dff.shape[1] > 0:  # stop instead of fitting on an empty design matrix
    model = sm.OLS(y_df, X_dff)
    results = model.fit()
    p_values = results.pvalues
    # max()/idxmax() work on labels, avoiding positional `pvalues[i]` lookups on a
    # label-indexed Series; both skip NaN p-values.
    highest_p_value = p_values.max()
    # Written as `not (a > b)` so a NaN maximum also ends the loop.
    if not highest_p_value > p_value_threshold:
        print(p_values)
        break
    worst_feature = p_values.idxmax()
    print('removing {} feature'.format(worst_feature))
    non_usefull_feature.append(worst_feature)
    X_dff = X_dff.drop(columns=worst_feature)
end_time = time.time()
print('Time took for the operation of backward elimination: {}s'.format(np.round(end_time - start_time, 3)))

### Model

In [None]:
def to_dataloader(df, target_col, batch_size, shuffle=True):
    """Wrap a numeric DataFrame in a PyTorch ``DataLoader`` of (features, target) batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can all be cast to float32.
    target_col : str
        Column used as the regression target; every other column is a feature.
    batch_size : int
        Number of rows per batch.
    shuffle : bool, default True
        Whether the loader reshuffles rows on each pass. Pass ``False`` for
        evaluation data to get batches in row order.

    Returns
    -------
    torch.utils.data.DataLoader
        Yields ``(data, target)`` float32 tensors; ``target`` is 1-D.
    """
    target = torch.tensor(df[target_col].to_numpy(dtype=np.float32))
    data = torch.tensor(df.drop(columns=target_col).to_numpy(dtype=np.float32))

    dataset = torch.utils.data.TensorDataset(data, target)
    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)

Definition of the model

In [None]:
class Regressor_nn(nn.Module):
    """Fully connected regression network with a configurable stack of hidden layers.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : sequence of int
        Width of each hidden layer; the first ``n_layers`` entries are used.
    n_layers : int
        Number of hidden layers (each followed by the activation).
    output_size : int
        Width of the final linear layer.
    activation_function : callable
        Applied to the output of every hidden layer, e.g. ``F.relu``.

    Raises
    ------
    ValueError
        If ``n_layers`` is below 1 or ``hidden_size`` has fewer than
        ``n_layers`` entries.
    """

    def __init__(self, input_size, hidden_size, n_layers, output_size, activation_function):
        super().__init__()
        if n_layers < 1 or len(hidden_size) < n_layers:
            raise ValueError(
                'n_layers must be >= 1 and hidden_size must provide at least n_layers widths'
            )
        self.activation_function = activation_function
        self.n_layers = n_layers
        # Layer widths from the input through the last hidden layer that is actually used.
        widths = [input_size] + list(hidden_size[:n_layers])
        self.fcs = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )
        # The output layer is fed by the last *used* hidden width, which keeps the
        # shapes consistent even when hidden_size is longer than n_layers.
        self.fcs.append(nn.Linear(widths[-1], output_size))

    def forward(self, x):
        """Run ``x`` through the hidden layers and the output layer.

        Dimension 1 of the result is squeezed, so with ``output_size == 1`` the
        output has shape ``(batch,)``.
        """
        # Iterate over the module's own layers instead of a notebook-level
        # `n_layers` global, so the model does not depend on outside state.
        for layer in self.fcs[:-1]:
            x = self.activation_function(layer(x))
        return self.fcs[-1](x).squeeze(1)

In [None]:
def train(model, train_loader, test_loader, criterion_mse, criterion_mae, optimizer, n_epochs=None):
    """Train ``model`` on MSE and record train/test MSE and MAE after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise in place.
    train_loader, test_loader : iterable of (data, targets)
        Batches used for optimisation and for the per-epoch evaluation.
    criterion_mse : callable
        Loss that is back-propagated.
    criterion_mae : callable
        Loss that is only tracked, never back-propagated.
    optimizer : torch.optim.Optimizer
        Optimizer bound to ``model``'s parameters.
    n_epochs : int, optional
        Number of passes over ``train_loader``. When omitted, the
        notebook-level ``n_epochs`` variable is used, as before.

    Returns
    -------
    tuple of four lists
        Per-epoch mean train MSE, train MAE, test MSE and test MAE.
    """
    if n_epochs is None:
        # Backward compatibility: earlier calls relied on the notebook global.
        n_epochs = globals()['n_epochs']

    start_time = time.time()
    epochs_loss_mse = []
    epochs_loss_mae = []
    epochs_loss_mse_test = []
    epochs_loss_mae_test = []
    # Make the training mode explicit rather than relying on the state the caller left behind.
    model.train()
    for epoch in tqdm(range(n_epochs), leave=False):
        batchs_loss_mse = []
        batchs_loss_mae = []
        for data, targets in train_loader:
            outputs = model(data)
            mse = criterion_mse(outputs, targets)
            optimizer.zero_grad()
            mse.backward()
            optimizer.step()

            # MAE is tracked for reporting only, using the predictions made before this step.
            with torch.no_grad():
                mae = criterion_mae(outputs, targets)
            batchs_loss_mse.append(mse.item())
            batchs_loss_mae.append(mae.item())

        epochs_loss_mse.append(np.mean(batchs_loss_mse))
        epochs_loss_mae.append(np.mean(batchs_loss_mae))
        test_mse, test_mae, _, _ = check_accuracy(model, test_loader, criterion_mse, criterion_mae)
        epochs_loss_mse_test.append(test_mse)
        epochs_loss_mae_test.append(test_mae)
    end_time = time.time()
    print('Time took for the operation of training: {}s'.format(np.round(end_time - start_time, 3)))
    return epochs_loss_mse, epochs_loss_mae, epochs_loss_mse_test, epochs_loss_mae_test

In [None]:
def check_accuracy(model, loader, criterion_mse, criterion_mae):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate; it is switched to eval mode for the pass and put
        back into whichever mode it was in on entry.
    loader : iterable of (data, targets)
        Batches to evaluate on.
    criterion_mse, criterion_mae : callable
        Losses computed for every batch.

    Returns
    -------
    tuple
        ``(mean_mse, mean_mae, outputs_agg, targets_agg)`` where the means are
        the averages of the per-batch losses (NaN for an empty loader) and the
        arrays are the flattened predictions and targets in iteration order.
    """
    was_training = model.training
    model.eval()
    batchs_loss_mse = []
    batchs_loss_mae = []
    output_chunks = []
    target_chunks = []
    with torch.no_grad():
        for data, targets in loader:
            outputs = model(data)
            # Collect chunks and join once at the end instead of re-allocating
            # the whole array with np.append on every batch.
            output_chunks.append(outputs.numpy().ravel())
            target_chunks.append(targets.numpy().ravel())
            batchs_loss_mse.append(criterion_mse(outputs, targets).item())
            batchs_loss_mae.append(criterion_mae(outputs, targets).item())
    # Restore the caller's mode rather than unconditionally forcing train mode.
    model.train(was_training)

    if batchs_loss_mse:
        mean_mse = np.mean(batchs_loss_mse)
        mean_mae = np.mean(batchs_loss_mae)
    else:
        mean_mse = mean_mae = float('nan')
    # Report the averages over all batches, not only the final batch's losses.
    print('mse error :{}, mae error :{}'.format(mean_mse, mean_mae))

    # The leading empty float64 array keeps the result dtype and the
    # empty-loader result the same as the np.append-based accumulation.
    outputs_agg = np.concatenate([np.array([])] + output_chunks)
    targets_agg = np.concatenate([np.array([])] + target_chunks)
    return mean_mse, mean_mae, outputs_agg, targets_agg

In [None]:
def plot_losses(epochs_loss_mse, epochs_loss_mae, epochs_loss_mse_test, epochs_loss_mae_test, title):
    """Draw train/test loss curves side by side: MSE on the left, MAE on the right.

    Each of the four sequences holds one value per epoch; ``title`` becomes
    the figure-level title.
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle(title)
    panels = (
        (axs[0], epochs_loss_mse, epochs_loss_mse_test, 'MSE loss per epoch'),
        (axs[1], epochs_loss_mae, epochs_loss_mae_test, 'MAE loss per epoch'),
    )
    for ax, train_curve, test_curve, panel_title in panels:
        sns.lineplot(data=train_curve, label='train data', ax=ax)
        sns.lineplot(data=test_curve, label='test data', ax=ax)
        ax.grid()
        ax.set_title(panel_title)
    plt.tight_layout()

In [None]:
def plot_predictions(outputs_agg, targets_agg):
    """Scatter predictions (y axis) against targets (x axis) on a square figure.

    The y axis is limited to the range spanned by the targets.
    """
    plt.figure(figsize=(10, 10))
    sns.scatterplot(x=targets_agg, y=outputs_agg)
    plt.xlabel('targets')
    plt.ylabel('predictions')
    lowest_target = np.min(targets_agg)
    highest_target = np.max(targets_agg)
    plt.ylim((lowest_target, highest_target))

In [None]:
# Scale all features, log-transform the target, then hold out 20% of the rows for testing.
batch_size = 64
dff = scaler_features(df, log_target=True)
train_df = dff.sample(frac=.8, random_state=707)
test_df = dff.drop(train_df.index)  # every row that was not sampled for training

train_loader, test_loader = (
    to_dataloader(df=split, target_col='SalePrice', batch_size=batch_size)
    for split in (train_df, test_df)
)

In [None]:
# Hyperparameters for the network trained on the full feature set.
input_size = dff.shape[1] - 1  # every column of dff except the 'SalePrice' target
n_layers = 4  # number of hidden layers built by Regressor_nn
hidden_size = [100, 64, 32, 16]  # width of each hidden layer, in order
activation_function = F.relu  # applied after every hidden layer
output_size = 1  # width of the final linear layer
lr = 0.001  # learning rate handed to the Adam optimizer
n_epochs = 100  # number of training epochs, read by train()

In [None]:
# Seed torch so weight initialisation and batch shuffling repeat across runs;
# 707 matches the seed used for the train/test sampling.
torch.manual_seed(707)

model = Regressor_nn(
    input_size=input_size,
    hidden_size=hidden_size,
    n_layers=n_layers,
    output_size=output_size,
    activation_function=activation_function
)

criterion_mse = nn.MSELoss()
criterion_mae = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr)
epochs_loss_mse, epochs_loss_mae, epochs_loss_mse_test, epochs_loss_mae_test = train(
    model,
    train_loader,
    test_loader,
    criterion_mse,
    criterion_mae,
    optimizer
)

# Use the callable's own name instead of parsing its repr, which only works for
# plain functions; module instances fall back to their class name.
activation_name = getattr(activation_function, '__name__', type(activation_function).__name__)
plot_losses(
    epochs_loss_mse,
    epochs_loss_mae,
    epochs_loss_mse_test,
    epochs_loss_mae_test,
    'MSE, {}, and {} layers used for training'.format(activation_name, len(hidden_size))
)
_, _, outputs_agg, targets_agg = check_accuracy(model, test_loader, criterion_mse, criterion_mae)
plot_predictions(outputs_agg, targets_agg)

In [None]:
# Same pipeline as before, but without the features rejected by backward elimination.
batch_size = 64
# The elimination ran on X_df, which also holds the synthetic 'bias' column; keep only
# names that exist in df so the drop cannot fail if 'bias' was eliminated.
removable_features = [col for col in non_usefull_feature if col in df.columns]
dff = df.drop(columns=removable_features)
dff = scaler_features(dff, log_target=True)
train_df = dff.sample(frac=.8, random_state=707)
test_df = dff.drop(train_df.index)

train_loader = to_dataloader(
    df=train_df,
    target_col='SalePrice',
    batch_size=batch_size
)
test_loader = to_dataloader(
    df=test_df,
    target_col='SalePrice',
    batch_size=batch_size
)

In [None]:
# Hyperparameters for the network trained on the reduced feature set.
input_size = dff.shape[1] - 1  # every column of the reduced dff except the 'SalePrice' target
n_layers = 4  # number of hidden layers built by Regressor_nn
hidden_size = [100, 64, 32, 16]  # width of each hidden layer, in order
activation_function = F.relu  # applied after every hidden layer
output_size = 1  # width of the final linear layer
lr = 0.001  # learning rate handed to the Adam optimizer
n_epochs = 100  # number of training epochs, read by train()

In [None]:
# Seed torch so weight initialisation and batch shuffling repeat across runs;
# 707 matches the seed used for the train/test sampling.
torch.manual_seed(707)

model = Regressor_nn(
    input_size=input_size,
    hidden_size=hidden_size,
    n_layers=n_layers,
    output_size=output_size,
    activation_function=activation_function
)

criterion_mse = nn.MSELoss()
criterion_mae = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr)
epochs_loss_mse, epochs_loss_mae, epochs_loss_mse_test, epochs_loss_mae_test = train(
    model,
    train_loader,
    test_loader,
    criterion_mse,
    criterion_mae,
    optimizer
)

# Use the callable's own name instead of parsing its repr, which only works for
# plain functions; module instances fall back to their class name.
activation_name = getattr(activation_function, '__name__', type(activation_function).__name__)
plot_losses(
    epochs_loss_mse,
    epochs_loss_mae,
    epochs_loss_mse_test,
    epochs_loss_mae_test,
    'MSE, {}, and {} layers used for training'.format(activation_name, len(hidden_size))
)
_, _, outputs_agg, targets_agg = check_accuracy(model, test_loader, criterion_mse, criterion_mae)
plot_predictions(outputs_agg, targets_agg)