In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw training split and preview its first rows.
train_csv_path = "../data/train.csv"
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Summary statistics of the training data.
data.describe()

In [None]:
# Overview of the columns of the training data (dtypes and non-null counts).
data.info()

## Feature engineering + data cleaning

In [None]:
# Columns kept for the study, grouped by theme. The order matters to the rest
# of the notebook: "Id" comes first and the target "SalePrice" comes last.
keyFactors = [
    # Identifier (kept for reference)
    "Id",

    # Essential structural variables
    "OverallQual",    # overall quality
    "OverallCond",    # overall condition
    "YearBuilt",      # construction year
    "YearRemodAdd",   # remodelling year

    # Surfaces
    "GrLivArea",      # living area
    "TotalBsmtSF",    # total basement area
    "LotArea",        # lot area

    # Garage and exterior
    "GarageArea",     # garage area
    "GarageCars",     # garage capacity, in cars
    "GarageYrBlt",    # year the garage was built
    "GarageType",     # garage type
    "GarageFinish",   # garage finish

    # Rooms and bathrooms
    "FullBath",       # full bathrooms
    "HalfBath",       # half bathrooms
    "BedroomAbvGr",   # bedrooms above ground
    "KitchenAbvGr",   # kitchens above ground

    # Quality
    "KitchenQual",    # kitchen quality
    "ExterQual",      # exterior quality
    "ExterCond",      # exterior condition
    "BsmtCond",       # basement condition
    "HeatingQC",      # heating quality

    # Location
    "Neighborhood",   # neighbourhood
    "MSZoning",       # zoning

    # Additional value features
    "Fireplaces",     # number of fireplaces
    "FireplaceQu",    # fireplace quality
    "WoodDeckSF",     # wood deck area
    "OpenPorchSF",    # open porch area
    "Foundation",     # foundation type
    "CentralAir",     # central air conditioning

    # Sale variables (for training)
    "SaleType",       # type of sale
    "SaleCondition",  # condition of sale
    "MiscFeature",    # miscellaneous feature not covered by the other columns

    # Target
    "SalePrice",
]

In [None]:
# Work on a copy restricted to the selected columns, so `data` stays untouched,
# and index the rows by house Id.
df = data[keyFactors].copy()
df = df.set_index("Id")
df.head()

### Let's add some features !
We have some redundant features that store the same values and carry the same information. Each pair of columns can be merged by keeping only the second column.

In [None]:
# Side-by-side view of the two pairs of columns compared above.
paired_columns = ['Exterior1st', 'Exterior2nd', 'Condition1', 'Condition2']
data[paired_columns]

In [None]:
# Keep the second column of each pair under a shorter name. `.values` strips the
# index so the assignment is positional: df is indexed by Id, data is not.
for new_name, source_name in (("Exterior", "Exterior2nd"), ("Condition", "Condition2")):
    df[new_name] = data[source_name].values

We can also add the house's lifespan: the number of years between the build year and the sale year.

In [None]:
# Age of the house when it was sold. A missing year gives no duration, so the gap
# is replaced by 0 *before* the integer cast (an int64 column cannot hold NaN).
# The previous fillna targeted "LifeSpan", a column that does not exist, and ran
# after the cast, so it never had any effect.
house_age = (data["YrSold"] - data["YearBuilt"]).fillna(0).astype("int64")
# `.values` makes the assignment positional: df is indexed by Id, data is not.
df["Lifespan"] = house_age.values
df = df[df.columns.sort_values()] # sorts the columns in alphabetic order
df.head(3)

### Data cleaning

In [None]:
# Drop the rows that are exact duplicates, then list the columns that still
# contain missing values together with their number of gaps.
df = df.drop_duplicates()
a = df.isna().sum()  # missing values per column
a.loc[a > 0]

An **NA** value means that the feature does not exist for that house. These values won't be dropped; they will be replaced by "Empty" and encoded later in the process.

In [None]:
## replaces the NA with "Empty" (categorical columns) or 0 (numeric columns)
def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` in which every missing value is replaced.

    The columns to fill are detected on ``df`` itself, so the function works
    for any frame (training or test set) and does not rely on a variable
    computed in another cell.

    * Non-numeric columns receive the placeholder category ``"Empty"``.
    * Numeric columns receive ``0``, so they keep their numeric dtype instead
      of becoming object columns that mix numbers and strings.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    temp = df.copy()
    missing_counts = temp.isna().sum()
    for c in missing_counts[missing_counts > 0].index:
        placeholder = 0 if pd.api.types.is_numeric_dtype(temp[c]) else "Empty"
        temp[c] = temp[c].fillna(placeholder)
    return temp

# Replace the missing values of the training frame.
df = fill_missing_values(df)

In [None]:
# let's check values type for each sample
def check(df: pd.DataFrame):
    """Report the columns whose values do not all have the same type.

    Rows containing a missing value are left out of the check. Columns stored
    as ``int64`` or ``float64`` are homogeneous by construction (converting
    them to their own dtype can never fail), so only the remaining columns are
    inspected value by value. One warning line is printed for each column that
    mixes several Python types, e.g. numbers and strings in an object column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The findings are printed.
    """
    temp = df.dropna(axis=0)
    print("start cheking ...")
    for col in temp.columns:
        column = temp[col]
        if column.dtype == np.int64 or column.dtype == np.float64:
            continue
        found_types = sorted({type(value).__name__ for value in column})
        if len(found_types) > 1:
            # The column name is printed as is, so it can be looked up in the frame.
            print(f"'{col}' feature should have a single type for all samples (found: {', '.join(found_types)})")
    print("All columns are checked ")

check(df)
n_samples, n_features = df.shape
print(f"we have {n_samples} samples and {n_features} features with the houses id set as index")

Let's repeat the same data manipulation with the test set.

In [None]:
## importing data
test_data = pd.read_csv("../data/test.csv")
# The test file has no target: keep every selected column except "SalePrice",
# wherever it sits in the list.
test_columns = [column for column in keyFactors if column != "SalePrice"]
df_test = test_data[test_columns].set_index("Id")

## feature engineering
# `.values` makes the assignments positional: df_test is indexed by Id, test_data is not.
df_test["Exterior"] = test_data["Exterior2nd"].values
df_test["Condition"] = test_data["Condition2"].values
# A missing year gives no duration: replace the gap by 0 *before* the integer cast
# (the previous fillna targeted "LifeSpan", a column that does not exist).
df_test["Lifespan"] = (test_data["YrSold"] - test_data["YearBuilt"]).fillna(0).astype("int64").values
df_test = df_test[df_test.columns.sort_values()] # sorts the columns in alphabetic order

# Rows are deliberately NOT de-duplicated here: every Id of the test file needs
# a prediction, even when two houses share the same feature values.
df_test = fill_missing_values(df_test)
assert len(df_test) == len(test_data), "every row of the test file must be kept"
check(df_test)

## Data exploration
We will display some insights and highlight how the selected features are relevant for predicting the sale price.
The features are grouped into nine (9) categories:
* **Essential structural variables** : *'OverallQual'*, *'OverallCond'*, *'YearBuilt'*, *'YearRemodAdd'*
* **Surfaces** : *GrLivArea*, *TotalBsmtSF*, *LotArea*  
* **Garage and Exterior** : *GarageArea*, *GarageCars*, *GarageYrBlt*, *GarageType*, *GarageFinish*  
* **Rooms and Bathrooms** : *FullBath*, *HalfBath*, *BedroomAbvGr*, *KitchenAbvGr*  
* **Quality** : *KitchenQual*, *ExterQual*, *ExterCond*, *BsmtCond*, *HeatingQC*  
* **Location** : *Neighborhood*, *MSZoning*  
* **Additional Value Features** : *Fireplaces*, *FireplaceQu*, *WoodDeckSF*, *OpenPorchSF*, *Foundation*, *CentralAir*  
* **Sales Variables (for training)** : *SaleType*, *SaleCondition*, *MiscFeature*  
* **Created features** : *Exterior*, *Lifespan*, *Condition*  

## ImmoSense model conception

First, let's one-hot encode the categorical features into dummy columns (true or false state); the numeric features are kept as they are.

In [None]:
# Separate the target from the predictors. The guard keeps the cell re-runnable:
# once "SalePrice" has been removed from df, a second run reuses the existing
# `target` instead of raising a KeyError.
if "SalePrice" in df.columns:
    target = df["SalePrice"].copy()
    df = df.drop(columns="SalePrice")
dummy = pd.get_dummies(df)
dummy.shape

Let's split it into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the rows are not shuffled.
x_train, x_test, y_train, y_test = train_test_split(
    dummy,
    target,
    test_size=0.2,
    shuffle=False,
)

Let's pick the best model for our use case among:
* **Linear Regression**
* **SVR**
* **Ridge**
* **Nearest neighbors regression**
* **Decision trees**

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
# Candidate regressors, keyed by the label shown in the comparison table.
models = {
    "Linear_regression": LinearRegression(),
    "KNR": KNeighborsRegressor(),
    "SVR": SVR(kernel="linear"),
    "Ridge": Ridge(alpha=0.5),
    # Fixed seed: the tree permutes the features randomly at each split, so
    # without it two runs of the notebook can report different metrics.
    "Decision": DecisionTreeRegressor(random_state=42),
}

In [None]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
def check_perf(model_list: dict, x_train, y_train, x_test, y_test) -> pd.DataFrame:
    """Fit every model and summarise its performance in one table.

    Parameters
    ----------
    model_list : dict
        Maps a display name to an estimator exposing ``fit``, ``predict`` and
        ``score``. Each estimator is fitted in place.
    x_train, y_train
        Features and target used to fit the models.
    x_test, y_test
        Held-out features and target used to compute the error metrics.

    Returns
    -------
    pd.DataFrame
        One row per model (indexed by its name) with the columns ``RMSE``,
        ``Mae`` and ``R2`` computed on the held-out samples, and ``Scores``,
        the value of ``score`` on the training samples formatted as a
        percentage with two decimals.
    """
    columns = ["RMSE", "Mae", "R2", "Scores"]
    rows = []

    for mod in model_list.values():
        mod.fit(x_train, y_train)     # training the model with the training data
        y_pred = mod.predict(x_test)  # prediction on the samples held out for test

        rows.append({
            "RMSE": round(root_mean_squared_error(y_test, y_pred), 2),
            "Mae": round(mean_absolute_error(y_test, y_pred), 2),
            "R2": round(r2_score(y_test, y_pred), 3),
            # Rounded like the other metrics instead of printing every float digit.
            "Scores": f"{mod.score(x_train, y_train) * 100:.2f}%",  # training score
        })

    return pd.DataFrame(rows, index=list(model_list), columns=columns)


In [None]:
# Fit every candidate on the training split and display the comparison table.
check_perf(models, x_train, y_train, x_test, y_test)

If we compare the metrics above, we notice that the **Ridge** regression model has a lower **mean error** with a quite acceptable **training score**. So the **ImmoSense** model will be **"Ridge"**. For better results, let's choose the best parameter for **ImmoSense**.

In [None]:
from sklearn.model_selection import GridSearchCV
immoSense_test = Ridge()
# Regularisation strengths compared by the search.
params = {"alpha": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
gd = GridSearchCV(immoSense_test, param_grid=params, cv=5, n_jobs=-1, scoring="r2")
gd.fit(x_train, y_train)
# best_score_ is the mean cross-validated score of the best candidate, not a
# training score, so the message names it accordingly.
print(f"the best alpha parameter is {gd.best_params_} with a mean cross-validated R2 score of {gd.best_score_*100:.2f}% ")

In [None]:
## Training ImmoSense model with the best params found by the grid search
# alpha is read from the search result rather than hard-coded, so the model
# follows the search if the data or the parameter grid change.
ImmoSense = Ridge(**gd.best_params_)
ImmoSense.fit(x_train, y_train)
print("ImmoSense well trained")

## Final prediction
Let's predict the prices for the test-set file.

In [None]:
# Fill what is still missing: ffill copies the value of the previous row and
# bfill covers gaps located in the first rows, which ffill alone leaves empty.
# Re-assigning (instead of inplace=True) keeps the cell safe to re-run.
df_test = df_test.ffill().bfill()
dummy_test = pd.get_dummies(df_test)
# Align with the training columns: categories unseen during training are
# dropped and categories absent from the test set are added as all-zero columns.
dummy_test = dummy_test.reindex(columns=dummy.columns, fill_value=0)
assert not dummy_test.isna().any().any(), "missing values remain in the test features"
prices = ImmoSense.predict(dummy_test)
# Round to the nearest unit; a plain integer cast would truncate every price.
submission = pd.DataFrame({"SalePrice": np.rint(prices).astype(np.int64)}, index=dummy_test.index)
submission.to_csv("../data/submission.csv", sep=",", header=True)
print("Submission file generated successfully !")