#### 1. Import all the necessary modules:

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

#### 2. Load datasets:

In [14]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

#### 3. Inspect the datasets
Take a closer look on types of features and missing data.

In [15]:
# Remove the limitation to see all the rows
pd.set_option('display.max_rows', None)

# Inspect the dataset
print(train_df.describe())
print(train_df.info())

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

It is noticeable that not all the data is numeric. To increase computational speed and avoid errors while using scikit-learn library it will be needed to **convert non-numeric data into numeric**.

In [18]:
count_missing1 = train_df.isnull().sum()
count_missing2 = test_df.isnull().sum()

# Display all the columns which contain missing values
print(count_missing1[count_missing1!=0])
print('\n')
print(count_missing2[count_missing2!=0])

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        7

#### 4. Handling missing values
According to the data description, all the columns, which have missing values (except from LotFrontage, Electrical, MaZoning, Utilities, KitchenQual, Functional, SaleType), are supposed to have ones - it means that there is simply no certain facility (like pool). It was decided to handle missing date the following way:
> **Electrical, MaZoning, Utilities, KitchenQual, Functional, SaleType** - drop the rows (due to very insignificant number of records)

> **LotFrontage** - fill missing values with mean value

> **Other** - replace NA with "None for further manipulations

> **Numeric colums linked to "Other"** - replace NA with 0

In [19]:
# Drop the missing values
train_df.dropna(subset=['Electrical', 'MSZoning', 'Utilities', 'KitchenQual', 'Functional', 'SaleType']) 
test_df.dropna(subset=['Electrical', 'MSZoning', 'Utilities', 'KitchenQual', 'Functional', 'SaleType']) 

# Fill the missing values with mean
train_df['LotFrontage']=train_df['LotFrontage'].fillna(train_df['LotFrontage'].mean())
test_df['LotFrontage']=test_df['LotFrontage'].fillna(test_df['LotFrontage'].mean())

# Fill missing values with "None" if column is non-numeric
#                     with 0 if column is numeric
for col in train_df.columns:
    
    # Check if the column is of object type
    if train_df[col].dtypes == 'object':
        train_df[col] = train_df[col].fillna('None')
        test_df[col]= test_df[col].fillna('None')
        
    # Check if the column is of int or float type
    elif (train_df[col].dtypes == 'int64' or train_df[col].dtypes == 'float64') and (col != 'SalePrice'):
        train_df[col] = train_df[col].fillna(0)
        test_df[col]= test_df[col].fillna(0)

# Display the sum of missing values to chech if all of them are handled
print(train_df.isnull().sum())
print('\n')
print(test_df.isnull().sum())


Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr

#### 5. Encode labels
This step is needed to convert non-numeric data into numeric.

In [20]:
for col in train_df.columns.to_numpy():
    
    # If column is not numeric
    if train_df[col].dtypes=='object':
        train_cat = train_df[col].unique().tolist()
        test_cat = test_df[col].unique().tolist()
        
        # Combine categories from both train and prediction sets
        categories = list(set(train_cat+test_cat))
        
        # Create a dictionary with category name and numeric equivalent
        labels = [i for i in range(1, len(categories)+1)]
        pair = {a[0]:a[1] for a in zip(categories, labels)}
        
        # Encode labels from datasets
        train_df[col]=train_df[col].map(pair)
        test_df[col]=test_df[col].map(pair)

#### 6. Fit the model
To increase the perfomance of model it was decided to choose gradient boosting regressor, which oprimizes the loss function and uses decision tree as a week learner.

In [24]:
# Split features and target
X = train_df.loc[: , ~train_df.columns.isin(['SalePrice', 'Id'])]
y = train_df['SalePrice']

#Initialize the regressor and fit it
reg = GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2, learning_rate = 0.1, loss = 'ls')
reg.fit(X, y)

GradientBoostingRegressor(max_depth=5, n_estimators=400)

#### 7. Evaluate  performance
It was decided to evaluate model's work by it's **accuracy**. However, the accuracy changes according to how the data is distributed, that us why 5-fold cross-validation is used. The score is the mean of 5 reslts.

In [25]:
# Calculate the cross-validation score
cv_score = cross_val_score(reg, X, y, cv=5)
print(np.mean(cv_score))

0.8773024073972678


#### 8. Make predictions and save the result

In [26]:
# Predict the values for test set
test_df['SalePrice'] = reg.predict(test_df.loc[: , test_df.columns != 'Id'])

# Create a csv file for the prediction
prediction = test_df[['Id', 'SalePrice']]
prediction.to_csv("prediction.csv")