In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)


## Import the libraries

In [None]:
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown
# for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting libraries.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow_decision_forests as tfdf

# EDA

In [None]:
# Read the competition training set and display it.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
train = pd.read_csv(train_csv_path)
train

In [None]:
# Drop the row identifier so it is not used as a feature.
# errors='ignore' keeps the cell idempotent: re-running it after 'Id' has
# already been removed is a no-op instead of a KeyError.
train = train.drop(columns='Id', errors='ignore')
train.head(3)

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
train.info()

In [None]:
# Number of missing values in each column.
train.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target.
train['SalePrice'].describe()

In [None]:
# Repeats the import and the blanket "ignore" filter from the earlier warnings cell.
# NOTE(review): redundant when the notebook is run top to bottom.
import warnings
warnings.filterwarnings('ignore')

## Data Visualization

In [None]:
# Build a table of missing values: absolute count and percentage of rows,
# restricted to the columns that have at least one missing entry.
na_columns = [col for col, n_null in train.isnull().sum().items() if n_null > 0]
null_counts = train[na_columns].isnull().sum()
n_miss = null_counts.sort_values(ascending=False)
ratio = (null_counts / train.shape[0] * 100).sort_values(ascending=False)
missing_df = pd.concat(
    [n_miss, np.round(ratio, 2)],
    axis=1,
    keys=["n_miss", "ratio"],
)
missing_df

In [None]:
# Horizontal bar chart of the missing-value count per column.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=missing_df['n_miss'], y=missing_df.index, palette="pastel", ax=ax)

In [None]:
# Density-normalised histogram of SalePrice with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# kde=True / stat="density" (and cut=3 for the KDE) is its documented equivalent.
sns.histplot(train['SalePrice'], color='g', bins=100, alpha=0.4,
             kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Scatter of SalePrice against the row index (the Series is passed as wide-form data).
sns.scatterplot(data=train['SalePrice'])

In [None]:
# Histogram of the target using seaborn's default binning.
sns.histplot(train['SalePrice'])

The `SalePrice` distribution deviates from the normal distribution.

It has appreciable positive skewness.

In [None]:
# Sample skewness of the target; a positive value indicates a longer right tail.
train['SalePrice'].skew()

In [None]:
# Sample kurtosis of the target (pandas uses Fisher's definition, so a normal distribution gives 0).
train['SalePrice'].kurt()

In [None]:
# Distinct column dtypes present in the training frame.
list(set(train.dtypes))

In [None]:
# Keep only the float64 / int64 columns for the numeric analysis below.
numeric_dtypes = ['float64', 'int64']
df_num = train.select_dtypes(include=numeric_dtypes)
df_num.head()

In [None]:
# One histogram per numeric column, 50 bins each, with small tick labels.
# Assigning the returned axes array to `k` keeps the cell from echoing it as output.
k=df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

In [None]:
# SalePrice (x) against above-ground living area (y).
# Axis labels are set explicitly: without them the figure does not show which
# variable is on which axis.
plt.scatter(df_num['SalePrice'], df_num['GrLivArea'])
plt.xlabel('SalePrice')
plt.ylabel('GrLivArea');

In [None]:
# SalePrice (x) against total basement area (y).
# Axis labels are set explicitly: without them the figure does not show which
# variable is on which axis.
plt.scatter(df_num['SalePrice'], df_num['TotalBsmtSF'])
plt.xlabel('SalePrice')
plt.ylabel('TotalBsmtSF');

**'GrLivArea' and 'TotalBsmtSF' seem to be linearly related with 'SalePrice'. Both relationships are positive, which means that as one variable increases, the other also increases**

In [None]:
# Correlation matrix of all numeric columns, drawn on an explicit 12x9 axes;
# the colour scale is capped at 0.8.
f, ax = plt.subplots(figsize=(12, 9))
corr_matrix = df_num.corr()
sns.heatmap(corr_matrix, vmax=.8, square=True, ax=ax);

**Two blocks of strongly correlated variables stand out in the heatmap. The first one refers to the 'TotalBsmtSF' and '1stFlrSF' variables, and the second one refers to the 'GarageYrBlt' and 'GarageCars' variables. Both cases show how significant the correlation is between these variables. Actually, this correlation is so strong that it can indicate a situation of multicollinearity.**

**The variables 'OverallQual' and 'GrLivArea' are strongly correlated with the 'SalePrice' of houses.**

In [None]:
# Names of the numeric columns kept in df_num.
df_num.columns

In [None]:
# The ten largest correlations with SalePrice (SalePrice itself included), in descending order.
saleprice_corr = df_num.corr()['SalePrice']
corr_fea = saleprice_corr.nlargest(10)
corr_fea

In [None]:
# Column names of the ten features most correlated with SalePrice.
cols = df_num.corr()['SalePrice'].nlargest(10).index
cols

In [None]:
# Pearson correlation matrix of the selected columns, computed with NumPy.
# np.corrcoef treats each row as a variable, hence the transpose.
k = train[cols].to_numpy().T
np.corrcoef(k)

In [None]:
# Heatmap of the pairwise correlations among the columns in `cols`, with each cell annotated by its value.
sns.heatmap(df_num[cols].corr(),annot=True)

In [None]:
# Pairwise scatter plots of the target against a hand-picked set of strong predictors.
sns.set()
# Use a dedicated name: the earlier correlation cells read `cols` (the top-10 index),
# and overwriting it here would change their result when they are re-run.
pairplot_cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `size` was renamed to `height` in seaborn; `height` is the supported keyword.
sns.pairplot(train[pairplot_cols], height=2.5)

## Handling Missing Data

In [None]:
# Missing-value count per column, largest first.
train.isna().sum().sort_values(ascending=False)

In [None]:
# Per-column missing-value totals and the fraction of rows they represent;
# show the 20 columns with the most missing entries.
null_mask = train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(20)

# TensorFlow Decision Forests (TF-DF) model

In [None]:

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly split `dataset` into a (train, test) pair of row subsets.

  Each row is assigned to the test part independently with probability
  `test_ratio`, so the resulting sizes are approximate rather than exact.

  Args:
    dataset: object supporting `len()` and boolean-mask indexing
      (e.g. a pandas DataFrame).
    test_ratio: probability that a row lands in the test part.
    seed: optional seed for a dedicated random generator, giving a
      reproducible split. When None (the default) the draw comes from
      NumPy's global random state, exactly as before.

  Returns:
    Tuple `(train_part, test_part)`; together they contain every row once.
  """
  if seed is None:
    # Legacy path: honours any np.random.seed(...) set by the caller.
    draws = np.random.rand(len(dataset))
  else:
    draws = np.random.default_rng(seed).random(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global random state before splitting: split_dataset draws from it,
# so without a fixed seed the train/validation rows (and every metric computed
# on them) change on each run of the notebook.
np.random.seed(42)
train_ds_pd, valid_ds_pd = split_dataset(train)
print("{} examples in training, {} examples in testing.".format(
    len(train_ds_pd), len(valid_ds_pd)))

**We need to convert the dataset from Pandas format (pd.DataFrame) into TensorFlow Datasets format (tf.data.Dataset).**

In [None]:
# Display the training part of the split (still a pandas object at this point).
train_ds_pd

**By default, the Random Forest model is configured for classification tasks. Since this is a regression problem, we specify the task type (tfdf.keras.Task.REGRESSION) as a parameter here.**

In [None]:
# Convert the full training frame and the two split parts into TF datasets
# for a regression task with SalePrice as the label.
label = 'SalePrice'
train_all, train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label, task=tfdf.keras.Task.REGRESSION)
    for frame in (train, train_ds_pd, valid_ds_pd)
)

In [None]:
# List the model classes that tfdf.keras makes available.
tfdf.keras.get_all_models()

### How can I configure them?
TensorFlow Decision Forests provides good defaults for you (e.g. the top ranking hyperparameters on our benchmarks, slightly modified to run in reasonable time). If you would like to configure the learning algorithm, you will find many options you can explore to get the highest possible accuracy.

> rf = tfdf.keras.RandomForestModel(hyperparameter_template="benchmark_rank1", task=tfdf.keras.Task.REGRESSION)

## Train the model

In [None]:
# Random forest regressor using the "benchmark_rank1" hyperparameter template;
# MSE is tracked as the evaluation metric.
rf = tfdf.keras.RandomForestModel(
    hyperparameter_template="benchmark_rank1",
    task=tfdf.keras.Task.REGRESSION,
)
rf.compile(metrics=["mse"])

In [None]:
# Train on the training split only; valid_ds is kept aside for the evaluation cell.
rf.fit(x=train_ds)

## Visualize the model

In [None]:
# Plot the first tree of the forest (tree_idx=0), limited to a depth of 3.
tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=3)

In [None]:
# Create an inspector for the trained model and show the evaluation stored with it.
# NOTE(review): for a random forest this is presumably the out-of-bag evaluation — confirm against the TF-DF docs.
inspector = rf.make_inspector()
inspector.evaluation()

In [None]:
# Evaluate the model on the held-out validation split and print each metric to four decimals.
evaluation = rf.evaluate(x=valid_ds, return_dict=True)

for metric_name in evaluation:
  metric_value = evaluation[metric_name]
  print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Print the name of every variable-importance measure the inspector provides.
for importance_name in inspector.variable_importances():
  print("\t", importance_name)

In [None]:
# Show the "NUM_AS_ROOT" variable-importance ranking.
# NOTE(review): presumably the number of trees in which each feature is the root split — confirm.
inspector.variable_importances()["NUM_AS_ROOT"]

In [None]:
# Second forest with the same configuration, trained on the whole training set
# (train_all) instead of only the training split.
rf_all = tfdf.keras.RandomForestModel(
    hyperparameter_template="benchmark_rank1",
    task=tfdf.keras.Task.REGRESSION,
)
rf_all.compile(metrics=["mse"])
rf_all.fit(x=train_all)

In [None]:
# Absolute path, consistent with the cells that read train.csv and
# sample_submission.csv, so the read does not depend on the working directory.
test_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
test_data = pd.read_csv(test_file_path)
# Remove Id from the features and keep it for the submission file.
ids = test_data.pop('Id')

# No label is passed here: the frame is used for prediction only.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data,
    task = tfdf.keras.Task.REGRESSION)

# Predict with the model that was trained on the full training set.
preds = rf_all.predict(test_ds)

In [None]:
# Drop size-1 axes from the prediction array.
# NOTE(review): presumably turns an (n, 1) array into a flat vector of n prices — confirm.
preds.squeeze()

In [None]:
# Assemble the submission frame: one predicted SalePrice per test Id.
# Scores noted by the author — rf model: 0.14894, rf_all model: 0.14046
submission_columns = {'Id': ids, 'SalePrice': preds.squeeze()}
output = pd.DataFrame(submission_columns)

output.head()

In [None]:
# Show the first rows of the provided sample submission, for comparison with `output`.
# Note: `k` is reused here as a file path; earlier cells bound it to other objects.
k='/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv'
pd.read_csv(k).head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
output.to_csv('/kaggle/working/submission.csv', index=False)