In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import the libraries

In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Report the versions of the two modelling libraries imported above.
print(f"Tensorflow v{tf.__version__}")
print(f"Tensorflow Decision Forests v{tfdf.__version__}")

# Load the dataset

In [None]:
# Read the competition's training CSV into a DataFrame and report its shape.
train_file_path = "../input/house-prices-advanced-regression-techniques/train.csv"
dataset = pd.read_csv(train_file_path)
print(f"Full train dataset shape is {dataset.shape}")

In [None]:
# Preview the first five rows of the raw training frame.
dataset.head(n=5)

In [None]:
# Drop the 'Id' column so it is not passed to the model as a feature.
# errors='ignore' keeps this cell idempotent: re-running it after 'Id' has
# already been removed would otherwise raise KeyError.
dataset=dataset.drop(columns=['Id'],errors='ignore')
dataset.head(5)

In [None]:
# Print the pandas summary of the frame (DataFrame.info) after 'Id' was dropped.
dataset.info()

## House Price Distribution

In [None]:
import seaborn as sns

In [None]:
# Summary statistics of the target column, followed by a plot of its distribution.
print(dataset['SalePrice'].describe())
plt.figure(figsize=(10,10))
# sns.distplot is deprecated (and removed in newer seaborn releases). The
# documented replacement is histplot with a density-normalised histogram and a
# KDE overlay; cut=3 mirrors how far distplot extended the KDE past the data.
sns.histplot(dataset['SalePrice'],color='g',bins=100,alpha=0.4,
             kde=True,stat='density',kde_kws={'cut':3});

## Numerical Data Distribution

In [None]:
# Collect the distinct column dtypes present in the frame.
unique_dtypes = set(dataset.dtypes.tolist())
list(unique_dtypes)

In [None]:
# Keep only the float64 / int64 columns; these feed the histogram grid below.
numerical_col = dataset.select_dtypes(
    include=['float64', 'int64'],
)
numerical_col.head(5)

In [None]:
# Histogram every column of numerical_col; the trailing ';' hides the Axes repr.
hist_options = dict(figsize=(20, 20), bins=50, xlabelsize=8, ylabelsize=8)
numerical_col.hist(**hist_options);

## Prepare the dataset

In [None]:
# Display the frame that is split into training and validation parts below.
dataset

In [None]:
import numpy as np

def split_dataset(dataset,test_ratio=0.30,seed=None):
    """Randomly split ``dataset`` into two disjoint parts.

    Every row is assigned independently: it goes to the second (held-out) part
    when a uniform draw from [0, 1) is below ``test_ratio``. The realised sizes
    therefore vary around the requested ratio rather than matching it exactly.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (e.g. a pandas DataFrame).
        test_ratio: Probability, in [0, 1], that a row lands in the held-out part.
        seed: Optional integer seed. When given, the split is reproducible and
            independent of NumPy's global random state. When ``None`` (default)
            the global ``np.random`` state is used, exactly as before.

    Returns:
        A ``(remaining, held_out)`` tuple.

    Raises:
        ValueError: If ``test_ratio`` is not within [0, 1].
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError("test_ratio must be within [0, 1], got {!r}".format(test_ratio))
    # A private RandomState keeps a seeded split from disturbing the global RNG.
    rng=np.random if seed is None else np.random.RandomState(seed)
    test_index=rng.rand(len(dataset))<test_ratio
    return dataset[~test_index],dataset[test_index]

# split_dataset draws from NumPy's global RNG, so seed it right before the call.
# Without a fixed seed every "Restart & Run All" yields a different split and
# therefore different validation metrics.
np.random.seed(42)
train_pd,valid_pd=split_dataset(dataset)

# The held-out part is used as the validation set (valid_pd), so report it as such.
print('{} examples in training, {} examples in validation.'.format(len(train_pd),len(valid_pd)))
    

In [None]:
label='SalePrice'
# Convert both pandas splits to TF datasets with identical settings:
# same label column, regression task. Training split first, then validation.
train,valid=(
    tfdf.keras.pd_dataframe_to_tf_dataset(frame,label=label,task=tfdf.keras.Task.REGRESSION)
    for frame in (train_pd,valid_pd)
)


In [None]:
# Show the model classes exposed by tfdf.keras; a RandomForestModel is used below.
tfdf.keras.get_all_models()

In [None]:
# Instantiate a random-forest regressor and register MSE as an evaluation metric.
rf = tfdf.keras.RandomForestModel(
    task=tfdf.keras.Task.REGRESSION,
)
rf.compile(metrics=['mse'])

# Train the model

In [None]:
# Fit the forest on the training dataset; the label is already embedded in `train`.
rf.fit(train)

# Visualize the model

In [None]:
# Render the tree at index 0 of the trained forest, passing max_depth=10 to the plotter.
tfdf.model_plotter.plot_model_in_colab(
    rf,
    tree_idx=0,
    max_depth=10,
)

In [None]:
import matplotlib.pyplot as plt
# Plot the RMSE recorded in the model's training logs against the number of trees.
logs=rf.make_inspector().training_logs()
plt.plot([log.num_trees for log in logs],[log.evaluation.rmse for log in logs])
plt.xlabel("Number of trees")
# Axis label previously read "RSME"; the plotted quantity is log.evaluation.rmse.
plt.ylabel("RMSE (out-of-bag)")
plt.show()

In [None]:
# Keep a model inspector around (the variable-importance cells read from it)
# and display the evaluation it stores.
inspect=rf.make_inspector()
inspect.evaluation()

In [None]:
# Score the model on the validation dataset; return_dict=True yields {metric: value}.
evaluation = rf.evaluate(x=valid, return_dict=True)

# One "name:value" line per metric, values rounded to four decimals.
for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}:{metric_value:.4f}")

# Variable importances

In [None]:
# List the names of the importance measures the inspector exposes, one per line.
print('Available variable importances')
for importance_name in inspect.variable_importances():
    print('\t',importance_name)

In [None]:
# Show the entries stored under the 'NUM_AS_ROOT' importance; the plot below uses the same key.
inspect.variable_importances()['NUM_AS_ROOT']

In [None]:
plt.figure(figsize=(10,10))

# Plot the NUM_AS_ROOT importance, which TF-DF documents as the number of trees
# whose root node splits on the feature. The model is a regressor, so the
# earlier "AUC of the class 1 vs the others" wording did not apply.
variable_importance_metric='NUM_AS_ROOT'
variable_importances=inspect.variable_importances()[variable_importance_metric]

# Each entry is indexed as (feature, importance): [0] has a .name, [1] is the value.
feature_names=[vi[0].name for vi in variable_importances]
feature_importances=[vi[1] for vi in variable_importances]

feature_ranks=range(len(feature_names))

bar=plt.barh(feature_ranks,feature_importances,label=[str(x) for x in feature_ranks])
plt.yticks(feature_ranks,feature_names)
# Invert the axis so the first entry of the list is drawn at the top.
plt.gca().invert_yaxis()

# Annotate every bar with its importance value.
for importance,patch in zip(feature_importances,bar.patches):
    plt.text(patch.get_x()+patch.get_width(),patch.get_y(),f"{importance:.4f}",va='top')

# Finalise and show the figure once, after all bars are annotated. These calls
# used to sit inside the loop, so plt.show() ran after the first annotation and
# the remaining labels were drawn on fresh figures.
plt.xlabel(variable_importance_metric)
plt.title("Variable importance: NUM AS ROOT")
plt.tight_layout()
plt.show()

# Submission

In [None]:
# Load the competition test set and set the 'Id' column aside: pop() removes it
# from test_data so it is not used as a feature, while keeping it for the output.
test_file_path = '../input/house-prices-advanced-regression-techniques/test.csv'
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

# No label argument here: the test frame is converted without one.
test = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data,
    task=tfdf.keras.Task.REGRESSION,
)

# Predict, then squeeze the result so it can be used as a single column.
preds = rf.predict(test)
output = pd.DataFrame({
    'Id': ids,
    'SalePrice': np.squeeze(preds),
})

output.head()

In [None]:
sample_submission=pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")

# The predictions follow the row order of test.csv. Writing them into the sample
# submission is only valid if it lists the same Ids in the same order, so check
# that instead of assuming it.
if not np.array_equal(sample_submission['Id'].to_numpy(),ids.to_numpy()):
    raise ValueError("sample_submission.csv and test.csv do not list the same Ids in the same order")

# Reuse the predictions computed in the previous cell rather than running
# inference a second time, and flatten them to 1-D before assigning the column
# (the previous cell squeezes the same array for the same reason).
sample_submission['SalePrice']=np.ravel(preds)
sample_submission.to_csv('/kaggle/working/submission.csv',index=False)
sample_submission.head()