In [1]:
!pip install tensorflow --user
!pip install tensorflow_decision_forests==1.6.0 

import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# %matplotlib inline



ERROR: Could not find a version that satisfies the requirement tensorflow_decision_forests==1.6.0 (from versions: none)
ERROR: No matching distribution found for tensorflow_decision_forests==1.6.0


ModuleNotFoundError: No module named 'tensorflow_decision_forests'

In [None]:
# Report the installed TensorFlow and TF-DF versions, in that order.
for library in (tf, tfdf):
    print(library.__version__)

## Load Dataset

In [None]:
# Read the training CSV from the working directory and report its (rows, columns) shape.
train_file_path = "./train.csv"
dataset_df = pd.read_csv(train_file_path)
shape_message = "Dataset shape is {}".format(dataset_df.shape)
print(shape_message)

In [None]:
# Preview the first three rows of the freshly loaded training frame.
dataset_df.head(3)

In [None]:
# Remove the "Id" column before modelling.
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
dataset_df = dataset_df.drop(columns="Id", errors="ignore")
dataset_df.head(3)

In [None]:
# Print the per-column summary (dtype and non-null count) of the training frame.
dataset_df.info()

### House Price Distribution

In [None]:
# Summary statistics of the regression target. The column is named "SalePrice";
# the previous "SalesPrice" spelling raised KeyError.
print(dataset_df['SalePrice'].describe())
plt.figure(figsize=(10, 9))
# histplot is axes-level, so it draws on the figure created above. displot is
# figure-level (it opens its own figure) and does not accept the distplot-only
# `hist_kws` argument, so the bar transparency is passed directly as `alpha`.
sns.histplot(dataset_df['SalePrice'], color='r', bins=100, alpha=0.4)

### Numerical data distribution

In [None]:
# Show the distinct column dtypes present in the frame (duplicates removed via set).
list(set(dataset_df.dtypes.tolist()))

In [None]:
# Keep only the 64-bit float and integer columns for the numeric overview.
numeric_dtypes = ['float64', 'int64']
df_num = dataset_df.select_dtypes(include=numeric_dtypes)
df_num.head()

In [None]:
# One histogram per column of df_num, 50 bins each, laid out on a 15x20 figure
# with small (size 10) tick labels.
df_num.hist(figsize=(15, 20), bins=50, xlabelsize=10, ylabelsize=10)

### Prepare the dataset

In [None]:
def split_dataset(dataset, test_ratio = 0.30, seed = None):
    """Randomly partition ``dataset`` into a training part and a held-out part.

    Every row is assigned independently: it goes to the held-out part when a
    uniform draw in [0, 1) falls below ``test_ratio``. The resulting sizes are
    therefore only approximately proportional to ``test_ratio``.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (this notebook passes a pandas DataFrame).
        test_ratio: Probability that a row lands in the held-out part.
        seed: Optional seed. When given, the draws come from a dedicated
            ``np.random.default_rng(seed)`` so the split is reproducible and
            the global NumPy random state is left untouched. When ``None``
            (default) the global NumPy random state is used, as before.

    Returns:
        Tuple ``(train_part, held_out_part)``.
    """
    row_count = len(dataset)
    if seed is None:
        draws = np.random.rand(row_count)
    else:
        draws = np.random.default_rng(seed).random(row_count)
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global generator right before splitting so the train/validation
# partition is identical on every run of this cell (and on Restart & Run All).
np.random.seed(42)
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)

print("{} examples in training, {} examples in testing.".format(len(train_ds_pd), len(valid_ds_pd)))


### Convert the dataset from pandas format to TensorFlow dataset format

In [None]:
label = 'SalePrice'

# Both splits are converted with the same label column and task.
conversion_options = dict(label=label, task=tfdf.keras.Task.REGRESSION)
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, **conversion_options)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, **conversion_options)

In [None]:
# Display the model classes exposed by tfdf.keras.
tfdf.keras.get_all_models()

### Create Random Forest

In [None]:
# Task used below both when building the model and when converting the test set.
select_task = tfdf.keras.Task.REGRESSION

In [None]:
# Build the random forest for the selected task.
rf = tfdf.keras.RandomForestModel(task=select_task)
# Track mean squared error as the reported metric.
reported_metrics = ['mse']
rf.compile(metrics=reported_metrics)

### Train Model

In [None]:
# Train the random forest on the training split.
rf.fit(x=train_ds)

### Visualize the Model

In [None]:
# Render the tree at index 0 of the trained forest, showing at most 3 levels.
tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=3)

### Evaluate the model on the Out of Bag (OOB) data and the validation dataset

In [None]:
# Training logs hold one entry per checkpoint; each carries the number of trees
# built so far and the evaluation recorded at that point.
logs = rf.make_inspector().training_logs()
plt.plot([log.num_trees for log in logs], [log.evaluation.rmse for log in logs])
# pyplot's function is `xlabel`; the previous `xLabel` raised AttributeError.
plt.xlabel("Number of Trees")
plt.ylabel("RMSE (out-of-bag)")
plt.show()

In [None]:
# Keep the inspector around: later cells reuse it for the variable importances.
inspector = rf.make_inspector()
# Display the evaluation stored with the trained model.
inspector.evaluation()

In [None]:
# Score the trained forest on the validation split; return_dict=True yields a
# mapping of metric name to value.
evaluation = rf.evaluate(x=valid_ds, return_dict=True)

# Print every metric, one per line, with four decimals.
for name in evaluation:
    value = evaluation[name]
    print(f"{name}: {value: .4f}")

### Variable importance

In [None]:
# List the names of the importance measures the inspector can report.
available_importances = inspector.variable_importances()
print("Available variable importances:")
for importance in available_importances:
    print("\t", importance)

In [None]:
# Show the entries of the NUM_AS_ROOT importance measure.
# NOTE(review): presumably counts how often each feature is the root split — confirm in the TF-DF docs.
inspector.variable_importances()['NUM_AS_ROOT']

In [None]:
plt.figure(figsize=(14, 5))

# Importance measure to plot; it is also used for the x-axis label and the title.
variable_importance_metrics = 'NUM_AS_ROOT'
variable_importances = inspector.variable_importances()[variable_importance_metrics]

# Each entry is a (feature, importance) pair.
features_name = [vi[0].name for vi in variable_importances]
features_importances = [vi[1] for vi in variable_importances]
features_rank = range(len(features_name))

bar = plt.barh(features_rank, features_importances, label=[str(x) for x in features_rank])
plt.yticks(features_rank, features_name)
# Flip the axis so the first-listed feature is drawn at the top.
plt.gca().invert_yaxis()

# Annotate every bar with its value. The format spec must sit inside the braces;
# the previous f"{importance}:.4f" printed a literal ":.4f" after the raw number.
for importance, patch in zip(features_importances, bar.patches):
    plt.text(patch.get_x() + patch.get_width(), patch.get_y(), f"{importance:.4f}", va="top")


plt.xlabel(variable_importance_metrics)
# This is a regression model, so the old "class 1 vs the others" title did not apply.
plt.title(f"Variable importance: {variable_importance_metrics}")
plt.tight_layout()
plt.show()

### Submission

In [None]:
# Load the competition test set from the working directory.
test_file_path = "./test.csv"
test_data = pd.read_csv(test_file_path)
# pop() removes "Id" from test_data and keeps it for the output table, so the
# identifier is not passed to the model as a feature. This must run before the
# conversion below.
ids = test_data.pop('Id')

# No label is passed here: the frame is converted for prediction only.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_data, task = select_task)

preds = rf.predict(test_ds)
# squeeze() drops singleton axes so the predictions can be used as one column.
output = pd.DataFrame({"Id": ids, "SalePrice": preds.squeeze()})

output.head()

In [None]:
# Start from the sample submission so the output keeps its columns and row order.
sample_submission_df = pd.read_csv("./sample_submission.csv")

# Flatten the predictions to 1-D before using them as a column (the cell above
# squeezes them for the same reason); ravel() also copes with a single row.
submission_preds = rf.predict(test_ds).ravel()
# The values are matched to the template by position, so the counts must agree.
if len(submission_preds) != len(sample_submission_df):
    raise ValueError(
        "Got {} predictions for {} submission rows.".format(
            len(submission_preds), len(sample_submission_df)
        )
    )

sample_submission_df['SalePrice'] = submission_preds
sample_submission_df.to_csv("./submission.csv", index=False)
sample_submission_df.head()