# Classification Trees & Random Forest Classifier

## Libraries and settings

In [None]:
# Libraries
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn import tree
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Suppress every warning category so the cell outputs below stay uncluttered
import warnings
warnings.filterwarnings(action='ignore')

# Print the directory that relative file paths are resolved against
print(os.getcwd())

## Import the apartment data

In [None]:
# Columns to keep from the raw CSV file
columns = [
    'web-scraper-order',
    'address_raw',
    'rooms',
    'area',
    'luxurious',
    'price',
    'price_per_m2',
    'lat',
    'lon',
    'bfs_number',
    'bfs_name',
    'pop',
    'pop_dens',
    'frg_pct',
    'emp',
    'tax_income',
    'dist_supermarket',
]

# Read the semicolon-separated file and select the variables listed above
df_orig = pd.read_csv("apartments_data_enriched.csv", sep=";", encoding='utf-8')[columns]

# Rename variable 'web-scraper-order' to 'id'
df_orig = df_orig.rename(columns={'web-scraper-order': 'id'})

# Remove rows with missing values, then remove duplicated rows
df = df_orig.dropna().drop_duplicates()

# Remove some 'extreme' values: keep prices from 1000 to 5000 (both inclusive).
# The explicit copy makes df an independent DataFrame, so adding columns to it
# later does not operate on a slice of the unfiltered data.
df = df.loc[(df['price'] >= 1000) &
            (df['price'] <= 5000)].copy()

# Number of rows/columns left after cleaning, and a preview of the data
print(df.shape)
df.head(5)

## Create new variable 'price_per_m2_class'

In [None]:
# Create labels (each class includes its lower bound and excludes its upper bound)
labels = ['0 - 19', '20 - 24', '25 - 32', '>= 33']

# Create new categorical variable.
# right=False makes the bins left-closed: [0, 20), [20, 25), [25, 33), [33, inf).
# With the default right-closed bins a value of exactly 20 would be labelled
# '0 - 19' (and 33 would be labelled '25 - 32'), contradicting the labels.
# The open upper edge keeps '>= 33' truly unbounded instead of turning very
# high values into missing classes.
df["price_per_m2_class"] = pd.cut(df['price_per_m2'],
                                  bins=[0, 20, 25, 33, np.inf],
                                  labels=labels,
                                  right=False)

# Check values
df[['price_per_m2', 'price_per_m2_class']].head(10)

## Classification Tree
See also: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

### Create train and test samples (train = 80%, test = 20% of the data)

In [None]:
# Explanatory variables used to predict the price class
feature_cols = ['area', 'rooms', 'pop_dens', 'tax_income', 'dist_supermarket']

# Split predictors and target; 20% of the rows are held out for testing and
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df['price_per_m2_class'],
    test_size=0.20,
    random_state=42,
)

# Preview of the training predictors
print('X_train:')
print(X_train.head(), '\n')

# Preview of the training target
print('y_train:')
print(y_train.head())

### Fit the classification tree model

In [None]:
# Set up a classification tree limited to three levels (fixed seed for reproducibility)
clf = DecisionTreeClassifier(max_depth=3, random_state=20)

# Train the classification tree on the training sample (fit() returns the estimator itself)
clf.fit(X_train, y_train)

# Predicted price classes for the test sample
y_pred = clf.predict(X_test)

### Calculate accuracy

In [None]:
# Share of test observations whose predicted class equals the observed class
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

### Cross validation

In [None]:
# Cross-validation (5-fold) on the training sample.
# The folds are built from the training data only, so the test sample stays
# an untouched hold-out set for the final accuracy check.
cv = cross_val_score(clf,
                     X_train,
                     y_train,
                     cv=5,
                     scoring='accuracy')

# Result: one accuracy value per fold
print(list(cv.round(4)), end=" ")

### Print text representation of the classification tree

In [None]:
# Export the fitted tree as indented text rules; only the first two levels are written out
text_representation = tree.export_text(clf, max_depth=2, feature_names=list(X_train.columns))

# Show the rules
print(text_representation)

### Visualizing the classification tree

In [None]:
# Canvas for the tree diagram
fig = plt.figure(figsize=(9, 6))

# Plot the fitted classification tree.
# label='root' shows the descriptive captions only in the root node, so look
# there for the meaning of the numbers in each box.
tree_plot = tree.plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=['0 - 19', '20 - 24', '25 - 32', '>= 33'],
    filled=True,
    fontsize=8,
    label='root',
    precision=1,
)

## Random Forest Classifier
For details see: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

### Fit the Random Forest Classifier

In [None]:
# Random forest with 500 trees, each at most 10 levels deep (fixed seed for reproducibility)
clf_rf = RandomForestClassifier(n_estimators=500,
                                max_depth=10,
                                random_state=42)

# Train the forest on the apartment training sample
clf_rf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf_rf.predict(X_test)

# Calculate accuracy
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred):.4f}')

### Show feature importance

In [None]:
# Feature names in the column order used for training
cols = X_train.columns

# Importance of each feature as reported by the forest, the spread of that
# importance across the individual trees, and the feature positions ordered
# from most to least important
importances = clf_rf.feature_importances_
std = np.std(
    [estimator.feature_importances_ for estimator in clf_rf.estimators_],
    axis=0,
)
indices = np.argsort(importances)[::-1]

# Print col-names and importances-values, most important feature first
print( cols[indices] )
print( importances[indices] )

# Horizontal barplot of the importances, sorted in ascending order
df_fi = pd.DataFrame({'features': cols, 'importances': importances}).sort_values('importances')
df_fi.plot(kind='barh',
           x='features',
           y='importances',
           color='darkred',
           figsize=(6, 3))