#### A quick introduction to Decision Trees using Python (Tutorial version)

### Import Packages

In [None]:
# data processing
import numpy as np
import pandas as pd

# modeling
#

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

### Simulate data

In [None]:
# let's simulate some data for this exercise
def simulate_linreg_data(rows, seed=314, noise_sd=0.1):
    """Simulate a small, noisy, non-linear regression dataset.

    ``x`` is drawn with ``np.random.rand`` and sorted ascending; ``y`` is
    ``-sin(2.2 * pi * x)`` plus Gaussian noise scaled by ``noise_sd``.

    Parameters
    ----------
    rows : int
        Number of observations to generate.
    seed : int, default 314
        Value passed to ``np.random.seed``. This reseeds NumPy's *global*
        random state as a side effect, so repeated calls with the same
        arguments return identical data.
    noise_sd : float, default 0.1
        Multiplier applied to the standard-normal noise added to ``y``.

    Returns
    -------
    pandas.DataFrame
        A frame with ``rows`` rows and two columns, ``x`` and ``y``.
    """
    # Reseed first: every draw below depends on the generator state.
    np.random.seed(seed)

    def true_fun(X):
        # Noise-free signal underlying the simulated target.
        return -np.sin(2.2 * np.pi * X)

    # Draw order matters for reproducibility: x first, then the noise.
    x = np.sort(np.random.rand(rows))
    y = true_fun(x) + np.random.randn(rows) * noise_sd

    return pd.DataFrame({'x': x, 'y': y})

# 20 simulated observations, used by the plotting and modelling cells that follow.
df = simulate_linreg_data(20)

# Bare last expression: the notebook renders the first rows as a table.
df.head()

### Plot data

In [None]:
# Scatter plot of the simulated (x, y) pairs on a 12 x 9 inch 'darkgrid' figure.
sns.set(style='darkgrid')

fig, ax = plt.subplots(figsize=(12, 9))

ax.scatter(df['x'], df['y'], color='lightcoral', s=100)

ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
# Trailing semicolon suppresses the text repr of the returned title object.
ax.set_title('Sample Data for Simple Linear Regression', fontsize=14, weight='semibold');

### Decision Tree

Step 1: Create a decision tree object.

In [None]:
# max leaf nodes
# (the value 3 matches the three shaded x-intervals drawn in the plot further down)
max_l = 3

# Exercise placeholder, left blank on purpose: create the decision tree object here.
# presumably it should be limited to `max_l` leaf nodes — confirm with the course material
dtree = 

Step 2: Fit the model.

In [None]:
x = 
y = 

#--

Step 3: Make predictions.

In [None]:
y_hat = 

Plot the actual vs. predicted values.

In [None]:
sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(12, 9))

# Actual observations (large, coral) with the predictions overlaid (green).
ax.scatter(x, y, color='lightcoral', s=100, alpha=.8)
ax.scatter(x, y_hat, color='forestgreen', label='Predicted Values')

# Shade three fixed x-intervals. The boundaries 0.48 and 0.8 are hard-coded
# (presumably the split points of the fitted 3-leaf tree — confirm against the model).
for left, right, shade in [(0, .48, 'red'), (.48, .8, 'yellow'), (0.8, 1, 'orange')]:
    ax.axvspan(left, right, alpha=0.1, color=shade)

ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
ax.set_xlim((0, 1))
ax.set_ylim((-2, 2))
ax.legend(loc='best', fontsize=14)
# Trailing semicolon suppresses the text repr of the returned title object.
ax.set_title('Decision Tree', fontsize=14, weight='semibold');

----

### Decision Tree model using Wine Quality data

In [None]:
# Load the wine-quality table; the first CSV column becomes the row index.
df_wine = pd.read_csv(r'../data/winequality.csv', index_col=0)

# Predictors for the model: the names of all columns whose dtype is float64.
is_float_col = df_wine.dtypes == 'float64'
num_cols = df_wine.columns[is_float_col]

# Show the selected column names.
num_cols

In [None]:
# Exercise cell: every empty right-hand side below is left blank on purpose.

# model training data
# presumably the `num_cols` predictors and the `quality` column, as in the
# later default-settings cell — confirm
X = 
y = 

# define the model object with max depth of 5
dtree = 

# fit the model
dtree_wine = 

# make predictions
# (the predictions are stored as a new `preds` column on df_wine)
df_wine['preds'] = 

# actuals vs. predicted values
df_wine[['preds', 'quality']].head()

Mean Squared Error (MSE)

In [None]:
from sklearn.metrics import mean_squared_error

mse_wine = 

mse_wine

[Scikit-learn Metrics and Scoring: Documentation](https://scikit-learn.org/stable/modules/model_evaluation.html)

What happens if we build a model with the default settings...

In [None]:
# Exercise placeholder, left blank on purpose: create the tree with its default settings.
dtree = 

# Fit on the float64 predictor columns in `num_cols`, with `quality` as the target.
dtree_wine = dtree.fit(df_wine[num_cols], df_wine.quality)

# Predict for the same rows that were used for fitting and store the result in `preds`.
df_wine['preds'] = dtree_wine.predict(df_wine[num_cols])

# Training-set error: actuals and predictions both come from the data the tree was fit on.
mean_squared_error(df_wine.quality, df_wine.preds)

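Beware! The error above is computed on the same rows the tree was trained on, so a very low value does not mean the model will predict new data well.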

In [None]:
df_wine[['quality', 'preds']].head()

In [None]:
# classification matrix
#-

In [None]:
# tree depth
#-

This is an example of over-fitting. Decision tree learning is a _greedy_ algorithm and, when left unconstrained, is prone to over-fitting.

---

## *Exercise: Decision Tree model using House Prices data* 
[Data Source](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

Step 1: Read data.

In [None]:
df_house = 

df_house.head()

Step 2: Define `X` and `y` using the columns specified below.

In [None]:
# Columns to use when defining X in this exercise.
cols_to_use = [
    'MSSubClass',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold',
]

# Column to use when defining y.
target_col = 'SalePrice'

In [None]:
X = 
y = 

X.shape, y.shape

Step 3: Define a Decision Tree model object. Use `max_depth=5`.

In [None]:
dtree = 

Step 4: Train (fit) the model using `X` and `y`.

In [None]:
dtree_house = 

Step 5: Make predictions using the same (training) dataset `X`, and save those predictions in the `df_house` dataframe in a new column named `preds` (Step 7 refers to the column by that name).

In [None]:
#-

Step 6: Find the correlation between the predicted values and the actual target values.

In [None]:
#-

Step 7: Calculate MSE.

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the `SalePrice` column and the predictions stored in `preds`.
mse = mean_squared_error(y_true=df_house['SalePrice'], y_pred=df_house['preds'])

# Display the value as the cell output.
mse

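How does MSE compare to the mean value of the target? (MSE is expressed in squared units of the target, so take its square root — the RMSE — before comparing the two.)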

In [None]:
y_mean =

### Feature Importance

In [None]:
importances = 

In [None]:
indices = 

In [None]:
# print the feature ranking
#-

### Splitting data into train and test partitions

A quick example to show how we can split the data for training and testing.