## Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


## Preprocess data

In [2]:
# Loading data
df = pd.read_csv('./data/raw_train_data.csv')
#print(df.info())

# drop rows where at least 1 feature is missing
df.dropna(inplace=True)
#print(df.info())

# convert string features to integers
le = LabelEncoder()
df['ORIGIN'] = le.fit_transform(df['ORIGIN'])
df['DEST'] = le.fit_transform(df['DEST'])

# convert arrival delay feature from minutes to boolean
# 0 for on time, 1 for delay
df.loc[df['ARR_DELAY_NEW'] > 0.0, 'ARR_DELAY_NEW'] = 1



## Prepare DataFrame for Decision Tree


In [3]:
X = df.drop("ARR_DELAY_NEW", axis=1)
y = df["ARR_DELAY_NEW"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Train Decision Tree


In [4]:
clf = DecisionTreeClassifier()

# progress bar at training time
for _ in tqdm(range(100)):
    clf.fit(X_train, y_train)

100%|██████████| 100/100 [02:33<00:00,  1.53s/it]


## Test

In [5]:
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7961252151984676


## Visualize Decision Tree

In [8]:
# check the complexity of the tree
max_depth = clf.tree_.max_depth
node_count = clf.tree_.node_count
n_leaves = clf.tree_.n_leaves
print(f'Number of nodes: {node_count} \nNumber of leaves: {n_leaves} \nDepth:{max_depth}')

# Had problems visualizing it, too complex
# plt.figure(figsize=(20,10))
# plot_tree(clf, filled=True, rounded=True)
# plt.show()

Number of nodes: 176935 
Number of leaves: 88468 
Depth:51
