# Load data

In [None]:
from utils import load_data
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Load the dataset through the project helper.
# NOTE(review): separate=False presumably returns features and target in one
# frame -- confirm against utils.load_data.
df = load_data(separate=False)

# Stock-related columns plus the target, used by the "Stock only" experiments.
df_stock_only = df[['AIR_STOCK','NASDAQ', 'DOW', 'ARR_DEL15']]


# Every column except the target and the four numeric features gets ordinal-encoded.
cat_cols = df.drop(columns=['ARR_DEL15', 'DISTANCE', 'NASDAQ', 'DOW', 'AIR_STOCK']).columns.tolist()


# Fit the encoder on the selected columns and transform them in one step.
encoder = OrdinalEncoder()
encoded_cols = encoder.fit_transform(df[cat_cols])

# Wrap the encoded array in a DataFrame that carries df's own index.
# pd.concat(axis=1) aligns rows on index labels, so a fresh 0..n-1 index here
# would misalign rows (and introduce NaNs) whenever df's index is not the
# default RangeIndex.
encoded_df = pd.DataFrame(encoded_cols, columns=cat_cols, index=df.index)

# Swap the original columns for their encoded counterparts.
df = df.drop(columns=cat_cols)
df = pd.concat([df, encoded_df], axis=1)

df

# Make balanced classes

In [None]:
import pandas as pd
# Per-class row counts of the target; the rarer class fixes the sample size.
counts = df_stock_only['ARR_DEL15'].value_counts()
minority_class = counts.idxmin()
minority_count = counts[minority_class]

# The target is treated as a 0/1 label, so the other class is its complement.
majority_class = 1 - minority_class
majority_count = counts[majority_class]

# --- Stock-only frame: down-sample the larger class to the smaller one's size.
stock_target = df_stock_only['ARR_DEL15']
majority_sample = (
    df_stock_only.loc[stock_target == majority_class]
    .sample(n=minority_count, random_state=0)
)
minority_df = df_stock_only.loc[stock_target == minority_class]
balanced_df_stock_only = pd.concat([minority_df, majority_sample], axis=0)

# --- Full frame: same procedure, same seed.
full_target = df['ARR_DEL15']
minority_df = df.loc[full_target == minority_class]
majority_sample = (
    df.loc[full_target == majority_class]
    .sample(n=minority_count, random_state=0)
)
balanced_df = pd.concat([minority_df, majority_sample], axis=0)

balanced_df

# Stock only

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split of the balanced stock-only data, with a fixed seed.
stock_features = balanced_df_stock_only.drop(columns=['ARR_DEL15'])
stock_labels = balanced_df_stock_only['ARR_DEL15']
X_train, X_test, y_train, y_test = train_test_split(
    stock_features,
    stock_labels,
    test_size=0.2,
    stratify=stock_labels,
    random_state=0,
)

## Regular decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Candidate tree depths 1..24.
param_grid = {'max_depth': list(range(1, 25))}

# Seed the tree like every other stochastic step in this notebook
# (random_state=0). An unseeded tree can break ties between equally good
# splits differently on each run, so the selected depth would not be
# reproducible.
clf = DecisionTreeClassifier(random_state=0)

# 5-fold cross-validated search over max_depth, using all cores.
grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=3, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Print the best depth found by the grid search
print("Best depth:", grid_search.best_params_['max_depth'])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, PrecisionRecallDisplay

# Refit a tree at the depth picked by the grid search. The seed matches the
# rest of the notebook so the reported metrics are reproducible.
clf = DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth'], random_state=0)

clf.fit(X_train, y_train)

# Hard class predictions on the held-out set, used for accuracy and the report.
y_pred = clf.predict(X_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print('Tree Depth: ', clf.get_depth())
print('Number of leaves: ', clf.get_n_leaves())
print('Accuracy:', accuracy)

# print the report
print(classification_report(y_test, y_pred))

# A precision-recall curve needs continuous scores, not hard 0/1 labels:
# feeding predict() output yields a single operating point. from_estimator
# pulls the scores from the fitted model itself.
# The result is bound to `pr_display` (not `display`) so IPython's built-in
# display() function is not shadowed for the rest of the notebook.
pr_display = PrecisionRecallDisplay.from_estimator(clf, X_test, y_test)
_ = pr_display.ax_.set_title("Precision-Recall curve")

In [None]:
import matplotlib.pyplot as plt

# Importance reported by the fitted tree, labelled with the training columns.
feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns)

# Horizontal bars, least important at the bottom; label via the returned axes.
ax = feature_importances.sort_values().plot.barh()
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
plt.show()

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Forest of 1000 trees, built on all cores with a fixed seed and verbose logging.
rf = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)

# Train on the training split, then predict the held-out split.
y_pred = rf.fit(X_train, y_train).predict(X_test)

# Report held-out accuracy and the per-class metrics.
forest_accuracy = accuracy_score(y_test, y_pred)
forest_report = classification_report(y_test, y_pred)
print('Accuracy Score:', forest_accuracy)
print('Classification Report:\n', forest_report)



## Gradient boosted tree

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Boosted trees with non-default settings: up to 10000 iterations, a
# 100-iteration no-improvement window, verbose logging and a fixed seed.
# NOTE(review): n_iter_no_change only matters when early stopping is active,
# and early_stopping is left at its default here -- confirm it is actually
# enabled for this dataset.
gbc = HistGradientBoostingClassifier(
    max_iter=10000,
    verbose=1,
    n_iter_no_change=100,
    random_state=0,
)

# Train on the training split, then predict the held-out split.
y_pred = gbc.fit(X_train, y_train).predict(X_test)

# Report held-out accuracy and the per-class metrics.
boosting_accuracy = accuracy_score(y_test, y_pred)
boosting_report = classification_report(y_test, y_pred)
print('Accuracy Score:', boosting_accuracy)
print('Classification Report:\n', boosting_report)

# All data

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split of the balanced full-feature data, with a fixed seed.
all_features = balanced_df.drop(columns=['ARR_DEL15'])
all_labels = balanced_df['ARR_DEL15']
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    all_labels,
    test_size=0.2,
    stratify=all_labels,
    random_state=0,
)

## Regular decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Candidate tree depths 1..45.
param_grid = {'max_depth': list(range(1, 46))}

# Seed the tree like every other stochastic step in this notebook
# (random_state=0). An unseeded tree can break ties between equally good
# splits differently on each run, so the selected depth would not be
# reproducible.
clf = DecisionTreeClassifier(random_state=0)

# 5-fold cross-validated search over max_depth, using all cores.
grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=3, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Print the best depth found by the grid search
print("Best depth:", grid_search.best_params_['max_depth'])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, PrecisionRecallDisplay

# Refit a tree at the depth picked by the grid search. The seed matches the
# rest of the notebook so the reported metrics are reproducible.
clf = DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth'], random_state=0)
clf.fit(X_train, y_train)

# Hard class predictions on the held-out set, used for accuracy and the report.
y_pred = clf.predict(X_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print('Tree Depth: ', clf.get_depth())
print('Number of leaves: ', clf.get_n_leaves())
print('Accuracy:', accuracy)

# print the report
print(classification_report(y_test, y_pred))

# A precision-recall curve needs continuous scores, not hard 0/1 labels:
# feeding predict() output yields a single operating point. from_estimator
# pulls the scores from the fitted model itself.
# The result is bound to `pr_display` (not `display`) so IPython's built-in
# display() function is not shadowed for the rest of the notebook.
pr_display = PrecisionRecallDisplay.from_estimator(clf, X_test, y_test)
_ = pr_display.ax_.set_title("Precision-Recall curve")

In [None]:
import matplotlib.pyplot as plt
# Importance reported by the fitted tree, labelled with the training columns.
feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns)

# Horizontal bars, least important at the bottom; label via the returned axes.
ax = feature_importances.sort_values().plot.barh()
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
plt.show()

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Forest of 1000 trees, built on all cores with a fixed seed and verbose logging.
rf = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)

# Train on the training split, then predict the held-out split.
y_pred = rf.fit(X_train, y_train).predict(X_test)

# Report held-out accuracy and the per-class metrics.
forest_accuracy = accuracy_score(y_test, y_pred)
forest_report = classification_report(y_test, y_pred)
print('Accuracy Score:', forest_accuracy)
print('Classification Report:\n', forest_report)

## Gradient boosted tree

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Boosted trees with non-default settings: up to 10000 iterations, a
# 100-iteration no-improvement window, verbose logging and a fixed seed.
# NOTE(review): n_iter_no_change only matters when early stopping is active,
# and early_stopping is left at its default here -- confirm it is actually
# enabled for this dataset.
gbc = HistGradientBoostingClassifier(
    max_iter=10000,
    verbose=1,
    n_iter_no_change=100,
    random_state=0,
)

# Train on the training split, then predict the held-out split.
y_pred = gbc.fit(X_train, y_train).predict(X_test)

# Report held-out accuracy and the per-class metrics.
boosting_accuracy = accuracy_score(y_test, y_pred)
boosting_report = classification_report(y_test, y_pred)
print('Accuracy Score:', boosting_accuracy)
print('Classification Report:\n', boosting_report)