# Step 1: Importing packages

In [None]:
#Import everything necessary
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from xgboost import XGBClassifier, plot_importance
from itertools import product

# Step 2: Loading dataset

In [None]:
#Load the raw dataset and eyeball it before any processing
df = pd.read_csv("work_with_this_dataset.csv")

#Printed in order: the first 100 rows, the per-column summary statistics,
#and the distinct values of the 'Trend' column (the prediction target further down)
for sanity_view in (df.head(100), df.describe(), df["Trend"].unique()):
    print(sanity_view)

# Step 3: Pre-processing the dataset

In [None]:
#Pre-processing steps
#Min-max normalisation (every column rescaled to [0, 1]) plus an 80/20 train/test split.
#The split happens BEFORE the feature scaler is fitted, so the min/max used for scaling
#come from the training rows only and no information from the test rows leaks into training.

features_raw = df.drop('Trend', axis=1)

#Target encoding: min-max scaling the label column maps the smallest label to 0.0 and the
#largest to 1.0. This only relabels the classes, so it is done once on the full column.
target_scaler = MinMaxScaler()
y = pd.Series(target_scaler.fit_transform(df[['Trend']]).ravel(), index=df.index, name='Trend')

#Split (same test_size / random_state as before, so the same rows land in each partition)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(features_raw, y, test_size=0.2, random_state=42)

#Normalise: fit on the training features only, then apply the same transform to the test features.
#Test values outside the training min/max can therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=features_raw.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=features_raw.columns, index=X_test_raw.index)

#Full-dataset views, kept under their original names and column order
X = pd.DataFrame(scaler.transform(features_raw), columns=features_raw.columns, index=features_raw.index)
df_scaled = X.assign(Trend=y)[df.columns]
#print(df_scaled.head()) #This is to make sure everything happened correctly

# Step 4: Baseline model

In [None]:
#Baseline: logistic regression with default settings and a fixed seed, used as the reference score
model_LR = LogisticRegression(random_state=42)
model_LR.fit(X_train, y_train)
LR_pred = model_LR.predict(X_test)

#Test-set accuracy first, then precision / recall / F1, each computed with average='weighted'
print(accuracy_score(y_test, LR_pred))
for weighted_metric in (precision_score, recall_score, f1_score):
    print(weighted_metric(y_test, LR_pred, average='weighted'))

# Step 5.1: Random Forest Model

In [None]:
# Random Forest Model Hyperparameter Tuning
#
# Exhaustive grid search over four RandomForestClassifier hyperparameters.
# Candidates are ranked on a validation split carved out of the training data, so the test
# set plays no part in choosing the model. The winning configuration is then refitted on
# the full training set and scored on the test set exactly once.
# NOTE: any output recorded from earlier runs of this cell was produced when candidates
# were ranked by test-set accuracy; re-run the cell to refresh it.

n_estimators_range = range(150, 301, 10)
max_depth_range = range(20, 31, 5)
min_samples_split_range = [2, 5, 10]
min_samples_leaf_range = [1, 2, 4]

param_grid = product(n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range)


def _build_forest(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Return an unfitted random forest for one grid point (fixed seed, all CPU cores)."""
    return RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=42, n_jobs=-1)


# Hold out 20% of the training rows purely for model selection
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

best_accuracy = -1.0  # below any achievable accuracy, so the first candidate always becomes the incumbent
best_model = None
best_params = None

for n_estimators, max_depth, min_samples_split, min_samples_leaf in param_grid:
    model = _build_forest(n_estimators, max_depth, min_samples_split, min_samples_leaf)

    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    print(f"\nParams: n_estimators={n_estimators}, max_depth={max_depth}, "f"min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}")
    print("Validation Accuracy:", acc)

    # Strict '>' keeps the earliest grid point on ties
    if acc > best_accuracy:
        best_accuracy = acc
        best_params = (n_estimators, max_depth, min_samples_split, min_samples_leaf)

# Refit the winner on all training rows, then predict the test set a single time
best_model = _build_forest(*best_params)
best_model.fit(X_train, y_train)
best_test_pred = best_model.predict(X_test)

print(f"\nBest Model Parameters: {best_params}")
print(f"Best Validation Accuracy: {best_accuracy:.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, best_test_pred):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, best_test_pred))
print("Classification Report:\n", classification_report(y_test, best_test_pred))

Recorded output from a previous run of the tuning cell above (so you don't need to run it again):

Best Model Parameters: (300, 30, 10, 4)
Best Accuracy: 0.6907
Confusion Matrix:
 [[12166  8918]
 [ 4940 18782]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.71      0.58      0.64     21084
         1.0       0.68      0.79      0.73     23722

    accuracy                           0.69     44806
   macro avg       0.69      0.68      0.68     44806
weighted avg       0.69      0.69      0.69     44806

In [None]:
#Random forest model, using the hyperparameters chosen by the tuning step
#(n_estimators=300, max_depth=30, min_samples_split=10, min_samples_leaf=4).
#n_jobs=-1 matches the tuning cell: trees are built on all CPU cores. With the fixed
#random_state this only affects speed, not the fitted model.
model_RFC = RandomForestClassifier(n_estimators=300, max_depth=30, min_samples_split=10, min_samples_leaf=4, random_state=42, n_jobs=-1)
model_RFC.fit(X_train, y_train)
RFC_pred = model_RFC.predict(X_test)

#Test-set accuracy, then weighted precision / recall / F1.
#zero_division=1 scores precision as 1 (without a warning) for a class that is never predicted.
print(accuracy_score(y_test, RFC_pred))
print(precision_score(y_test, RFC_pred, average='weighted', zero_division=1))
print(recall_score(y_test, RFC_pred, average='weighted'))
print(f1_score(y_test, RFC_pred, average='weighted'))

Recorded output of the cell above (accuracy, weighted precision, weighted recall, weighted F1):

0.6907110654823014
0.6936552701085712
0.6907110654823014
0.6865660511611431

# Step 5.2: XGBoost Model

In [None]:
#Hyperparameter Tuning XGBoost
#
#Exhaustive grid search over n_estimators, max_depth, max_leaves and grow_policy.
#Candidates are ranked on a validation split carved out of the training data, so the test
#set plays no part in choosing the model. The winning configuration is then refitted on
#the full training set and scored on the test set exactly once.
#NOTE: any output recorded from earlier runs of this cell was produced when candidates
#were ranked by test-set accuracy; re-run the cell to refresh it.

#Search space. The *_range names keep the grid definition separate from the loop variables,
#so the grid is not overwritten by scalars once the loop has run.
n_estimators_range = range(50, 101, 10)
max_depth_range = range(3, 5)
max_leaves_range = range(0, 16)
grow_policy_options = ['depthwise', 'lossguide']

param_grid_XG = product(n_estimators_range, max_depth_range, max_leaves_range, grow_policy_options)


def _build_xgb(n_est, depth, leaves, policy):
    """Return an unfitted XGBClassifier for one grid point; 'lossguide' growth uses the 'hist' tree method."""
    return XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42, n_estimators=n_est, max_depth=depth, max_leaves=leaves, grow_policy=policy, tree_method='hist' if policy == 'lossguide' else 'auto')


#Hold out 20% of the training rows purely for model selection
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

best_accuracy = -1.0  #below any achievable accuracy, so the first candidate always becomes the incumbent
best_model = None
best_params = None

for n_est, depth, leaves, policy in param_grid_XG:
    #Only meaningful pairings are evaluated:
    #'depthwise' is tried solely with max_leaves=0, 'lossguide' solely with max_leaves>0
    if policy == 'depthwise' and leaves > 0:
        continue
    if policy == 'lossguide' and leaves == 0:
        continue

    model_XGB2 = _build_xgb(n_est, depth, leaves, policy)

    model_XGB2.fit(X_fit, y_fit)
    y_pred = model_XGB2.predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    print(f"\nParams: n_estimators={n_est}, max_depth={depth}, "f"max_leaves={leaves}, grow_policy={policy}")
    print("Validation Accuracy:", acc)

    #Strict '>' keeps the earliest grid point on ties
    if acc > best_accuracy:
        best_accuracy = acc
        best_params = (n_est, depth, leaves, policy)

#Refit the winner on all training rows, then score the test set a single time
best_model = _build_xgb(*best_params)
best_model.fit(X_train, y_train)

print(f"\nBest Model Params: {best_params}")
print(f"Best Validation Accuracy: {best_accuracy}")
print(f"Test Accuracy: {accuracy_score(y_test, best_model.predict(X_test))}")
plot_importance(best_model)
plt.tight_layout()
plt.show()


Final lines of the recorded output from a previous run of the tuning cell above:

Best Model Params: (100, 4, 15, 'lossguide')
Best Accuracy: 0.7048163192429585

In [None]:
#Hypertuned XGBoost model
#
#Uses the configuration reported by the tuning step: n_estimators=100, max_depth=4,
#max_leaves=15, grow_policy='lossguide'.
#tree_method is 'hist' because that is what the tuning cell used for every 'lossguide'
#candidate; leaving it on 'auto' would not be guaranteed to rebuild the tuned model.
#The deprecated use_label_encoder argument is omitted, matching the tuning cell.

model_XGB = XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42, n_estimators=100, max_depth=4, max_leaves=15, grow_policy='lossguide', tree_method='hist')
model_XGB.fit(X_train, y_train)
y_pred_XGB = model_XGB.predict(X_test)

#Test-set evaluation of the final model
print("Accuracy:", accuracy_score(y_test, y_pred_XGB))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_XGB))
print("Classification Report:\n", classification_report(y_test, y_pred_XGB))

#Feature-importance chart, finalised the same way as in the tuning cell
xgb.plot_importance(model_XGB)
plt.tight_layout()
plt.show()