In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV

In [20]:
# Load a CSV file and split it into features (X) and target (y).

#     Args:
#         file_path (str): Path to the CSV file containing the raw dataset.
#         target_column (str): The name of the target column.

#     Returns:
#         X (DataFrame): Features.
#         y (Series): Target.

In [2]:
def load_data(file_path, target_column="target"):
    """Read a CSV file and separate the target column from the features.

    Args:
        file_path: Location of the CSV file; passed straight to ``pd.read_csv``.
        target_column (str): Name of the column to treat as the target.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the DataFrame with ``target_column``
        removed and ``y`` is that column as a Series.

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded file.
    """
    frame = pd.read_csv(file_path)
    target = frame[target_column]
    features = frame.drop(columns=[target_column])
    return features, target

In [22]:
# Scale features using StandardScaler.
#     Args:
#         X_train (DataFrame): Training features.
#         X_test (DataFrame): Testing features.

#     Returns:
#         X_train_scaled (ndarray): Scaled training features.
#         X_test_scaled (ndarray): Scaled testing features.

In [3]:
def preprocess_data(X_train, X_test):
    """Standardize both splits with statistics learned from the training split.

    The scaler is fitted on ``X_train`` only and then applied to both splits,
    so the test data never influences the scaling parameters.

    Args:
        X_train: Training features.
        X_test: Testing features.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)`` as produced by
        ``StandardScaler.transform``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

In [None]:
# Evaluation function with metrics to analyze any trained model
# Prints the confusion matrix, accuracy, and the classification report (precision, recall, F1 score, and related metrics)

In [None]:
def evaluate_predictions(y_true, y_pred):
    """Print a standard set of classification metrics for one set of predictions.

    Writes, in order: the confusion matrix, the accuracy (four decimals) and
    scikit-learn's classification report. Nothing is returned.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by a model.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:", matrix, sep="\n")

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    print("Classification Report:")
    # zero_division=0 reports 0 for undefined metrics instead of warning.
    report = classification_report(y_true, y_pred, zero_division=0)
    print(report)

In [24]:
# Train and evaluate a Logistic Regression model.

In [4]:
def train_logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a default LogisticRegression and report its test-set metrics.

    Reporting goes through ``evaluate_predictions`` so the output matches the
    other ``train_*`` functions (confusion matrix, accuracy, and a
    classification report computed with ``zero_division=0``).

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.

    Returns:
        float: Accuracy of the fitted model on the test split.
    """
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    evaluate_predictions(y_test, predictions)
    return accuracy

In [None]:
# Return the best decision tree by using GridSearchCV to get best max_depth, min_samples_leaf, and max_features

In [None]:
def get_best_decision_tree(X_train, y_train):
    """Grid-search a DecisionTreeClassifier and return the best estimator.

    Searches ``max_depth``, ``min_samples_leaf`` and ``max_features`` with
    5-fold cross-validation scored on accuracy.

    Args:
        X_train: Training features (2-D: samples x features).
        y_train: Training labels.

    Returns:
        DecisionTreeClassifier: ``best_estimator_`` of the grid search.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in numpy.
    import numpy as np

    dt = DecisionTreeClassifier(random_state=42)

    # An integer max_features larger than the number of columns is rejected by
    # the tree, so clamp each candidate to the actual feature count. The set
    # removes duplicates created by clamping; with 15 or more features the
    # grid is the original [5, 10, 15].
    n_features = np.shape(X_train)[1]
    max_features_candidates = sorted({min(k, n_features) for k in (5, 10, 15)})

    param_grid = {
        'max_depth': [5, 10, 15, 20],
        'min_samples_leaf': [5, 10, 15, 20],
        'max_features': max_features_candidates
    }
    grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    best_dt = grid_search.best_estimator_
    return best_dt

In [26]:
# Train and evaluate a Decision Tree model.

In [5]:
def train_decision_tree(X_train, y_train, X_test, y_test):
    """Tune a decision tree, then report its test-set metrics.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.

    Returns:
        float: Accuracy of the tuned tree on the test split.
    """
    # get_best_decision_tree returns GridSearchCV's best_estimator_, which the
    # search has already refit on the full training data (refit defaults to
    # True), so no additional fit call is needed here.
    dt = get_best_decision_tree(X_train, y_train)
    predictions = dt.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    evaluate_predictions(y_test, predictions)
    return accuracy

In [None]:
# Return the best KNN model by using GridSearchCV to choose the best K value and neighbor weighting scheme

In [None]:
def get_best_knn(X_train, y_train):
    """Grid-search a KNeighborsClassifier and return the best estimator.

    Searches ``n_neighbors`` (1 up to at most 10) and ``weights`` with 5-fold
    cross-validation scored on accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        KNeighborsClassifier: ``best_estimator_`` of the grid search.
    """
    cv_folds = 5
    n_samples = len(X_train)

    # During cross-validation each model is fit on one training fold, not on
    # all of X_train. K must not exceed the smallest training fold, i.e. the
    # sample count minus the largest held-out fold (ceil(n / folds)).
    # For 13 or more samples this still yields the original upper bound of 10.
    largest_holdout = -(-n_samples // cv_folds)  # ceiling division
    max_k = max(1, min(10, n_samples - largest_holdout))

    knn = KNeighborsClassifier(n_neighbors=min(5, len(X_train)))
    param_grid = {
        'n_neighbors': range(1, max_k + 1),
        'weights': ['uniform', 'distance'],
    }
    grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=cv_folds, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)
    best_knn = grid_search.best_estimator_
    return best_knn

In [None]:
# Train and evaluate a K-Nearest Neighbors (KNN) model. 
# Enhanced by using Grid Search CV to choose the best K value.

In [None]:
def train_knn(X_train, y_train, X_test, y_test):
    """Tune a KNN classifier, print its test-set metrics and return the accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.

    Returns:
        float: Accuracy of the tuned KNN model on the test split.
    """
    tuned_model = get_best_knn(X_train, y_train)
    y_pred = tuned_model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    evaluate_predictions(y_test, y_pred)
    return score


In [None]:
def get_best_svm(X_train, y_train):
    """Grid-search an SVC and return the best estimator.

    The search space is split per kernel so that only parameters the kernel
    actually uses are varied: ``C`` for linear, ``C`` and ``gamma`` for rbf,
    and ``C``, ``gamma`` and ``degree`` for poly. This covers the same
    distinct models as a full cross product of all four parameters, without
    refitting duplicates that differ only in an ignored parameter.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        SVC: ``best_estimator_`` of the grid search (5-fold CV, accuracy).
    """
    svm = SVC(random_state=42)
    c_values = [0.1, 1, 10, 100]
    gamma_values = ['scale', 'auto']
    param_grid = [
        {'kernel': ['linear'], 'C': c_values},
        {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
        # degree is only meaningful for the polynomial kernel
        {'kernel': ['poly'], 'C': c_values, 'gamma': gamma_values, 'degree': [3, 4, 5]},
    ]
    grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_svm = grid_search.best_estimator_
    return best_svm

In [None]:
def train_svm(X_train, y_train, X_test, y_test):
    """Tune an SVM classifier, print its test-set metrics and return the accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.

    Returns:
        float: Accuracy of the tuned SVM on the test split.
    """
    tuned_model = get_best_svm(X_train, y_train)
    y_pred = tuned_model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    evaluate_predictions(y_test, y_pred)
    return score

In [None]:
def train_knn_svm_ensemble(X_train, y_train, X_test, y_test):
    """Stack a tuned KNN under a tuned SVM and report test-set metrics.

    The tuned KNN is the only base learner; the tuned SVM is passed as the
    stacking classifier's final estimator.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.

    Returns:
        float: Accuracy of the stacked model on the test split.
    """
    # NOTE(review): the SVM's hyperparameters are tuned on X_train, but as the
    # final estimator it is presumably trained on the KNN's predictions rather
    # than on those features -- confirm this is the intended design.
    stacked = StackingClassifier(
        estimators=[('knn', get_best_knn(X_train, y_train))],
        final_estimator=get_best_svm(X_train, y_train),
    )
    stacked.fit(X_train, y_train)
    y_pred = stacked.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    evaluate_predictions(y_test, y_pred)
    return score

In [None]:
def get_best_random_forest(X_train, y_train):
    """Grid-search a RandomForestClassifier and return the best estimator.

    Searches the number of trees, tree depth, split/leaf size limits and
    whether bootstrapping is used, with 5-fold cross-validation scored on
    accuracy and all available cores (``n_jobs=-1``).

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        RandomForestClassifier: ``best_estimator_`` of the grid search.
    """
    search_space = {
        'n_estimators': [50, 100, 150],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }
    tuner = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)
    return tuner.best_estimator_

In [None]:
def train_random_forest(X_train, y_train, X_test, y_test):
    """Tune a random forest, print its test-set metrics and return the accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.

    Returns:
        float: Accuracy of the tuned forest on the test split.
    """
    tuned_model = get_best_random_forest(X_train, y_train)
    y_pred = tuned_model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    evaluate_predictions(y_test, y_pred)
    return score

In [None]:
def get_best_mlp(X_train, y_train):
    """Grid-search an MLPClassifier and return the best estimator.

    Searches layer layout, activation, learning-rate schedule and iteration
    budget with 5-fold cross-validation scored on accuracy, then prints the
    winning parameter combination.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        MLPClassifier: ``best_estimator_`` of the grid search.
    """
    search_space = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
        'activation': ['relu', 'tanh'],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [200, 300],
    }
    tuner = GridSearchCV(
        estimator=MLPClassifier(random_state=42),
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)
    winner = tuner.best_estimator_
    print("Best Parameters for MLP:", tuner.best_params_)
    return winner

In [None]:
def train_mlp(X_train, y_train, X_test, y_test):
    """Tune an MLP classifier, print its test-set metrics and return the accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels.

    Returns:
        float: Accuracy of the tuned MLP on the test split.
    """
    tuned_model = get_best_mlp(X_train, y_train)
    y_pred = tuned_model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    evaluate_predictions(y_test, y_pred)
    return score

In [None]:
def model_pipeline(file_path, target_column="target"):
    """Load a dataset, split and scale it, then train and evaluate every model.

    The data is split 80/20 with a fixed seed, scaled with statistics from the
    training portion, and passed to each ``train_*`` function in turn. Each
    model's metrics are printed under its own heading; nothing is returned.

    Args:
        file_path: Path to the CSV file to load.
        target_column (str): Name of the target column in that file.
    """
    features, target = load_data(file_path, target_column)
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split
    X_train_scaled, X_test_scaled = preprocess_data(X_train, X_test)

    print("Training and Evaluating Models:\n")

    # (heading, trainer) pairs, run in this order.
    stages = (
        ("\nLogistic Regression:", train_logistic_regression),
        ("\nDecision Tree:", train_decision_tree),
        ("\nKNN:", train_knn),
        ("\nSVM:", train_svm),
        ("\nKNN + SVM Ensemble:", train_knn_svm_ensemble),
        ("\nRandom Forest:", train_random_forest),
        ("\nMLP:", train_mlp),
    )
    for heading, trainer in stages:
        print(heading)
        trainer(X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
# Run the full train-and-evaluate pipeline on the sample dataset
# (uses the default target column, "target").
model_pipeline(file_path="expanded_dummy_stock_data.csv")