In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [20]:
# Prepare features (X) and target (y) from raw data.

#     Args:
#         data (DataFrame): The raw dataset as a Pandas DataFrame.
#         target_column (str): The name of the target column.

#     Returns:
#         X (DataFrame): Features.
#         y (Series): Target.

In [2]:
def load_data(file_path, target_column="target"):
    data = pd.read_csv(file_path)  # Use file_path to read the dataset
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return X, y

In [22]:
# Scale features using StandardScaler.
#     Args:
#         X_train (DataFrame): Training features.
#         X_test (DataFrame): Testing features.

#     Returns:
#         X_train_scaled (DataFrame): Scaled training features.
#         X_test_scaled (DataFrame): Scaled testing features.
#     """

In [3]:
def preprocess_data(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

In [24]:
# Train and evaluate a Logistic Regression model.

In [4]:
def train_logistic_regression(X_train, y_train, X_test, y_test):
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Logistic Regression Accuracy:", accuracy)
    print(classification_report(y_test, predictions))
    return accuracy

In [26]:
# Train and evaluate a Decision Tree model.

In [5]:
def train_decision_tree(X_train, y_train, X_test, y_test):
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train, y_train)
    predictions = dt.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Decision Tree Accuracy:", accuracy)
    print(classification_report(y_test, predictions))
    return accuracy

In [28]:
# Train and evaluate a K-Nearest Neighbors (KNN) model.

In [6]:
def train_knn(X_train, y_train, X_test, y_test):
    n_neighbors = min(5, len(X_train))
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("KNN Accuracy:", accuracy)
    print(classification_report(y_test, predictions, zero_division=0))
    return accuracy


In [16]:
def main_pipeline():

    # Define the dataset file path
    file_path = "expanded_dummy_stock_data.csv"  # Change this when using real data, this is just example

    # Load data
    X, y = load_data(file_path=file_path, target_column="target")

    # Split the data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Preprocess the data
    X_train, X_test = preprocess_data(X_train, X_test)

    # Train and evaluate models
    print("\n--- Logistic Regression ---")
    train_logistic_regression(X_train, y_train, X_test, y_test)

    print("\n--- Decision Tree ---")
    train_decision_tree(X_train, y_train, X_test, y_test)

    print("\n--- KNN ---")
    train_knn(X_train, y_train, X_test, y_test)


In [17]:
main_pipeline()


--- Logistic Regression ---
Logistic Regression Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4


--- Decision Tree ---
Decision Tree Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4


--- KNN ---
KNN Accuracy: 0.75
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.67      1.00      0.80         2

    accuracy                        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
