## Decision Tree Classifier - Beta

In [2]:
import pandas as  pd

# Initial model: use the biomarkers shared by the fold-change and chi-squared feature lists (reported as 111)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load the data
# `data` is expected to carry the 'stage' and 'subtype' label columns next to
# the feature columns (both names are excluded from the feature list below).
data = pd.read_csv('../processed_data/miRNA_stage_subtype.csv')
fold_change = pd.read_csv('../results/fold_change_results.csv')
chi_squared = pd.read_csv('../results/chi_squared_features.csv')

# Number of leading rows taken from each feature-ranking file.
# NOTE(review): this presumes both files are already sorted best-first — confirm.
TOP_N_FEATURES = 500

# Get overlapping features
fold_change_features = set(fold_change['Unnamed: 0'].iloc[:TOP_N_FEATURES])
chi_squared_features = set(chi_squared['Feature'].iloc[:TOP_N_FEATURES])
# Sort the intersection: iteration order of a set of strings changes between
# interpreter sessions (hash randomization), which would otherwise make the
# column order of the feature matrix differ from run to run.
# key=str keeps the sort from failing if a non-string value (e.g. NaN) slips in.
overlapping_features = sorted(fold_change_features & chi_squared_features, key=str)

# Filter features: keep only names present in `data`, never the label columns.
label_columns = {'stage', 'subtype'}
miRNA_features = [
    feature for feature in overlapping_features
    if feature in data.columns and feature not in label_columns
]

# Fixed seed so that the train/test split and the fitted tree — and therefore
# the printed accuracies — are identical on every Restart & Run All.
SEED = 42

# Test on both stage and subtype
targets = ['stage', 'subtype']
for target in targets:
    print(f"Testing classification on: {target}")

    # Prepare data: same feature matrix for both targets, only the label changes.
    X = data[miRNA_features]
    y = data[target]

    # Split data and train model (80% train / 20% test).
    # NOTE(review): the split is not stratified; consider stratify=y if every
    # class has at least two samples — confirm against the label distribution.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED
    )
    # random_state is also needed here: the tree permutes candidate features
    # at each split, so ties can be broken differently between runs.
    clf = DecisionTreeClassifier(max_depth=5, random_state=SEED)
    clf.fit(X_train, y_train)

    # Evaluate model: mean accuracy on the held-out 20%.
    accuracy = clf.score(X_test, y_test)
    print(f"Baseline model accuracy for {target}: {accuracy:.2f}")

Testing classification on: stage
Baseline model accuracy for stage: 0.47
Testing classification on: subtype
Baseline model accuracy for subtype: 0.49
