In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

KeyboardInterrupt: 

In [8]:
def load_data(file_path):
    """Read a CSV file into a DataFrame and echo its column names.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame parsed from the file.
    """
    frame = pd.read_csv(file_path)
    column_names = frame.columns.tolist()
    # Echo the header so a wrong or renamed column is visible immediately.
    print("Loaded DataFrame columns:", column_names)
    return frame

def preprocess_data(df):
    """Select the sixteen ``fr1``..``fr16`` feature columns and standardise them.

    Args:
        df: DataFrame that must contain every column ``fr1`` through ``fr16``.
            Any other columns are ignored.

    Returns:
        A new DataFrame holding the same sixteen columns after
        ``StandardScaler.fit_transform``, carrying the row index of ``df``.

    Raises:
        ValueError: If one of the required columns is missing; the message
            names the first missing column.
    """
    # fr1 .. fr16, in order.
    desired_columns = [f'fr{i}' for i in range(1, 17)]

    # Fail early with a readable message instead of a KeyError from indexing.
    for col in desired_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame.")

    features = df[desired_columns]

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(features)

    # Re-attach the input's index. Without it the result gets a fresh
    # 0..n-1 index, which no longer lines up with labels taken from the same
    # frame once rows have been filtered or re-ordered.
    return pd.DataFrame(scaled_values, columns=features.columns, index=features.index)

In [9]:
def train_model(X, y, n_estimators=100, max_depth=None, random_state=42):
    """Fit a ``RandomForestClassifier`` on the given data and return it.

    Args:
        X: Feature matrix passed to ``fit``.
        y: Target labels passed to ``fit``.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth; ``None`` is forwarded unchanged.
        random_state: Seed forwarded to the classifier. Defaults to 42, the
            value that was previously hard-coded, so existing calls are
            unaffected.

    Returns:
        The fitted classifier.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X, y)
    return model

In [10]:
def evaluate_model(model, X, y):
    """Score a fitted model's predictions on ``X`` against the labels ``y``.

    Args:
        model: Object exposing ``predict(X)``.
        X: Features to predict on.
        y: True labels for ``X``.

    Returns:
        dict with keys ``'mse'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1'``. Precision, recall and F1 use weighted
        averaging with ``zero_division=0``.
    """
    predictions = model.predict(X)

    # Shared settings for the three averaged metrics.
    weighted = {'average': 'weighted', 'zero_division': 0}

    scores = {}
    # NOTE(review): MSE is computed directly on the class labels, so it is
    # only meaningful if the labels are numeric - confirm against the data.
    scores['mse'] = mean_squared_error(y, predictions)
    scores['accuracy'] = accuracy_score(y, predictions)
    scores['precision'] = precision_score(y, predictions, **weighted)
    scores['recall'] = recall_score(y, predictions, **weighted)
    scores['f1'] = f1_score(y, predictions, **weighted)
    return scores

: 

In [None]:
def main(data_path='../Datasets/RefSeq.csv', target_column='class', n_splits=5):
    """Grid-search a Random Forest with K-fold cross-validation, then refit on all data.

    For every (n_estimators, max_depth) combination the model is trained and
    scored on each fold, the fold metrics are averaged and printed, and the
    combination with the highest mean accuracy is refit on the full dataset.

    Args:
        data_path: CSV file to load. Defaults to the path used previously.
        target_column: Name of the label column in the CSV.
        n_splits: Number of cross-validation folds.

    Raises:
        ValueError: If ``target_column`` is not present in the loaded data.
    """
    df = load_data(data_path)

    # Validate the label column BEFORE using it. Checking afterwards is dead
    # code, because dropping or indexing a missing column raises KeyError first.
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' not found in DataFrame.")

    X = preprocess_data(df.drop(target_column, axis=1))
    y = df[target_column]

    # Hyperparameter grid
    n_estimators_list = [50, 100, 200]
    max_depths = [None, 5, 10]
    metric_names = ('mse', 'accuracy', 'precision', 'recall', 'f1')

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    results = {}
    for n_estimators in n_estimators_list:
        for max_depth in max_depths:
            fold_scores = []
            for train_index, test_index in kf.split(X):
                # Positional indexing: the fold indices are row positions.
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                model = train_model(X_train, y_train, n_estimators=n_estimators, max_depth=max_depth)
                fold_scores.append(evaluate_model(model, X_test, y_test))

            # Average over the folds actually run rather than a literal 5, so
            # the means stay correct if the number of splits changes.
            results[(n_estimators, max_depth)] = {
                f'avg_{name}': sum(score[name] for score in fold_scores) / len(fold_scores)
                for name in metric_names
            }

    # Print results
    print("Cross-validation Results:")
    for params, stats in results.items():
        print(f"n_estimators={params[0]}, max_depth={params[1]}: "
              f"MSE={stats['avg_mse']:.4f}, "
              f"Accuracy={stats['avg_accuracy']:.4f}, "
              f"Precision={stats['avg_precision']:.4f}, "
              f"Recall={stats['avg_recall']:.4f}, "
              f"F1 Score={stats['avg_f1']:.4f}")

    # Select by classification accuracy. The squared distance between class
    # labels depends on how the classes happen to be numbered, so the lowest
    # MSE is not a reliable criterion for choosing a classifier.
    best_params = max(results, key=lambda params: results[params]['avg_accuracy'])
    final_model = train_model(X, y, n_estimators=best_params[0], max_depth=best_params[1])

    # The final model is scored on the same rows it was fitted on, so these
    # numbers are optimistic. The cross-validation results above are the
    # estimate of performance on unseen data.
    final_eval = evaluate_model(final_model, X, y)
    print("\nFinal Model Evaluation (on training data):")
    print(f"MSE: {final_eval['mse']:.4f}")
    print(f"Accuracy: {final_eval['accuracy']:.4f}")
    print(f"Precision: {final_eval['precision']:.4f}")
    print(f"Recall: {final_eval['recall']:.4f}")
    print(f"F1 Score: {final_eval['f1']:.4f}")

# Run the full pipeline when this code is executed directly (as a script or
# a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

Loaded DataFrame columns: ['fr1', 'fr2', 'fr3', 'fr4', 'fr5', 'fr6', 'fr7', 'fr8', 'fr9', 'fr10', 'fr11', 'fr12', 'fr13', 'fr14', 'fr15', 'fr16', 'class']
