In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def extract_features(file_path):
    df = pd.read_csv(file_path, header=None)
    features = []
    
    for row in df.values:
        # Basic statistical features
        features.extend([np.mean(row),                    # Mean
                         np.std(row),                     # Standard deviation
                         np.median(row),                  # Median
                         np.percentile(row, 25),          # 1st quartile
                         np.percentile(row, 75),          # 3rd quartile
                         skew(row),                       # Skewness
                         kurtosis(row)                    # Kurtosis
        ])
        
        # Add frequency domain features
        freq_features = np.fft.fft(row)
        features.extend([np.abs(freq_features).mean(),    # Mean magnitude in frequency domain
                         np.abs(freq_features).std()      # Std of magnitude in frequency domain
        ])
    
    return features

def load_and_preprocess_data(train_folder, test_folder):
    
    # Load training data
    X_train = []
    y_train = []
    for file_name in os.listdir(train_folder):
        if file_name.endswith('.csv'):
            label = int(file_name.split('-')[0])
            file_path = os.path.join(train_folder, file_name)
            features = extract_features(file_path)
            X_train.append(features)
            y_train.append(label)
    
    # Load test data
    X_test = []
    test_file_names = []
    for file_name in os.listdir(test_folder):
        if file_name.endswith('.csv'):
            test_file_names.append(file_name)
            file_path = os.path.join(test_folder, file_name)
            features = extract_features(file_path)
            X_test.append(features)
    
    # Convert to numpy arrays
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Feature selection
    selector = SelectKBest(score_func=f_classif, k='all')
    selector.fit(X_train_scaled, y_train)
    feature_scores = pd.DataFrame({
        'Feature': range(X_train_scaled.shape[1]),
        'Score': selector.scores_
    })
    
    # Select top 50% of features
    k = X_train_scaled.shape[1] // 2
    top_features = feature_scores.nlargest(k, 'Score')['Feature'].values
    X_train_selected = X_train_scaled[:, top_features]
    X_test_selected = X_test_scaled[:, top_features]
    
    return X_train_selected, X_test_selected, y_train, test_file_names, feature_scores

def train_optimized_model(X_train, y_train):
    
    # Define parameter grid
    param_grid = {
        'n_estimators': [100],
        'max_depth': [None],
        'min_samples_split': [2],
        'min_samples_leaf': [2],
        'class_weight': [None]
    }
    
    # Initialize base model
    rf = RandomForestClassifier(random_state=42)
    
    # Perform grid search
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    
    # Fit the model
    grid_search.fit(X_train, y_train)
    
    print("Best parameters:", grid_search.best_params_)
    return grid_search.best_estimator_

# Load data and train model
train_folder = 'train'
test_folder = 'test'

X_train, X_test, y_train, test_file_names, feature_scores = load_and_preprocess_data(
    train_folder, test_folder
)

model = train_optimized_model(X_train, y_train)

# Predict class probabilities
y_pred_proba = model.predict_proba(X_test)

# Display top 5 files with highest probability for each class
top_n = 5
top_files_per_class = {}

for class_id in np.unique(y_train):  # Iterate over all classes
    class_indices = np.where(y_pred_proba[:, class_id] > 0)[0]  # Find the indices for the current class
    class_probs = y_pred_proba[class_indices, class_id]  # Get the probabilities for this class
    class_files = [test_file_names[i] for i in class_indices]  # Get the corresponding file names
    
    # Sort indices by probabilities (descending order) and get the top N files
    top_files_indices = np.argsort(class_probs)[-top_n:][::-1]
    top_files = [(class_files[i], class_probs[i]) for i in top_files_indices]
    
    top_files_per_class[class_id] = top_files

# Print the top 5 files per class
for class_id, top_files in top_files_per_class.items():
    print(f"\nClass {class_id} - Top 5 files with highest probability:")
    for file, prob in top_files:
        print(f"{file}: Probability = {prob:.4f}")

# Save feature importance analysis
feature_scores.to_csv('feature_importance.csv', index=False)

# Save predictions
results_df = pd.DataFrame({
    'Song Number': test_file_names,
    'Predicted Probabilities': [list(prob) for prob in y_pred_proba]
})
results_df.to_excel('predicted_probabilities.xlsx', index=False)


Best parameters: {'class_weight': None, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Class 0 - Top 5 files with highest probability:
87.csv: Probability = 1.0000
90.csv: Probability = 1.0000
81.csv: Probability = 1.0000
108.csv: Probability = 1.0000
16.csv: Probability = 0.9940

Class 1 - Top 5 files with highest probability:
40.csv: Probability = 0.7018
113.csv: Probability = 0.5558
104.csv: Probability = 0.5125
12.csv: Probability = 0.4938
32.csv: Probability = 0.4737

Class 2 - Top 5 files with highest probability:
47.csv: Probability = 0.6942
73.csv: Probability = 0.4852
53.csv: Probability = 0.4768
70.csv: Probability = 0.4668
89.csv: Probability = 0.4475

Class 3 - Top 5 files with highest probability:
46.csv: Probability = 0.7983
5.csv: Probability = 0.7558
9.csv: Probability = 0.7529
93.csv: Probability = 0.7190
97.csv: Probability = 0.7168

Class 4 - Top 5 files with highest probability:
106.csv: Probability = 0.6588
38.csv: Probabili