In [1]:
import pandas as pd
import numpy as np
import glob
import os, re, sys

### This processes the metrics collected by the Test script.

In [2]:
""" This loads all 499 CSV metrics files into pandas. """

# Find the metrics directory.
current_dir = os.getcwd()
metrics_dir = os.path.join(current_dir, "metrics")

# Identify all relevant CSV files.
pattern = os.path.join(metrics_dir, "xgb_df*_metrics.csv")
csv_files = glob.glob(pattern)
print(f"Found {len(csv_files)} files:")
for file in csv_files:
    print(f"  - {os.path.basename(file)}")

# Sequentially read all CSV files and collect the frames in a list.
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
    print(f"read: {os.path.basename(file)}")

# Merge once, after every file has been read. Doing the concat/groupby/export
# inside the read loop repeats the whole merge and rewrites the output file
# for every input file.
if dfs:
    # Concatenate.
    merged_df = pd.concat(dfs, ignore_index=True)

    # Group by the 'Model' column and average the metrics per model.
    grouped_df = merged_df.groupby('Model').mean().reset_index()

    # Export to CSV.
    output_file = os.path.join(current_dir, "metrics/merged_metrics.csv")
    grouped_df.to_csv(output_file, index=False)
    print(f"Merged saved as: {output_file}")
else:
    # pd.concat raises on an empty list, so report the situation instead.
    print("No metrics files found; nothing to merge.")

Found 499 files:
  - xgb_df310_metrics.csv
  - xgb_df409_metrics.csv
  - xgb_df159_metrics.csv
  - xgb_df272_metrics.csv
  - xgb_df360_metrics.csv
  - xgb_df479_metrics.csv
  - xgb_df129_metrics.csv
  - xgb_df484_metrics.csv
  - xgb_df202_metrics.csv
  - xgb_df177_metrics.csv
  - xgb_df427_metrics.csv
  - xgb_df74_metrics.csv
  - xgb_df107_metrics.csv
  - xgb_df457_metrics.csv
  - xgb_df89_metrics.csv
  - xgb_df352_metrics.csv
  - xgb_df95_metrics.csv
  - xgb_df68_metrics.csv
  - xgb_df230_metrics.csv
  - xgb_df322_metrics.csv
  - xgb_df196_metrics.csv
  - xgb_df18_metrics.csv
  - xgb_df240_metrics.csv
  - xgb_df46_metrics.csv
  - xgb_df381_metrics.csv
  - xgb_df498_metrics.csv
  - xgb_df465_metrics.csv
  - xgb_df135_metrics.csv
  - xgb_df36_metrics.csv
  - xgb_df415_metrics.csv
  - xgb_df145_metrics.csv
  - xgb_df293_metrics.csv
  - xgb_df53_metrics.csv
  - xgb_df394_metrics.csv
  - xgb_df120_metrics.csv
  - xgb_df470_metrics.csv
  - xgb_df369_metrics.csv
  - xgb_df23_metrics.csv
  - 

In [3]:
# Reload the merged metrics from the CSV export.
merged_metrics_csv = 'metrics/merged_metrics.csv'
merged = pd.read_csv(merged_metrics_csv)

### First, sort overall metrics.

In [4]:
# Order the models from lowest to highest accuracy (ascending is the default).
sorted_merged = merged.sort_values(by='Accuracy')

# Write the ordered table to its own CSV, leaving out the row index.
sorted_accuracy_csv = 'metrics/sorted_accuracy.csv'
sorted_merged.to_csv(sorted_accuracy_csv, index=False)
print("Saved.")

Saved.


In [5]:
# Order the models from lowest to highest precision (ascending is the default).
sorted_merged = merged.sort_values(by='Precision')

# Write the ordered table to its own CSV, leaving out the row index.
sorted_precision_csv = 'metrics/sorted_precision.csv'
sorted_merged.to_csv(sorted_precision_csv, index=False)
print("Saved.")

Saved.


In [6]:
# List the column names of the merged metrics frame.
print(merged.columns)

Index(['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score',
       'Overall MCC'],
      dtype='object')


In [7]:
# Order the models from lowest to highest recall (ascending is the default).
sorted_merged = merged.sort_values(by='Recall')

# Write the ordered table to its own CSV, leaving out the row index.
sorted_recall_csv = 'metrics/sorted_recall.csv'
sorted_merged.to_csv(sorted_recall_csv, index=False)
print("Saved.")

Saved.


In [8]:
# Order the models from lowest to highest F1 score (ascending is the default).
sorted_merged = merged.sort_values(by='F1 Score')

# Write the ordered table to its own CSV, leaving out the row index.
sorted_f1_csv = 'metrics/sorted_f1.csv'
sorted_merged.to_csv(sorted_f1_csv, index=False)
print("Saved.")

Saved.


In [9]:
# Order the models from lowest to highest ROC AUC score (ascending is the default).
sorted_merged = merged.sort_values(by='ROC AUC Score')

# Write the ordered table to its own CSV, leaving out the row index.
sorted_roc_csv = 'metrics/sorted_roc.csv'
sorted_merged.to_csv(sorted_roc_csv, index=False)
print("Saved.")

Saved.


In [10]:
# Order the models from lowest to highest overall MCC (ascending is the default).
sorted_merged = merged.sort_values(by='Overall MCC')

# Write the ordered table to its own CSV, leaving out the row index.
sorted_mcc_csv = 'metrics/sorted_mcc.csv'
sorted_merged.to_csv(sorted_mcc_csv, index=False)
print("Saved.")

Saved.


### Next, load and sort the class-wise metrics.

In [11]:
# Resolve the "metrics" folder under the working directory and its "report" subfolder.
current_dir = os.getcwd()
metrics_dir = os.path.join(current_dir, "metrics")
report_dir = os.path.join(metrics_dir, "report")

# Gather the report files that match the expected naming scheme.
report_glob = os.path.join(report_dir, "xgb_df*_report.csv")
csv_files = glob.glob(report_glob)
print(f"Found {len(csv_files)} files.")

# Start with an empty collection of records.
data = list()

def extract_precision(content, class_name):
    """
    Extract PREC metrics.

    Searches ``content`` for the first occurrence of ``class_name`` followed
    by whitespace and a decimal number, and returns that number. This assumes
    the report lists precision as the first number after the class label.

    Args:
        content: Text of a report file.
        class_name: Class label to look up (e.g. "GOF"); matched literally.

    Returns:
        The matched value as a float, or None when no match is found.
    """
    # A raw string keeps "\s" and "\d" as regex tokens instead of invalid
    # string escapes; re.escape stops regex metacharacters in the label from
    # altering the pattern.
    pattern = rf"{re.escape(class_name)}\s+(\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Iterate over each file.
for file in csv_files:
    file_name = os.path.basename(file)

    # Parse the model number from the file name only: matching against the
    # full path could pick up a "df<N>_report" fragment in a parent directory.
    number_match = re.search(r'df(\d+)_report', file_name)
    if number_match is None:
        print(f"Error! Could not parse a model number from: {file_name}")
        continue
    model_number = int(number_match.group(1))

    try:
        with open(file, 'r') as f:
            content = f.read()

        # Get PREC for each class.
        gof_precision = extract_precision(content, "GOF")
        lof_precision = extract_precision(content, "LOF")
        neutral_precision = extract_precision(content, "Neutral")

        # Add to list.
        data.append({
            'Model': model_number,
            'GOF Precision': gof_precision,
            'LOF Precision': lof_precision,
            'Neutral Precision': neutral_precision
        })
        print(f"processed: {file_name}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error! {file_name}: {str(e)}")

# Compile into df.
merged_df = pd.DataFrame(data)

if merged_df.empty:
    print("Error!")
else:
    # Index by model number and order the rows by it before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    output_file = os.path.join(metrics_dir, "merged_precision_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")

Found 499 files.
processed: xgb_df85_report.csv
processed: xgb_df381_report.csv
processed: xgb_df295_report.csv
processed: xgb_df481_report.csv
processed: xgb_df208_report.csv
processed: xgb_df161_report.csv
processed: xgb_df156_report.csv
processed: xgb_df449_report.csv
processed: xgb_df18_report.csv
processed: xgb_df103_report.csv
processed: xgb_df349_report.csv
processed: xgb_df134_report.csv
processed: xgb_df289_report.csv
processed: xgb_df99_report.csv
processed: xgb_df182_report.csv
processed: xgb_df455_report.csv
processed: xgb_df337_report.csv
processed: xgb_df33_report.csv
processed: xgb_df300_report.csv
processed: xgb_df462_report.csv
processed: xgb_df223_report.csv
processed: xgb_df214_report.csv
processed: xgb_df66_report.csv
processed: xgb_df355_report.csv
processed: xgb_df128_report.csv
processed: xgb_df437_report.csv
processed: xgb_df400_report.csv
processed: xgb_df362_report.csv
processed: xgb_df51_report.csv
processed: xgb_df241_report.csv
processed: xgb_df276_report.c

In [12]:
# Resolve the "metrics" folder under the working directory and its "report" subfolder.
current_dir = os.getcwd()
metrics_dir = os.path.join(current_dir, "metrics")
report_dir = os.path.join(metrics_dir, "report")

# Gather the report files that match the expected naming scheme.
report_glob = os.path.join(report_dir, "xgb_df*_report.csv")
csv_files = glob.glob(report_glob)
print(f"Found {len(csv_files)} files.")

# Start with an empty collection of records.
data = list()

def extract_recall(content, class_name):
    """
    Extract REC metrics.

    Searches ``content`` for the first occurrence of ``class_name`` followed
    by two whitespace-separated decimal numbers and returns the second one.
    This assumes the report lists recall as the second number after the
    class label.

    Args:
        content: Text of a report file.
        class_name: Class label to look up (e.g. "GOF"); matched literally.

    Returns:
        The matched value as a float, or None when no match is found.
    """
    # A raw string keeps "\s" and "\d" as regex tokens instead of invalid
    # string escapes; re.escape stops regex metacharacters in the label from
    # altering the pattern.
    pattern = rf"{re.escape(class_name)}\s+\d+\.\d+\s+(\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Iterate over each file.
for file in csv_files:
    file_name = os.path.basename(file)

    # Parse the model number from the file name only: matching against the
    # full path could pick up a "df<N>_report" fragment in a parent directory.
    number_match = re.search(r'df(\d+)_report', file_name)
    if number_match is None:
        print(f"Error! Could not parse a model number from: {file_name}")
        continue
    model_number = int(number_match.group(1))

    try:
        with open(file, 'r') as f:
            content = f.read()

        # Get REC for each class.
        gof_recall = extract_recall(content, "GOF")
        lof_recall = extract_recall(content, "LOF")
        neutral_recall = extract_recall(content, "Neutral")

        # Add to list.
        data.append({
            'Model': model_number,
            'GOF Recall': gof_recall,
            'LOF Recall': lof_recall,
            'Neutral Recall': neutral_recall
        })
        print(f"processed: {file_name}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error! {file_name}: {str(e)}")

# Compile into df.
merged_df = pd.DataFrame(data)

if merged_df.empty:
    print("Error!")
else:
    # Index by model number and order the rows by it before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    output_file = os.path.join(metrics_dir, "merged_recall_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")

Found 499 files.
processed: xgb_df85_report.csv
processed: xgb_df381_report.csv
processed: xgb_df295_report.csv
processed: xgb_df481_report.csv
processed: xgb_df208_report.csv
processed: xgb_df161_report.csv
processed: xgb_df156_report.csv
processed: xgb_df449_report.csv
processed: xgb_df18_report.csv
processed: xgb_df103_report.csv
processed: xgb_df349_report.csv
processed: xgb_df134_report.csv
processed: xgb_df289_report.csv
processed: xgb_df99_report.csv
processed: xgb_df182_report.csv
processed: xgb_df455_report.csv
processed: xgb_df337_report.csv
processed: xgb_df33_report.csv
processed: xgb_df300_report.csv
processed: xgb_df462_report.csv
processed: xgb_df223_report.csv
processed: xgb_df214_report.csv
processed: xgb_df66_report.csv
processed: xgb_df355_report.csv
processed: xgb_df128_report.csv
processed: xgb_df437_report.csv
processed: xgb_df400_report.csv
processed: xgb_df362_report.csv
processed: xgb_df51_report.csv
processed: xgb_df241_report.csv
processed: xgb_df276_report.c

In [13]:
# Resolve the "metrics" folder under the working directory and its "report" subfolder.
current_dir = os.getcwd()
metrics_dir = os.path.join(current_dir, "metrics")
report_dir = os.path.join(metrics_dir, "report")

# Gather the report files that match the expected naming scheme.
report_glob = os.path.join(report_dir, "xgb_df*_report.csv")
csv_files = glob.glob(report_glob)
print(f"Found {len(csv_files)} files.")

# Start with an empty collection of records.
data = list()

def extract_f1_score(content, class_name):
    """
    Extract F1 metrics.

    Searches ``content`` for the first occurrence of ``class_name`` followed
    by three whitespace-separated decimal numbers and returns the third one.
    This assumes the report lists the F1 score as the third number after the
    class label.

    Args:
        content: Text of a report file.
        class_name: Class label to look up (e.g. "GOF"); matched literally.

    Returns:
        The matched value as a float, or None when no match is found.
    """
    # A raw string keeps "\s" and "\d" as regex tokens instead of invalid
    # string escapes; re.escape stops regex metacharacters in the label from
    # altering the pattern.
    pattern = rf"{re.escape(class_name)}\s+\d+\.\d+\s+\d+\.\d+\s+(\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Iterate over each file.
for file in csv_files:
    file_name = os.path.basename(file)

    # Parse the model number from the file name only: matching against the
    # full path could pick up a "df<N>_report" fragment in a parent directory.
    number_match = re.search(r'df(\d+)_report', file_name)
    if number_match is None:
        print(f"Error! Could not parse a model number from: {file_name}")
        continue
    model_number = int(number_match.group(1))

    try:
        with open(file, 'r') as f:
            content = f.read()

        # Get F1 for each class.
        gof_f1 = extract_f1_score(content, "GOF")
        lof_f1 = extract_f1_score(content, "LOF")
        neutral_f1 = extract_f1_score(content, "Neutral")

        # Add to list.
        data.append({
            'Model': model_number,
            'GOF F1': gof_f1,
            'LOF F1': lof_f1,
            'Neutral F1': neutral_f1
        })
        print(f"processed: {file_name}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error! {file_name}: {str(e)}")

# Compile into df.
merged_df = pd.DataFrame(data)

if merged_df.empty:
    print("Error!")
else:
    # Index by model number and order the rows by it before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    output_file = os.path.join(metrics_dir, "merged_f1_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")

Found 499 files.
processed: xgb_df85_report.csv
processed: xgb_df381_report.csv
processed: xgb_df295_report.csv
processed: xgb_df481_report.csv
processed: xgb_df208_report.csv
processed: xgb_df161_report.csv
processed: xgb_df156_report.csv
processed: xgb_df449_report.csv
processed: xgb_df18_report.csv
processed: xgb_df103_report.csv
processed: xgb_df349_report.csv
processed: xgb_df134_report.csv
processed: xgb_df289_report.csv
processed: xgb_df99_report.csv
processed: xgb_df182_report.csv
processed: xgb_df455_report.csv
processed: xgb_df337_report.csv
processed: xgb_df33_report.csv
processed: xgb_df300_report.csv
processed: xgb_df462_report.csv
processed: xgb_df223_report.csv
processed: xgb_df214_report.csv
processed: xgb_df66_report.csv
processed: xgb_df355_report.csv
processed: xgb_df128_report.csv
processed: xgb_df437_report.csv
processed: xgb_df400_report.csv
processed: xgb_df362_report.csv
processed: xgb_df51_report.csv
processed: xgb_df241_report.csv
processed: xgb_df276_report.c

In [14]:
# Resolve the "metrics" folder under the working directory and its "report" subfolder.
current_dir = os.getcwd()
metrics_dir = os.path.join(current_dir, "metrics")
report_dir = os.path.join(metrics_dir, "report")

# Gather the report files that match the expected naming scheme.
report_glob = os.path.join(report_dir, "xgb_df*_report.csv")
csv_files = glob.glob(report_glob)
print(f"Found {len(csv_files)} files.")

# Start with an empty collection of records.
data = list()

def extract_mcc(content, class_name):
    """
    Extract MCC metrics.

    Searches ``content`` for the first occurrence of ``"<class_name>: "``
    followed by an optionally negative decimal number and returns that number.

    Args:
        content: Text of a report file.
        class_name: Class label to look up (e.g. "GOF"); matched literally.

    Returns:
        The matched value as a float, or None when no match is found.
    """
    # A raw string keeps "\d" as a regex token instead of an invalid string
    # escape; re.escape stops regex metacharacters in the label from altering
    # the pattern.
    pattern = rf"{re.escape(class_name)}: (-?\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Iterate over each file.
for file in csv_files:
    file_name = os.path.basename(file)

    # Parse the model number from the file name only: matching against the
    # full path could pick up a "df<N>_report" fragment in a parent directory.
    number_match = re.search(r'df(\d+)_report', file_name)
    if number_match is None:
        print(f"Error! Could not parse a model number from: {file_name}")
        continue
    model_number = int(number_match.group(1))

    try:
        with open(file, 'r') as f:
            content = f.read()

        # Get MCC for each class.
        neutral_mcc = extract_mcc(content, "Neutral")
        gof_mcc = extract_mcc(content, "GOF")
        lof_mcc = extract_mcc(content, "LOF")

        # Add to list.
        data.append({
            'Model': model_number,
            'Neutral MCC': neutral_mcc,
            'GOF MCC': gof_mcc,
            'LOF MCC': lof_mcc
        })
        print(f"processed: {file_name}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error! {file_name}: {str(e)}")

# Compile into df.
merged_df = pd.DataFrame(data)

if merged_df.empty:
    print("Error!")
else:
    # Index by model number and order the rows by it before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    output_file = os.path.join(metrics_dir, "merged_mcc_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")

Found 499 files.
processed: xgb_df85_report.csv
processed: xgb_df381_report.csv
processed: xgb_df295_report.csv
processed: xgb_df481_report.csv
processed: xgb_df208_report.csv
processed: xgb_df161_report.csv
processed: xgb_df156_report.csv
processed: xgb_df449_report.csv
processed: xgb_df18_report.csv
processed: xgb_df103_report.csv
processed: xgb_df349_report.csv
processed: xgb_df134_report.csv
processed: xgb_df289_report.csv
processed: xgb_df99_report.csv
processed: xgb_df182_report.csv
processed: xgb_df455_report.csv
processed: xgb_df337_report.csv
processed: xgb_df33_report.csv
processed: xgb_df300_report.csv
processed: xgb_df462_report.csv
processed: xgb_df223_report.csv
processed: xgb_df214_report.csv
processed: xgb_df66_report.csv
processed: xgb_df355_report.csv
processed: xgb_df128_report.csv
processed: xgb_df437_report.csv
processed: xgb_df400_report.csv
processed: xgb_df362_report.csv
processed: xgb_df51_report.csv
processed: xgb_df241_report.csv
processed: xgb_df276_report.c