In [1]:
import pandas as pd
import numpy as np
import glob
import os, re, sys

### This processes the metrics collected by the Test script.

In [2]:
""" This loads all 499 CSV metrics files into pandas. """

# Find the metrics directory.
metrics_dir = '../LightGBM/metrics/individual'

# Identify all relevant CSV files. Sorting makes the processing order
# deterministic, since glob returns files in arbitrary order.
pattern = os.path.join(metrics_dir, "lgbm_df*_metrics.csv")
csv_files = sorted(glob.glob(pattern))
print(f"Found {len(csv_files)} files:")
for file in csv_files:
    print(f"  - {os.path.basename(file)}")

# Sequentially read all CSV files and collect the frames in a list.
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
    print(f"read: {os.path.basename(file)}")

# Merge and export ONCE, after every file has been read. Doing this inside
# the read loop would rebuild and rewrite the merged CSV for every input file.
if not dfs:
    # pd.concat raises on an empty list, so report the problem instead.
    print(f"No metrics files found matching: {pattern}")
else:
    # Concatenate.
    merged_df = pd.concat(dfs, ignore_index=True)

    # Group by the 'Model' column and average the rows of each model.
    grouped_df = merged_df.groupby('Model').mean().reset_index()

    # Export to CSV.
    output_file = "../LightGBM/metrics/merged_metrics.csv"
    grouped_df.to_csv(output_file, index=False)
    print(f"Merged saved as: {output_file}")

Found 499 files:
  - lgbm_df100_metrics.csv
  - lgbm_df101_metrics.csv
  - lgbm_df102_metrics.csv
  - lgbm_df103_metrics.csv
  - lgbm_df104_metrics.csv
  - lgbm_df105_metrics.csv
  - lgbm_df106_metrics.csv
  - lgbm_df107_metrics.csv
  - lgbm_df108_metrics.csv
  - lgbm_df109_metrics.csv
  - lgbm_df10_metrics.csv
  - lgbm_df110_metrics.csv
  - lgbm_df111_metrics.csv
  - lgbm_df112_metrics.csv
  - lgbm_df113_metrics.csv
  - lgbm_df114_metrics.csv
  - lgbm_df115_metrics.csv
  - lgbm_df116_metrics.csv
  - lgbm_df117_metrics.csv
  - lgbm_df118_metrics.csv
  - lgbm_df119_metrics.csv
  - lgbm_df11_metrics.csv
  - lgbm_df120_metrics.csv
  - lgbm_df121_metrics.csv
  - lgbm_df122_metrics.csv
  - lgbm_df123_metrics.csv
  - lgbm_df124_metrics.csv
  - lgbm_df125_metrics.csv
  - lgbm_df126_metrics.csv
  - lgbm_df127_metrics.csv
  - lgbm_df128_metrics.csv
  - lgbm_df129_metrics.csv
  - lgbm_df12_metrics.csv
  - lgbm_df130_metrics.csv
  - lgbm_df131_metrics.csv
  - lgbm_df132_metrics.csv
  - lgbm_df133

In [3]:
# Load the merged metrics table from disk; the sorting cells below read it as `merged`.
merged = pd.read_csv('../LightGBM/metrics/merged_metrics.csv')

### First, sort overall metrics.

In [4]:
# Order the models from lowest to highest accuracy.
sorted_merged = merged.sort_values(by='Accuracy')

# Write the ordered table to its own CSV file.
sorted_merged.to_csv(
    '../LightGBM/metrics/sorted_accuracy.csv',
    index=False,
)
print("Saved.")

Saved.


In [5]:
# Order the models from lowest to highest precision.
sorted_merged = merged.sort_values(by='Precision')

# Write the ordered table to its own CSV file.
sorted_merged.to_csv(
    '../LightGBM/metrics/sorted_precision.csv',
    index=False,
)
print("Saved.")

Saved.


In [6]:
# Show the column names of the merged metrics table.
print(merged.columns)

Index(['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score',
       'Overall MCC'],
      dtype='object')


In [7]:
# Order the models from lowest to highest recall.
sorted_merged = merged.sort_values(by='Recall')

# Write the ordered table to its own CSV file.
sorted_merged.to_csv(
    '../LightGBM/metrics/sorted_recall.csv',
    index=False,
)
print("Saved.")

Saved.


In [8]:
# Order the models from lowest to highest F1 score.
sorted_merged = merged.sort_values(by='F1 Score')

# Write the ordered table to its own CSV file.
sorted_merged.to_csv(
    '../LightGBM/metrics/sorted_f1.csv',
    index=False,
)
print("Saved.")

Saved.


In [9]:
# Order the models from lowest to highest ROC AUC score.
sorted_merged = merged.sort_values(by='ROC AUC Score')

# Write the ordered table to its own CSV file.
sorted_merged.to_csv(
    '../LightGBM/metrics/sorted_roc.csv',
    index=False,
)
print("Saved.")

Saved.


In [10]:
# Order the models from lowest to highest overall MCC.
sorted_merged = merged.sort_values(by='Overall MCC')

# Write the ordered table to its own CSV file.
sorted_merged.to_csv(
    '../LightGBM/metrics/sorted_mcc.csv',
    index=False,
)
print("Saved.")

Saved.


### Next, load and sort the class-wise metrics.

In [11]:
# Directory that holds the per-model report files.
metrics_dir = '../LightGBM/metrics/report'

# Collect every report file that follows the naming scheme.
csv_files = glob.glob(
    os.path.join(metrics_dir, "reportlgbm_df*_report.csv")
)
print(f"Found {len(csv_files)} report files.")

# Accumulator: one record per processed report.
data = list()

def extract_precision(content, class_name):
    """
    Extract PREC metrics.

    Searches `content` for the first occurrence of `class_name` followed by
    whitespace and a decimal number, and returns that number (the first
    numeric column after the class label).

    Parameters:
        content (str): Full text of a report file.
        class_name (str): Class label to look for; matched literally.

    Returns:
        float or None: The captured value, or None when nothing matches.
    """
    # Raw f-string: keeps \s and \d as regex escapes rather than invalid
    # string escapes. re.escape makes the label match literally even if it
    # contains regex metacharacters.
    pattern = rf"{re.escape(class_name)}\s+(\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Walk through every report and collect the per-class precision values.
for file in csv_files:
    # The model number is embedded in the file name as "df<N>_report".
    model_number = re.search(r'df(\d+)_report', file)
    if not model_number:
        print("Error!")
        continue
    model_number = int(model_number.group(1))

    try:
        with open(file, 'r') as handle:
            content = handle.read()

        # One record per model, holding the precision of each class.
        data.append({
            'Model': model_number,
            'GOF Precision': extract_precision(content, "GOF"),
            'LOF Precision': extract_precision(content, "LOF"),
            'Neutral Precision': extract_precision(content, "Neutral"),
        })
        print(f"processed: {os.path.basename(file)}")
    except Exception as err:
        print(f"Error! {os.path.basename(file)}: {str(err)}")

# Build the table from the collected records.
merged_df = pd.DataFrame(data)

if not merged_df.empty:
    # Index and order the rows by model number before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    output_file = os.path.join(metrics_dir, "merged_precision_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")
else:
    print("Error!")

Found 499 report files.
processed: reportlgbm_df100_report.csv
processed: reportlgbm_df101_report.csv
processed: reportlgbm_df102_report.csv
processed: reportlgbm_df103_report.csv
processed: reportlgbm_df104_report.csv
processed: reportlgbm_df105_report.csv
processed: reportlgbm_df106_report.csv
processed: reportlgbm_df107_report.csv
processed: reportlgbm_df108_report.csv
processed: reportlgbm_df109_report.csv
processed: reportlgbm_df10_report.csv
processed: reportlgbm_df110_report.csv
processed: reportlgbm_df111_report.csv
processed: reportlgbm_df112_report.csv
processed: reportlgbm_df113_report.csv
processed: reportlgbm_df114_report.csv
processed: reportlgbm_df115_report.csv
processed: reportlgbm_df116_report.csv
processed: reportlgbm_df117_report.csv
processed: reportlgbm_df118_report.csv
processed: reportlgbm_df119_report.csv
processed: reportlgbm_df11_report.csv
processed: reportlgbm_df120_report.csv
processed: reportlgbm_df121_report.csv
processed: reportlgbm_df122_report.csv
pro

In [12]:
# Directory that holds the per-model report files.
metrics_dir = '../LightGBM/metrics/report'

# Collect every report file that follows the naming scheme.
csv_files = glob.glob(
    os.path.join(metrics_dir, "reportlgbm_df*_report.csv")
)
print(f"Found {len(csv_files)} report files.")

# Accumulator: one record per processed report.
data = list()

def extract_recall(content, class_name):
    """
    Extract REC metrics.

    Searches `content` for `class_name` followed by two whitespace-separated
    decimal numbers and returns the second one (the second numeric column
    after the class label).

    Parameters:
        content (str): Full text of a report file.
        class_name (str): Class label to look for; matched literally.

    Returns:
        float or None: The captured value, or None when nothing matches.
    """
    # Raw f-string: keeps \s and \d as regex escapes rather than invalid
    # string escapes. re.escape makes the label match literally.
    pattern = rf"{re.escape(class_name)}\s+\d+\.\d+\s+(\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Walk through every report and collect the per-class recall values.
for file in csv_files:
    # The model number is embedded in the file name as "df<N>_report".
    model_number = re.search(r'df(\d+)_report', file)
    if not model_number:
        print("Error!")
        continue
    model_number = int(model_number.group(1))

    try:
        with open(file, 'r') as f:
            content = f.read()

        # Get REC for each class.
        gof_recall = extract_recall(content, "GOF")
        lof_recall = extract_recall(content, "LOF")
        neutral_recall = extract_recall(content, "Neutral")

        # Add to list.
        data.append({
            'Model': model_number,
            'GOF Recall': gof_recall,
            'LOF Recall': lof_recall,
            'Neutral Recall': neutral_recall
        })
        print(f"processed: {os.path.basename(file)}")
    except Exception as e:
        print(f"Error! {os.path.basename(file)}: {str(e)}")

# Compile into df.
merged_df = pd.DataFrame(data)

if merged_df.empty:
    print("Error!")
else:
    # Index and order the rows by model number before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    # Recall gets its own file: writing to "merged_precision_report.csv"
    # here would overwrite the precision report.
    output_file = os.path.join(metrics_dir, "merged_recall_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")

Found 499 report files.
processed: reportlgbm_df100_report.csv
processed: reportlgbm_df101_report.csv
processed: reportlgbm_df102_report.csv
processed: reportlgbm_df103_report.csv
processed: reportlgbm_df104_report.csv
processed: reportlgbm_df105_report.csv
processed: reportlgbm_df106_report.csv
processed: reportlgbm_df107_report.csv
processed: reportlgbm_df108_report.csv
processed: reportlgbm_df109_report.csv
processed: reportlgbm_df10_report.csv
processed: reportlgbm_df110_report.csv
processed: reportlgbm_df111_report.csv
processed: reportlgbm_df112_report.csv
processed: reportlgbm_df113_report.csv
processed: reportlgbm_df114_report.csv
processed: reportlgbm_df115_report.csv
processed: reportlgbm_df116_report.csv
processed: reportlgbm_df117_report.csv
processed: reportlgbm_df118_report.csv
processed: reportlgbm_df119_report.csv
processed: reportlgbm_df11_report.csv
processed: reportlgbm_df120_report.csv
processed: reportlgbm_df121_report.csv
processed: reportlgbm_df122_report.csv
pro

In [13]:
# Directory that holds the per-model report files.
metrics_dir = '../LightGBM/metrics/report'

# Collect every report file that follows the naming scheme.
csv_files = glob.glob(
    os.path.join(metrics_dir, "reportlgbm_df*_report.csv")
)
print(f"Found {len(csv_files)} report files.")

# Accumulator: one record per processed report.
data = list()

def extract_f1_score(content, class_name):
    """
    Extract F1 metrics.

    Searches `content` for `class_name` followed by three whitespace-separated
    decimal numbers and returns the third one (the third numeric column
    after the class label).

    Parameters:
        content (str): Full text of a report file.
        class_name (str): Class label to look for; matched literally.

    Returns:
        float or None: The captured value, or None when nothing matches.
    """
    # Raw f-string: keeps \s and \d as regex escapes rather than invalid
    # string escapes. re.escape makes the label match literally.
    pattern = rf"{re.escape(class_name)}\s+\d+\.\d+\s+\d+\.\d+\s+(\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Walk through every report and collect the per-class F1 scores.
for file in csv_files:
    # The model number is embedded in the file name as "df<N>_report".
    model_number = re.search(r'df(\d+)_report', file)
    if not model_number:
        print("Error!")
        continue
    model_number = int(model_number.group(1))

    try:
        with open(file, 'r') as f:
            content = f.read()

        # Get F1 for each class.
        gof_f1 = extract_f1_score(content, "GOF")
        lof_f1 = extract_f1_score(content, "LOF")
        neutral_f1 = extract_f1_score(content, "Neutral")

        # Add to list.
        data.append({
            'Model': model_number,
            'GOF F1': gof_f1,
            'LOF F1': lof_f1,
            'Neutral F1': neutral_f1
        })
        print(f"processed: {os.path.basename(file)}")
    except Exception as e:
        print(f"Error! {os.path.basename(file)}: {str(e)}")

# Compile into df.
merged_df = pd.DataFrame(data)

if merged_df.empty:
    print("Error!")
else:
    # Index and order the rows by model number before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    # F1 gets its own file: writing to "merged_precision_report.csv"
    # here would overwrite the precision report.
    output_file = os.path.join(metrics_dir, "merged_f1_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")

Found 499 report files.
processed: reportlgbm_df100_report.csv
processed: reportlgbm_df101_report.csv
processed: reportlgbm_df102_report.csv
processed: reportlgbm_df103_report.csv
processed: reportlgbm_df104_report.csv
processed: reportlgbm_df105_report.csv
processed: reportlgbm_df106_report.csv
processed: reportlgbm_df107_report.csv
processed: reportlgbm_df108_report.csv
processed: reportlgbm_df109_report.csv
processed: reportlgbm_df10_report.csv
processed: reportlgbm_df110_report.csv
processed: reportlgbm_df111_report.csv
processed: reportlgbm_df112_report.csv
processed: reportlgbm_df113_report.csv
processed: reportlgbm_df114_report.csv
processed: reportlgbm_df115_report.csv
processed: reportlgbm_df116_report.csv
processed: reportlgbm_df117_report.csv
processed: reportlgbm_df118_report.csv
processed: reportlgbm_df119_report.csv
processed: reportlgbm_df11_report.csv
processed: reportlgbm_df120_report.csv
processed: reportlgbm_df121_report.csv
processed: reportlgbm_df122_report.csv
pro

In [14]:
# Directory that holds the per-model report files.
metrics_dir = '../LightGBM/metrics/report'

# Collect every report file that follows the naming scheme.
csv_files = glob.glob(
    os.path.join(metrics_dir, "reportlgbm_df*_report.csv")
)
print(f"Found {len(csv_files)} report files.")

# Accumulator: one record per processed report.
data = list()

def extract_mcc(content, class_name):
    """
    Extract MCC metrics.

    Searches `content` for the text "<class_name>: " followed by an
    optionally negative decimal number and returns that number.

    Parameters:
        content (str): Full text of a report file.
        class_name (str): Class label to look for; matched literally.

    Returns:
        float or None: The captured value, or None when nothing matches.
    """
    # Raw f-string: keeps \d as a regex escape rather than an invalid
    # string escape. re.escape makes the label match literally.
    pattern = rf"{re.escape(class_name)}: (-?\d+\.\d+)"
    match = re.search(pattern, content)
    return float(match.group(1)) if match else None

# Walk through every report and collect the per-class MCC values.
for file in csv_files:
    # The model number is embedded in the file name as "df<N>_report".
    model_number = re.search(r'df(\d+)_report', file)
    if not model_number:
        print("Error!")
        continue
    model_number = int(model_number.group(1))

    try:
        with open(file, 'r') as f:
            content = f.read()

        # Get MCC for each class.
        neutral_mcc = extract_mcc(content, "Neutral")
        gof_mcc = extract_mcc(content, "GOF")
        lof_mcc = extract_mcc(content, "LOF")

        # Add to list.
        data.append({
            'Model': model_number,
            'Neutral MCC': neutral_mcc,
            'GOF MCC': gof_mcc,
            'LOF MCC': lof_mcc
        })
        print(f"processed: {os.path.basename(file)}")
    except Exception as e:
        print(f"Error! {os.path.basename(file)}: {str(e)}")

# Compile into df.
merged_df = pd.DataFrame(data)

if merged_df.empty:
    print("Error!")
else:
    # Index and order the rows by model number before exporting.
    merged_df = merged_df.set_index('Model').sort_index()
    # MCC gets its own file: writing to "merged_precision_report.csv"
    # here would overwrite the precision report.
    output_file = os.path.join(metrics_dir, "merged_mcc_report.csv")
    merged_df.to_csv(output_file)
    print(f"Merged saved as: {output_file}")

Found 499 report files.
processed: reportlgbm_df100_report.csv
processed: reportlgbm_df101_report.csv
processed: reportlgbm_df102_report.csv
processed: reportlgbm_df103_report.csv
processed: reportlgbm_df104_report.csv
processed: reportlgbm_df105_report.csv
processed: reportlgbm_df106_report.csv
processed: reportlgbm_df107_report.csv
processed: reportlgbm_df108_report.csv
processed: reportlgbm_df109_report.csv
processed: reportlgbm_df10_report.csv
processed: reportlgbm_df110_report.csv
processed: reportlgbm_df111_report.csv
processed: reportlgbm_df112_report.csv
processed: reportlgbm_df113_report.csv
processed: reportlgbm_df114_report.csv
processed: reportlgbm_df115_report.csv
processed: reportlgbm_df116_report.csv
processed: reportlgbm_df117_report.csv
processed: reportlgbm_df118_report.csv
processed: reportlgbm_df119_report.csv
processed: reportlgbm_df11_report.csv
processed: reportlgbm_df120_report.csv
processed: reportlgbm_df121_report.csv
processed: reportlgbm_df122_report.csv
pro