In [None]:
import re
import pandas as pd

# Function to process the data and extract F1 scores and averages
def extract_f1_scores(file_path):
    """Collect per-class and averaged F1 scores from a fold-by-fold report log.

    The log is scanned line by line. A line beginning with ``FOLD`` advances
    the fold counter and closes any open report; a line containing
    ``report:`` opens the report. Only lines seen while a report is open are
    examined for class rows (labels ``a``-``d``) and for the ``macro avg`` /
    ``weighted avg`` rows.

    Args:
        file_path: Path of the text log to read.

    Returns:
        A tuple ``(df_classes, df_averages)`` of pandas DataFrames:
        ``df_classes`` has columns ``Fold``, ``Class``, ``F1-Score``;
        ``df_averages`` has columns ``Fold``, ``Metric``, ``Value`` where
        ``Metric`` is ``f1_macro`` or ``f1_weighted``.
    """
    # Row for one class: label, precision, recall, F1 (captured), support
    class_row = re.compile(r"\s*([abcd])\s+\d+\.\d+\s+\d+\.\d+\s+([\d.]+)\s+\d+")
    # Averaged rows, checked in this order; only the F1 column is captured
    average_rows = (
        ('f1_macro', re.compile(r"macro avg\s+\d+\.\d+\s+\d+\.\d+\s+([\d.]+)")),
        ('f1_weighted', re.compile(r"weighted avg\s+\d+\.\d+\s+\d+\.\d+\s+([\d.]+)")),
    )

    per_class = []
    per_fold = []
    current_fold = 0
    in_report = False

    with open(file_path, 'r') as handle:
        for raw in handle:
            text = raw.strip()

            # A new fold always closes the previous report
            if text.startswith("FOLD"):
                current_fold += 1
                in_report = False

            # Deliberately not an elif: the same line may also open the report
            if "report:" in text:
                in_report = True

            if not in_report:
                continue

            fold_name = f'Fold {current_fold}'

            hit = class_row.match(text)
            if hit:
                per_class.append([fold_name, hit.group(1), float(hit.group(2))])

            for metric, pattern in average_rows:
                found = pattern.search(text)
                if found:
                    per_fold.append([fold_name, metric, float(found.group(1))])

    return (
        pd.DataFrame(per_class, columns=["Fold", "Class", "F1-Score"]),
        pd.DataFrame(per_fold, columns=["Fold", "Metric", "Value"]),
    )

# Log file holding the per-fold reports to parse
file_path = "erzaehler_transferzeit_next_folds.txt"

# Parse the log into per-class scores and per-fold averages, then show both
df_classes, df_averages = extract_f1_scores(file_path)

print("F1 Scores DataFrame:")
print(df_classes)

print("\nF1 Macro and Weighted Averages DataFrame:")
print(df_averages)

# Mean and standard deviation of the F1 score for every class across folds
summary_classes_df = (
    df_classes
    .groupby('Class')['F1-Score']
    .agg(['mean', 'std'])
    .reset_index()
    .rename(columns={'mean': 'Average F1-Score', 'std': 'Standard Deviation'})
)

# Same statistics for the macro / weighted averages
summary_averages_df = (
    df_averages
    .groupby('Metric')['Value']
    .agg(['mean', 'std'])
    .reset_index()
    .rename(columns={'mean': 'Average Value', 'std': 'Standard Deviation'})
)

print("\nSummary DataFrame for Classes:")
print(summary_classes_df)

print("\nSummary DataFrame for Averages:")
print(summary_averages_df)

# Export the raw tables and their summaries (index column omitted)
df_classes.to_csv("f1_scores_by_fold.csv", index=False)
df_averages.to_csv("f1_averages_by_fold.csv", index=False)
summary_classes_df.to_csv("f1_scores_summary_classes.csv", index=False)
summary_averages_df.to_csv("f1_averages_summary.csv", index=False)

F1 Scores DataFrame:
       Fold Class  F1-Score
0    Fold 1     a    0.1818
1    Fold 1     b    0.8167
2    Fold 1     c    0.5714
3    Fold 1     d    0.3871
4    Fold 2     a    0.0000
5    Fold 2     b    0.9104
6    Fold 2     c    0.4615
7    Fold 2     d    0.7500
8    Fold 3     a    0.6000
9    Fold 3     b    0.8947
10   Fold 3     c    0.4706
11   Fold 3     d    0.5714
12   Fold 4     a    0.2500
13   Fold 4     b    0.7692
14   Fold 4     c    0.4762
15   Fold 4     d    0.6512
16   Fold 5     a    0.4444
17   Fold 5     b    0.8571
18   Fold 5     c    0.1250
19   Fold 5     d    0.6875
20   Fold 6     a    0.6667
21   Fold 6     b    0.8640
22   Fold 6     c    0.2500
23   Fold 6     d    0.3846
24   Fold 7     a    0.0000
25   Fold 7     b    0.8254
26   Fold 7     c    0.3077
27   Fold 7     d    0.5882
28   Fold 8     a    0.6154
29   Fold 8     b    0.8644
30   Fold 8     c    0.1538
31   Fold 8     d    0.5625
32   Fold 9     a    0.2857
33   Fold 9     b    0.9091

In [None]:
import re
import pandas as pd

# Read both fold logs fully into memory. The context managers close each
# file handle as soon as its content has been read, instead of leaving the
# handles open until garbage collection.
with open('erzaehler_vanilla_folds.txt', 'r') as vanilla_file:
    data = vanilla_file.read()
with open('erzaehler_down_folds.txt', 'r') as down_file:
    data1 = down_file.read()

In [None]:
import re
import pandas as pd

# Function to process the data and extract F1 scores
def extract_f1_scores(file_path):
    """Extract the per-class F1 scores for ``fic`` / ``non-fic`` from a fold log.

    The log is read line by line. A line beginning with ``FOLD`` advances the
    fold counter and closes any open report; a line containing ``report:``
    opens the report. Class rows are only recorded while a report is open.

    Args:
        file_path: Path of the text log to read.

    Returns:
        pandas.DataFrame with columns ``Fold``, ``Class`` and ``F1-Score``,
        one row per matched class line.
    """
    # Initialize variables
    results = []
    fold_number = 0
    inside_report = False

    # Row layout: label, precision, recall, F1 (captured), support.
    # The labels must be an alternation group "(fic|non-fic)". Square brackets
    # would define a character class instead, and "n-f" inside a class is an
    # invalid range that makes re.compile raise.
    label_pattern = re.compile(r"\s*(fic|non-fic)\s+\d+\.\d+\s+\d+\.\d+\s+([\d.]+)\s+\d+")

    # Open the file and read line by line
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            # Check for the start of a new fold
            if line.startswith("FOLD"):
                fold_number += 1
                inside_report = False

            # Check for the start of the report section
            if "report:" in line:
                inside_report = True

            # If inside the report section, match the label and F1 score
            if inside_report:
                match = label_pattern.match(line)
                if match:
                    label = match.group(1)
                    f1_score = float(match.group(2))
                    results.append([f'Fold {fold_number}', label, f1_score])

    # Convert the results to a DataFrame
    df = pd.DataFrame(results, columns=["Fold", "Class", "F1-Score"])
    return df

# Log file with the per-fold reports for the fic / non-fic labels
file_path = "erzaehler_fic_folds.txt"

# Parse the log into a per-fold, per-class F1 table and print it
df = extract_f1_scores(file_path)
print(df)

# Export the table without the index column.
# NOTE(review): "f1_scores_by_fold.csv" is also written by an earlier cell of
# this notebook, so whichever cell runs last overwrites the other's export.
df.to_csv("f1_scores_by_fold.csv", index=False)


error: bad character range n-f at position 11

In [None]:

# Per-class mean and standard deviation of the F1 score across folds.
# Relies on `df` having been produced by an earlier cell.
summary_df = (
    df
    .groupby('Class')['F1-Score']
    .agg(['mean', 'std'])
    .reset_index()
    .rename(columns={'mean': 'Average F1-Score', 'std': 'Standard Deviation'})
)

print("\nSummary DataFrame:")
print(summary_df)

# Export the summary without the index column
summary_df.to_csv("f1_scores_summary.csv", index=False)


Summary DataFrame:
Empty DataFrame
Columns: [Class, Average F1-Score, Standard Deviation]
Index: []


In [None]:
import re
import pandas as pd

# Function to process the data and extract precision, recall, and F1 scores
def extract_scores(file_path):
    """Read precision, recall and F1 for ``fic`` / ``non-fic`` from a fold log.

    A line beginning with ``FOLD`` advances the fold counter and closes any
    open report; a line containing ``report:`` opens the report. While a
    report is open, every line matching a class row is recorded.

    Args:
        file_path: Path of the text log to read.

    Returns:
        pandas.DataFrame with columns ``Fold``, ``Class``, ``Precision``,
        ``Recall`` and ``F1-Score``, one row per matched class line.
    """
    # Class row: label, precision, recall, F1 (all captured), then support
    row_re = re.compile(r"\s*(fic|non-fic)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+\d+")

    rows = []
    fold_idx = 0
    report_open = False

    with open(file_path, 'r') as log:
        for text in map(str.strip, log):
            # A new fold always closes the previous report
            if text.startswith("FOLD"):
                fold_idx += 1
                report_open = False

            # Deliberately not an elif: the same line may also open the report
            if "report:" in text:
                report_open = True

            if not report_open:
                continue

            hit = row_re.match(text)
            if hit is None:
                continue

            # First group is the label, the remaining three are the scores
            label, *numbers = hit.groups()
            rows.append([f'Fold {fold_idx}', label, *map(float, numbers)])

    return pd.DataFrame(rows, columns=["Fold", "Class", "Precision", "Recall", "F1-Score"])

# Log file with the per-fold reports for the fic / non-fic labels
file_path = "erzaehler_fic_folds.txt"

# Parse the log into a per-fold table of precision / recall / F1 and print it
df = extract_scores(file_path)
print(df)

# Export the per-fold table without the index column
df.to_csv("scores_by_fold.csv", index=False)

# Mean and standard deviation of every metric, per class, across folds
summary_df = (
    df
    .groupby('Class')
    .agg(
        Average_Precision=pd.NamedAgg(column='Precision', aggfunc='mean'),
        StdDev_Precision=pd.NamedAgg(column='Precision', aggfunc='std'),
        Average_Recall=pd.NamedAgg(column='Recall', aggfunc='mean'),
        StdDev_Recall=pd.NamedAgg(column='Recall', aggfunc='std'),
        Average_F1_Score=pd.NamedAgg(column='F1-Score', aggfunc='mean'),
        StdDev_F1_Score=pd.NamedAgg(column='F1-Score', aggfunc='std'),
    )
    .reset_index()
)

print("\nSummary DataFrame:")
print(summary_df)

# Export the summary without the index column
summary_df.to_csv("scores_summary.csv", index=False)


       Fold    Class  Precision  Recall  F1-Score
0    Fold 1      fic     0.6087  0.5833    0.5957
1    Fold 1  non-fic     0.8462  0.8594    0.8527
2    Fold 2      fic     0.6250  0.8333    0.7143
3    Fold 2  non-fic     0.9531  0.8714    0.9104
4    Fold 3      fic     0.7778  0.5000    0.6087
5    Fold 3  non-fic     0.8000  0.9333    0.8615
6    Fold 4      fic     0.7667  0.6571    0.7077
7    Fold 4  non-fic     0.7931  0.8679    0.8288
8    Fold 5      fic     0.6562  0.8400    0.7368
9    Fold 5  non-fic     0.9286  0.8254    0.8739
10   Fold 6      fic     0.6667  0.6667    0.6667
11   Fold 6  non-fic     0.8955  0.8955    0.8955
12   Fold 7      fic     0.7500  0.4000    0.5217
13   Fold 7  non-fic     0.7500  0.9310    0.8308
14   Fold 8      fic     0.8800  0.8462    0.8627
15   Fold 8  non-fic     0.9365  0.9516    0.9440
16   Fold 9      fic     0.9048  0.6552    0.7600
17   Fold 9  non-fic     0.8507  0.9661    0.9048
18  Fold 10      fic     0.8333  0.5357    0.6522


In [None]:
import re
import pandas as pd

# Function to process the data and extract precision, recall, and F1 scores
def extract_scores(file_path):
    """Read class scores and the macro-averaged F1 from a fold-by-fold log.

    A line beginning with ``FOLD`` advances the fold counter and closes any
    open report; a line containing ``report:`` opens the report. While a
    report is open, ``fic`` / ``non-fic`` rows are stored with precision,
    recall and F1, and ``macro avg`` rows are stored with their F1 only
    (precision and recall are left as ``None``).

    Args:
        file_path: Path of the text log to read.

    Returns:
        pandas.DataFrame with columns ``Fold``, ``Class``, ``Precision``,
        ``Recall`` and ``F1-Score``.
    """
    # Both row kinds share the layout: name, precision, recall, F1, support
    class_line = re.compile(r"\s*(fic|non-fic)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+\d+")
    macro_line = re.compile(r"\s*macro avg\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+\d+")

    collected = []
    fold_count = 0
    reading_report = False

    with open(file_path, 'r') as stream:
        for raw_line in stream:
            entry = raw_line.strip()

            # A new fold always closes the previous report
            if entry.startswith("FOLD"):
                fold_count += 1
                reading_report = False

            # Deliberately not an elif: the same line may also open the report
            if "report:" in entry:
                reading_report = True

            if not reading_report:
                continue

            fold_label = f'Fold {fold_count}'

            # Class rows take precedence over the macro-average row
            class_hit = class_line.match(entry)
            if class_hit:
                name, prec, rec, f1 = class_hit.groups()
                collected.append([fold_label, name, float(prec), float(rec), float(f1)])
                continue

            macro_hit = macro_line.match(entry)
            if macro_hit:
                # Third capture group is the F1 column of the macro-average row
                collected.append([fold_label, 'macro avg', None, None, float(macro_hit.group(3))])

    return pd.DataFrame(collected, columns=["Fold", "Class", "Precision", "Recall", "F1-Score"])

# Path to the data file
file_path = "erzaehler_fic_folds.txt"

# Extract scores and display the DataFrame
df = extract_scores(file_path)
print(df)

# Save the results to a CSV file
df.to_csv("scores_by_fold.csv", index=False)

# 'macro avg' rows only carry an F1 value, so they are summarised separately.
# They are excluded from the per-class groupby; otherwise 'macro avg' would
# already appear there as its own group and end up in the summary twice.
is_macro = df['Class'] == 'macro avg'

# Calculate the average and standard deviation for each class and each metric
summary_df = df[~is_macro].groupby('Class').agg(
    Average_Precision=('Precision', 'mean'),
    StdDev_Precision=('Precision', 'std'),
    Average_Recall=('Recall', 'mean'),
    StdDev_Recall=('Recall', 'std'),
    Average_F1_Score=('F1-Score', 'mean'),
    StdDev_F1_Score=('F1-Score', 'std')
).reset_index()

# Mean and standard deviation of the macro-averaged F1 across folds
macro_df = df[is_macro]
macro_summary = pd.Series({
    'Average_Macro_F1_Score': macro_df['F1-Score'].mean(),
    'StdDev_Macro_F1_Score': macro_df['F1-Score'].std(),
})

# Add macro avg as the last row of summary_df. DataFrame.append was removed
# in pandas 2.0, so the row is built as a one-row frame and concatenated.
# NaN (rather than None) keeps the precision/recall columns numeric.
macro_row = pd.DataFrame([{
    'Class': 'macro avg',
    'Average_Precision': float('nan'),
    'StdDev_Precision': float('nan'),
    'Average_Recall': float('nan'),
    'StdDev_Recall': float('nan'),
    'Average_F1_Score': macro_summary['Average_Macro_F1_Score'],
    'StdDev_F1_Score': macro_summary['StdDev_Macro_F1_Score'],
}])
summary_df = pd.concat([summary_df, macro_row], ignore_index=True)

print("\nSummary DataFrame:")
print(summary_df)
summary_df.to_csv("scores_summary.csv", index=False)



       Fold      Class  Precision  Recall  F1-Score
0    Fold 1        fic     0.6087  0.5833    0.5957
1    Fold 1    non-fic     0.8462  0.8594    0.8527
2    Fold 1  macro avg        NaN     NaN    0.7242
3    Fold 2        fic     0.6250  0.8333    0.7143
4    Fold 2    non-fic     0.9531  0.8714    0.9104
5    Fold 2  macro avg        NaN     NaN    0.8124
6    Fold 3        fic     0.7778  0.5000    0.6087
7    Fold 3    non-fic     0.8000  0.9333    0.8615
8    Fold 3  macro avg        NaN     NaN    0.7351
9    Fold 4        fic     0.7667  0.6571    0.7077
10   Fold 4    non-fic     0.7931  0.8679    0.8288
11   Fold 4  macro avg        NaN     NaN    0.7683
12   Fold 5        fic     0.6562  0.8400    0.7368
13   Fold 5    non-fic     0.9286  0.8254    0.8739
14   Fold 5  macro avg        NaN     NaN    0.8054
15   Fold 6        fic     0.6667  0.6667    0.6667
16   Fold 6    non-fic     0.8955  0.8955    0.8955
17   Fold 6  macro avg        NaN     NaN    0.7811
18   Fold 7 

AttributeError: 'DataFrame' object has no attribute 'append'