In [None]:
# Switch the working directory so the relative 'results' folder read further down resolves.
# NOTE(review): PATH looks like a placeholder - replace it with the project folder before running.
%cd PATH

In [None]:
!pip install unidecode

In [None]:
import pandas as pd
import os
import re
from unidecode import unidecode

# Function to map classifications
def mapear_classificacao(texto):
    """Map a raw model prediction to a binary label.

    Parameters
    ----------
    texto : str or number
        Raw prediction. Numeric 0/1 values pass through; strings are
        accent-stripped, lower-cased and searched for the whole word "sim".

    Returns
    -------
    float
        1.0 when the prediction is numeric 1 or contains the word "sim";
        0.0 in every other case (numeric 0, "nao", any other text, or a
        missing / non-string value such as NaN or None).
    """
    # Already-numeric labels pass through, normalised to float so that every
    # prediction column ends up with the same dtype.
    if texto in [0.0, 1.0]:
        return float(texto)

    # Empty CSV cells arrive as float NaN and unidecode() only accepts strings,
    # so anything non-textual is treated like an unrecognised answer.
    if not isinstance(texto, str):
        return 0.0

    texto = unidecode(texto).lower()  # Remove accents and convert to lowercase

    # Only an explicit "sim" counts as positive; "nao" and any other text are negative.
    return 1.0 if re.search(r'\bsim\b', texto) else 0.0

# Explicit choice of the 4 result files (one CSV per model / prompt)
selected_files = [
    'Chatgpt_35turbo_datasetName_prompt_2_zeroshot.csv',
    'Maritaca_datasetName_zeroshot_prompt_1.csv',
    'Maritaca_datasetName_zeroshot_prompt_2.csv',
    'Bertimbau_test_datasetName.csv'
]

# File stem (file name without ".csv") -> display name used as the column header in final_df
model_name_mapping = {
    'Chatgpt_35turbo_datasetName_prompt_2_zeroshot': 'ChatGPT 3.5 Turbo Zero-Shot - Prompt 2',
    'Maritaca_datasetName_zeroshot_prompt_1': 'MariTalk (Sabiá-65B) Zero-shot - Prompt 1',
    'Maritaca_datasetName_zeroshot_prompt_2': 'MariTalk (Sabiá-65B) Zero-shot - Prompt 2',
    'Bertimbau_test_datasetName': 'BERTimbau Base'
}

# Directory holding the CSVs, relative to the current working directory
path = 'results'

# Load the first CSV; it supplies the shared 'text' and 'Toxic' columns
first_file = os.path.join(path, selected_files[0])
df = pd.read_csv(first_file)
# .copy() makes final_df an independent frame, so adding columns below does not
# write into a slice of df (which raises pandas' SettingWithCopyWarning).
final_df = df[['text', 'Toxic']].copy()

# Add the predictions column from the first file to the final DataFrame
first_column_name = model_name_mapping[os.path.basename(os.path.splitext(selected_files[0])[0])]  # Maps file name to custom name
df['predictions'] = df['predictions'].apply(mapear_classificacao)
final_df[first_column_name] = df['predictions']

# Iterate over the other selected CSV files
for file in selected_files[1:]:
    file_path = os.path.join(path, file)
    temp_df = pd.read_csv(file_path)

    # Prediction columns are joined by row index, so a file with a different
    # number of rows would silently produce NaN (or drop rows). Fail loudly instead.
    # NOTE(review): rows are also assumed to be in the same order in every file - confirm.
    if len(temp_df) != len(final_df):
        raise ValueError(
            f"{file_path} has {len(temp_df)} rows but {first_file} has {len(final_df)}; "
            "the prediction files must describe the same instances"
        )

    # Map the predictions column
    temp_df['predictions'] = temp_df['predictions'].apply(mapear_classificacao)

    # Name the new column after the model's display name
    column_name = model_name_mapping[os.path.basename(os.path.splitext(file)[0])]  # Maps file name to custom name
    final_df[column_name] = temp_df['predictions']

#print(final_df.head())


In [None]:
!pip install mlxtend


In [None]:
import pandas as pd
import os
import re
from unidecode import unidecode
from sklearn.metrics import classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

# Mapping model names to a name of your choice
model_name_mapping = {
    'Chatgpt_35turbo_datasetName_prompt_2_zeroshot': 'ChatGPT 3.5 Turbo Zero-Shot - Prompt 2',
    'Maritaca_datasetName_zeroshot_prompt_1': 'MariTalk (Sabiá-65B) Zero-shot - Prompt 1',
    'Maritaca_datasetName_zeroshot_prompt_2': 'MariTalk (Sabiá-65B) Zero-shot - Prompt 2',
    'Bertimbau_test_datasetName': 'BERTimbau Base'
}

# For each model, print the classification report, plot the confusion matrix
# and print the false positive / false negative rates
for col in final_df.columns:
    # Every column other than the text and the gold label holds one model's predictions
    if col in ['text', 'Toxic']:
        continue

    # Get custom model name
    modelo_nome = model_name_mapping.get(col, col)  # If you can't find the mapped name, use the original name
    print(f"Modelo: {modelo_nome}")

    # Classification report
    print(classification_report(final_df['Toxic'], final_df[col]))

    # Confusion matrix. Fixing the label order guarantees a 2x2 matrix
    # (rows = true 0/1, columns = predicted 0/1) even when one class never
    # occurs, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(final_df['Toxic'], final_df[col], labels=[0.0, 1.0])

    # Using mlxtend to plot the confusion matrix
    fig, ax = plot_confusion_matrix(conf_mat=cm,
                                    show_absolute=True,
                                    show_normed=True,
                                    colorbar=False,
                                    figsize=(10,7),
                                    cmap="Greys")
    ax.set_title(f'{modelo_nome}')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    plt.show()

    # False positive and false negative rates
    tn, fp, fn, tp = cm.ravel()
    # A rate is undefined (NaN) when its class has no true instances
    fpr = fp / (fp + tn) if (fp + tn) else float('nan')  # False positive rate
    fnr = fn / (fn + tp) if (fn + tp) else float('nan')  # False negative rate

    print(f"Taxa de Falso Positivo (FPR): {fpr:.2f}")
    print(f"Taxa de Falso Negativo (FNR): {fnr:.2f}\n")


In [None]:
import pandas as pd
import os
import re
from unidecode import unidecode
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Mapping the names of the model for a name of your choice
model_name_mapping = {
    'Chatgpt_35turbo_datasetName_prompt_2_zeroshot': 'ChatGPT 3.5 Turbo Zero-Shot - Prompt 2',
    'Maritaca_datasetName_zeroshot_prompt_1': 'MariTalk (Sabiá-65B) Zero-shot - Prompt 1',
    'Maritaca_datasetName_zeroshot_prompt_2': 'MariTalk (Sabiá-65B) Zero-shot - Prompt 2',
    'Bertimbau_test_datasetName': 'BERTimbau Base'
}

# List for storing metrics of each model
data = []

# classification_report names its per-class entries after the label values
# ('0' vs '0.0'). Casting to float and fixing the label list guarantees the
# '0.0' / '1.0' keys read below, whatever dtype the CSV columns were parsed as.
y_true = final_df['Toxic'].astype(float)

# For each model, calculate the classification report
for col in final_df.columns:
    # Every column other than the text and the gold label holds one model's predictions
    if col in ['text', 'Toxic']:
        continue

    # Obtain the personalized name of the model
    modelo_nome = model_name_mapping.get(col, col)  # If you don't find the name mapped, use the original name
    print(f"Modelo: {modelo_nome}")

    # Classification report as a nested dict: {class label: {metric: value}}
    report = classification_report(y_true, final_df[col].astype(float),
                                   labels=[0.0, 1.0], output_dict=True)
    precision_0, recall_0, f1_0 = report['0.0']['precision'], report['0.0']['recall'], report['0.0']['f1-score']
    precision_1, recall_1, f1_1 = report['1.0']['precision'], report['1.0']['recall'], report['1.0']['f1-score']

    # Add metrics to the list
    data.append([modelo_nome, precision_0, recall_0, f1_0, precision_1, recall_1, f1_1])

# Convert List to Dataframe (one row per model)
df_metrics = pd.DataFrame(data, columns=['Modelo', 'Precision_NonToxic', 'Recall_NonToxic', 'F1_NonToxic', 'Precision_Toxic', 'Recall_Toxic', 'F1_Toxic'])


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Function to create radar/spider chart
def plot_spider_chart(df, title):
    """Draw one radar (spider) chart that overlays every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per model. The 'Modelo' column supplies the legend label;
        every other column becomes one axis of the chart. The radial axis
        is limited to [0, 1].
    title : str
        Title placed above the chart.

    The chart is drawn on the current matplotlib figure; creating the figure
    and calling ``plt.show()`` is left to the caller.
    """
    # Axes are taken by name (everything except 'Modelo') so that they always
    # match the values plotted below, wherever 'Modelo' sits in the frame.
    categories = [column for column in df.columns if column != 'Modelo']
    N = len(categories)

    # Angles for each axis; the first angle is repeated to close the polygon
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]

    # Initialize the chart
    ax = plt.subplot(111, polar=True)

    # First axis at the top, remaining axes laid out clockwise
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)

    # Labels for each axis
    plt.xticks(angles[:-1], categories)

    # Radial tick labels
    ax.set_rlabel_position(0)
    plt.yticks([0.2, 0.4, 0.6, 0.8], ["0.2", "0.4", "0.6", "0.8"], color="grey", size=7)
    plt.ylim(0,1)

    # Colors for each model (cycled when there are more rows than colors)
    colors = ['b', 'g', 'r', 'y', 'c', 'm', 'k', 'orange']

    # Plot metrics for each model. The colour is chosen by row position, not
    # by index label, so frames with a filtered or non-integer index work too.
    for position, (_, row) in enumerate(df.iterrows()):
        color = colors[position % len(colors)]
        values = row[categories].tolist()
        values += values[:1]  # repeat the first value to close the polygon
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=row['Modelo'], color=color)
        ax.fill(angles, values, color=color, alpha=0.1)

    # Legend
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    # Title
    plt.title(title, size=11, color='blue', y=1.1)

# Overview: every model of df_metrics overlaid on one radar chart (no title)
overview_size = (10, 8)
plt.figure(figsize=overview_size)
plot_spider_chart(df_metrics, title="")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Function to create radar/spider chart
def plot_spider_chart(df, title):
    """Draw a radar (spider) chart for the first row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first row is plotted. Every column after the first one
        becomes an axis; the 'Modelo' entry of the row is left out of the
        plotted values.
    title : str
        Title placed above the chart.

    The chart is drawn on the current matplotlib figure. Note that this
    definition replaces the multi-model ``plot_spider_chart`` defined earlier
    in the notebook.
    """
    # One axis per column after the first
    axis_labels = list(df)[1:]
    axis_count = len(axis_labels)

    # Evenly spaced angles; the first one is repeated to close the polygon
    theta = [k / float(axis_count) * 2 * np.pi for k in range(axis_count)]
    theta = theta + theta[:1]

    # Polar axes with the first axis at the top, going clockwise
    polar_ax = plt.subplot(111, polar=True)
    polar_ax.set_theta_offset(np.pi / 2)
    polar_ax.set_theta_direction(-1)

    # Axis labels
    plt.xticks(theta[:-1], axis_labels)

    # Radial ticks and range
    polar_ax.set_rlabel_position(0)
    radial_ticks = [0.2, 0.4, 0.6, 0.8]
    plt.yticks(radial_ticks, ["0.2", "0.4", "0.6", "0.8"], color="grey", size=7)
    plt.ylim(0, 1)

    # Outline and light fill for the single model
    first_row = df.iloc[0]
    polygon = first_row.drop('Modelo').values.flatten().tolist()
    polygon = polygon + polygon[:1]
    polar_ax.plot(theta, polygon, linewidth=2, linestyle='solid')
    polar_ax.fill(theta, polygon, 'b', alpha=0.1)

    # Title
    plt.title(title, size=11, color='blue', y=1.1)

# One figure per model: each row of df_metrics is turned back into a
# single-row frame and titled with the model's name
for row_label, metrics_row in df_metrics.iterrows():
    plt.figure(figsize=(8, 6))
    single_model_df = pd.DataFrame(metrics_row).T
    plot_spider_chart(single_model_df, metrics_row['Modelo'])
    plt.show()



## Instances that only BERTimbau classified correctly

In [None]:

# Configure so that the dataframe columns are not truncated when displayed
pd.set_option('display.max_colwidth', None)

# final_df names each prediction column after the model's display name and
# keeps the gold labels in 'Toxic', so the column is looked up through
# model_name_mapping rather than by file stem.
bertimbau_col = model_name_mapping['Bertimbau_test_datasetName']

# Mask for instances in which BERTimbau got it right
bertimbau_acertos = final_df[bertimbau_col] == final_df['Toxic']

# Masks for instances in which each of the other models made a mistake
outros_modelos_erros = [final_df[col] != final_df['Toxic'] for col in final_df.columns if col not in ['text', 'Toxic', bertimbau_col]]

# Combine all masks: BERTimbau right AND every other model wrong
mascara_final = bertimbau_acertos
for mascara in outros_modelos_erros:
    mascara_final = mascara_final & mascara

# Filter the dataframe using the final mask
resultados_exclusivos_bertimbau = final_df[mascara_final]

print(resultados_exclusivos_bertimbau)


## Instances that only MariTalk zero-shot (Prompt 2) classified correctly

In [None]:
# Configure so that the dataframe columns are not truncated when displayed
pd.set_option('display.max_colwidth', None)

# final_df names each prediction column after the model's display name and
# keeps the gold labels in 'Toxic', so the column is looked up through
# model_name_mapping rather than by file stem.
maritaca_col = model_name_mapping['Maritaca_datasetName_zeroshot_prompt_2']

# Mask for instances in which MariTalk zero-shot (prompt 2) got it right
maritaca_acertos = final_df[maritaca_col] == final_df['Toxic']

# Masks for instances in which each of the other models made a mistake
outros_modelos_erros = [final_df[col] != final_df['Toxic'] for col in final_df.columns if col not in ['text', 'Toxic', maritaca_col]]

# Combine all masks: MariTalk right AND every other model wrong
mascara_final = maritaca_acertos
for mascara in outros_modelos_erros:
    mascara_final = mascara_final & mascara

# Filter the dataframe using the final mask
resultados_exclusivos_maritaca = final_df[mascara_final]

print(resultados_exclusivos_maritaca)
