In [13]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Create the output directory if it doesn't exist
output_folder = 'visualized_results'
os.makedirs(output_folder, exist_ok=True)

# Path to the directory containing CSV files
csv_folder = 'results'

# Function to create and save a bar chart
def plot_and_save_bar_chart(data, query_label, filename_suffix, query_type, threshold):
    plt.figure(figsize=(12, 6))  # Larger size to reduce crowding

    # Plot precision and recall as bar charts
    bar_width = 0.4
    indices = range(len(data))
    
    plt.bar(indices, data['precision'], width=bar_width, label='Precision', color='#FFB6C1')  # Warm pink color
    plt.bar([i + bar_width for i in indices], data['recall'], width=bar_width, label='Recall', color='#FFD700')  # Warm gold color

    plt.xlabel(f'{query_label.capitalize()} Queries')
    plt.ylabel('Scores')
    plt.title(f'{query_label.capitalize()} Queries - Query Type {query_type}, Threshold {threshold}')
    plt.xticks([i + bar_width / 2 for i in indices], data['query'], rotation=90)

    # Add legend and adjust layout
    plt.legend()
    plt.tight_layout()

    # Save the bar chart
    save_path = os.path.join(output_folder, f'{query_label}_query_type{query_type}_threshold{threshold}_{filename_suffix}.png')
    plt.savefig(save_path)
    plt.close()

# Loop through all CSV files in the 'results' folder
for csv_file in os.listdir(csv_folder):
    if csv_file.endswith('.csv'):
        print(f"Reading {csv_file}")
        # Extract query_type and threshold from the filename
        query_type = csv_file.split('_')[4]
        threshold = csv_file.split('_')[6].split('.')[0]

        # Read the CSV file
        file_path = os.path.join(csv_folder, csv_file)
        df = pd.read_csv(file_path)

        # Remove rows where 'query' is NaN in the entire row
        df_cleaned = df.dropna(subset=['query'])

        # Ensure we get exactly 100 single-word and 100 two-word queries after NaN removal
        df_single_word = df_cleaned.iloc[:100]  # First 100 valid rows after NaN removal
        df_two_word = df_cleaned.iloc[-100:]    # Last 100 valid rows after NaN removal

        # Plot for single-word queries (first 100 non-NaN queries)
        plot_and_save_bar_chart(df_single_word, 'single_word', 'first_100', query_type, threshold)

        # Plot for two-word queries (last 100 non-NaN queries)
        plot_and_save_bar_chart(df_two_word, 'two_word', 'last_100', query_type, threshold)

print("All bar charts have been successfully generated and saved.")


Reading results_with_query_type_6_threshold_5.csv
Reading results_with_query_type_1_threshold_1.csv
Reading results_with_query_type_3_threshold_50.csv
Reading results_with_query_type_1_threshold_10.csv
Reading results_with_query_type_4_threshold_75.csv
Reading results_with_query_type_2_threshold_25.csv
Reading results_with_query_type_6_threshold_25.csv
Reading results_with_query_type_5_threshold_10.csv
Reading results_with_query_type_6_threshold_1.csv
Reading results_with_query_type_1_threshold_5.csv
Reading results_with_query_type_2_threshold_50.csv
Reading results_with_query_type_3_threshold_25.csv
Reading results_with_query_type_5_threshold_75.csv
Reading results_with_query_type_1_threshold_75.csv
Reading results_with_query_type_6_threshold_50.csv
Reading results_with_query_type_4_threshold_10.csv
Reading results_with_query_type_2_threshold_100.csv
Reading results_with_query_type_6_threshold_75.csv
Reading results_with_query_type_2_threshold_5.csv
Reading results_with_query_type_5_t