### Imports and defenitions

In [1]:
import os 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import re  # for filename sanitization
import matplotlib.patches as mpatches

from globals import *

# Define file path
file_path = survey_data

colors = ['#FFFF99', '#FFEDA0', '#C7E9B4', '#7FCDBB', '#2C7FB8']



### A and B.2 Familiarity with Ethics Principles 


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re
import os

def read_and_clean_csv(file_path):
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"Successfully read {file_path}.")
        return df
    except UnicodeDecodeError:
        print(f"Failed to read {file_path} with utf-8 encoding.")
        return None

# Define the importance levels
importance_levels = [
    'Not at all Important',
    'Slightly Important',
    'Moderately Important',
    'Very Important',
    'Extremely Important'
]

# Define the list of privacy values
privacy_values = [
    'Individualism (e.g., Independence)',
    'Collectivism (e.g., Community influence)',
    'Power Distance (e.g., Income inequality)',
    'Uncertainty Avoidance (e.g., How well you handle the unknown)',
    'Masculinity (e.g., Focus on achieving goals)',
    'Femininity (e.g., Focus on building relations)',
    'Long-term Orientation (e.g., Future focused)',
    'Short-term Orientation (e.g., Present focused)',
    'Trust (e.g., Do you trust the app)',
    'Control (e.g., How much control do you want companies to have over your data)',
    'Security (e.g., How protected do you want your information to be)',
    'Ethics',
    'Group (e.g., Do you use apps more if people you know use it)',
    'Family Values (e.g., Importance of family relationships)',
    'Religious Beliefs (e.g., Religious teachings influencing behavior)',
    'Media and news outlets(e.g., Influence local media has)',
    'Educational Backgrounds (e.g., Level and type of education influencing privacy)'
]

# Define demographic columns (adjust these based on your CSV)
demographic_columns = [
    'location', 'Q13.1', 'Q13.2', 'Q13.3', 'Q13.4', 'Q13.5', 'Q13.6'
]

# Placeholder for privacy_value_columns (update these to match your CSV)
privacy_value_columns = [
    'Q5.1_1', 'Q5.1_2', 'Q5.1_3', 'Q5.1_4', 'Q5.1_5', 'Q5.1_6', 'Q5.1_7', 'Q5.1_8',
    'Q5.1_9', 'Q5.1_10', 'Q5.1_11', 'Q5.1_12', 'Q5.1_13', 'Q5.1_14', 'Q5.1_15',
    'Q5.1_16', 'Q5.1_17'
]  # Assuming 17 columns for Q5.1; adjust to actual names

def create_chart(data, category, demographic, colors):
    if data.empty:
        print(f"No data available for {category} in {demographic}")
        return

    fig, ax = plt.subplots(figsize=(18, 6))
    y_pos = np.arange(len(data.index)) * 1.2
    bar_height = 0.8
    cumulative = np.zeros(len(data.index))

    for i, level in enumerate(importance_levels):
        values = data[level].values
        ax.barh(y_pos, values, left=cumulative, height=bar_height, label=level, color=colors[i])
        cumulative += values

    ax.set_xlabel('', fontweight='bold', fontname='Times New Roman')
    ax.set_title(f'Importance of Privacy Values - {demographic}: {category}',
                 fontweight='bold', fontsize=22, fontname='Times New Roman', pad=20)
    ax.set_xlim(0, 100)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    for i, container in enumerate(ax.containers):
        text_color = 'black' if i < 3 else 'white'
        ax.bar_label(container, label_type='center', fmt='%.1f%%', fontname='Times New Roman',
                     fontweight='bold', fontsize=12, padding=4, color=text_color)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(data.index, fontweight='bold', fontname='Times New Roman', fontsize=14)
    ax.set_ylim(y_pos.min() - bar_height/2, y_pos.max() + bar_height/2)

    legend_patches = [mpatches.Patch(color=color, label=level) for color, level in zip(colors, importance_levels)]
    fig.legend(handles=legend_patches, loc='lower center', bbox_to_anchor=(0.5, -0.15),
               ncol=5, fontsize=12, frameon=False)

    plt.tight_layout()

    output_dir = 'figures/importance_analysis/privacy_values'
    os.makedirs(output_dir, exist_ok=True)
    safe_category = re.sub(r'[<>:"/\\|?*]', '_', str(category))
    safe_demographic = re.sub(r'[<>:"/\\|?*]', '_', demographic)
    output_path = os.path.join(output_dir, f'Q5.1_{safe_demographic}_{safe_category}.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()

def create_combined_chart(all_data, demographic, colors, graphs_per_row=3, categories_to_exclude=None):
    if categories_to_exclude is None:
        categories_to_exclude = []

    filtered_data = {k: v for k, v in all_data.items() if k not in categories_to_exclude}
    num_categories = len(filtered_data)

    if num_categories == 0:
        print(f"No categories to plot for {demographic} after excluding {categories_to_exclude}")
        return

    num_rows = (num_categories + graphs_per_row - 1) // graphs_per_row
    num_categories_adjusted = num_rows * graphs_per_row if num_categories % graphs_per_row != 0 else num_categories

    fig, axes = plt.subplots(num_rows, graphs_per_row, figsize=(8 * graphs_per_row, 6 * num_rows), sharey=False)
    fig.suptitle(f'Importance of Privacy Values - {demographic}', 
                 fontweight='bold', fontsize=22, fontname='Times New Roman', y=0.98 if num_rows > 1 else 1.05)

    category_items = list(filtered_data.items())
    for idx, (category, data) in enumerate(category_items):
        if idx >= num_categories_adjusted:
            break

        row = idx // graphs_per_row
        col = idx % graphs_per_row
        ax = axes[row, col] if num_rows > 1 else (axes[col] if graphs_per_row > 1 else axes)

        y_pos = np.arange(len(data.index)) * 1.2
        bar_height = 0.8
        cumulative = np.zeros(len(data.index))

        for i, level in enumerate(importance_levels):
            values = data[level].values
            ax.barh(y_pos, values, left=cumulative, height=bar_height, label=level, color=colors[i])
            cumulative += values

        ax.set_xlabel('', fontweight='bold', fontname='Times New Roman')
        ax.set_title(str(category), fontweight='bold', fontsize=18, fontname='Times New Roman', pad=20)
        ax.set_xlim(0, 100)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        if col == 0:
            ax.set_yticks(y_pos)
            ax.set_yticklabels(data.index, fontweight='bold', fontname='Times New Roman', fontsize=14)
        else:
            ax.set_yticks([])
            ax.set_yticklabels([])

        for i, container in enumerate(ax.containers):
            text_color = 'black' if i < 3 else 'white'
            ax.bar_label(container, label_type='center', fmt='%.1f%%', fontname='Times New Roman',
                         fontweight='bold', fontsize=12, padding=4, color=text_color)

        ax.set_ylim(y_pos.min() - bar_height/2, y_pos.max() + bar_height/2)

    for idx in range(num_categories_adjusted, num_rows * graphs_per_row):
        row = idx // graphs_per_row
        col = idx % graphs_per_row
        if num_rows > 1:
            fig.delaxes(axes[row, col])
        elif graphs_per_row > 1:
            fig.delaxes(axes[col])

    legend_patches = [mpatches.Patch(color=color, label=level) for color, level in zip(colors, importance_levels)]
    fig.legend(handles=legend_patches, loc='lower center', bbox_to_anchor=(0.5, -0.01),
               ncol=min(graphs_per_row, 5), fontsize=12, frameon=False)

    plt.tight_layout()
    plt.subplots_adjust(top=0.15 if num_rows > 1 else 0.25, hspace=0.6 if num_rows > 1 else 0.25, wspace=0.2)

    output_dir = 'figures/importance_analysis/privacy_values'
    os.makedirs(output_dir, exist_ok=True)
    safe_demographic = re.sub(r'[<>:"/\\|?*]', '_', demographic)
    output_path = os.path.join(output_dir, f'Q5.1_{safe_demographic}_combined.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()

def process_and_create_charts(df, demographic_col, colors):
    all_data = {}
    if demographic_col not in df.columns:
        print(f"Error: Column '{demographic_col}' not found in DataFrame. Available columns: {df.columns.tolist()}")
        return all_data

    categories = df[demographic_col].dropna().unique()
    df['GroupedCategory'] = df[demographic_col]

    for category in categories:
        df_category = df[df['GroupedCategory'] == category]
        data = pd.DataFrame(index=privacy_values, columns=importance_levels)

        for idx, value in enumerate(privacy_values):
            if idx < len(privacy_value_columns):
                column = privacy_value_columns[idx]
                if column in df_category.columns:
                    valid_responses = df_category[column].dropna()
                    total_respondents = len(valid_responses)
                    if total_respondents > 0:
                        counts = valid_responses.value_counts()
                        for level, count in counts.items():
                            if level in importance_levels:
                                data.loc[value, level] = (count / total_respondents) * 100

        data = data.fillna(0)
        data = data[importance_levels]
        all_data[category] = data
        create_chart(data, category, demographic_col, colors)

    return all_data

def print_stats(all_data, demographic):
    stats_summary = f"Statistics for {demographic}:\n\n"
    for category, data in all_data.items():
        stats_summary += f"{category}:\n"
        total_respondents = data.iloc[0].sum() / 100
        for value in privacy_values:
            stats_summary += f"{value}:\n"
            for level in importance_levels:
                percentage = data.loc[value, level]
                count = round(percentage * total_respondents / 100)
                stats_summary += f" {level}: {count:.0f} ({percentage:.1f}%),"
            stats_summary += "\n"
        stats_summary += f"Total respondents: {total_respondents:.0f}\n" + "="*50 + "\n"
    print(stats_summary)

def process_combined_data(df, colors):
    all_data = {}
    data = pd.DataFrame(index=privacy_values, columns=importance_levels)

    for idx, value in enumerate(privacy_values):
        if idx < len(privacy_value_columns):
            column = privacy_value_columns[idx]
            if column in df.columns:
                valid_responses = df[column].dropna()
                total_respondents = len(valid_responses)
                if total_respondents > 0:
                    counts = valid_responses.value_counts()
                    for level, count in counts.items():
                        if level in importance_levels:
                            data.loc[value, level] = (count / total_respondents) * 100

    data = data.fillna(0)
    data = data[importance_levels]
    all_data['All Participants'] = data
    create_chart(data, 'All Participants', 'Combined', colors)
    return all_data

# Define colors
colors = ['#FFFF99', '#FFEDA0', '#C7E9B4', '#7FCDBB', '#2C7FB8']

# Main execution
df = read_and_clean_csv("../data/survey_finalized.csv")
if df is not None:
    print(f"DataFrame shape: {df.shape}")
    print(f"Column names: {df.columns.tolist()}")
    print("Processing all participants...")
    combined_data = process_combined_data(df, colors)
    print_stats(combined_data, 'All Participants')

    for demographic_col in demographic_columns:
        print(f"Processing {demographic_col}...")
        all_data = process_and_create_charts(df, demographic_col, colors)
        create_combined_chart(all_data, demographic_col, colors, graphs_per_row=3)
        print_stats(all_data, demographic_col)

    print("All graphs have been saved in the designated folder.")

Successfully read ../data/survey_finalized.csv.
DataFrame shape: (183, 139)
Column names: ['location', 'Q1.1', 'Q2.1', 'Q3.1_1', 'Q3.1_2', 'Q3.1_3', 'Q3.1_4', 'Q3.1_5', 'Q3.1_6', 'Q3.1_7', 'Q3.1_8', 'Q3.1_9', 'Q3.1_10', 'Q3.1_11', 'Q3.1_12', 'Q3.1_13', 'Q3.1_14', 'Q3.1_15', 'Q3.1_16', 'Q3.1_17', 'Q3.1_18', 'Q3.1_19', 'Q3.1_20', 'Q3.1_21', 'Q3.1_22', 'Q3.1_23', 'Q3.1_24', 'Q3.1_24_TEXT', 'Q3.2', 'Q3.2_1_TEXT', 'Q3.3', 'Q3.4', 'Q3.5', 'Q3.6_1', 'Q3.6_2', 'Q3.6_3', 'Q3.6_4', 'Q3.6_5', 'Q3.6_6', 'Q3.6_7', 'Q3.6_7_TEXT', 'Q3.7_1', 'Q3.7_2', 'Q3.7_3', 'Q3.7_4', 'Q3.7_5', 'Q3.7_6', 'Q4.1', 'Q4.2', 'Q5.1_1', 'Q5.1_2', 'Q5.1_3', 'Q5.1_4', 'Q5.1_5', 'Q5.1_6', 'Q5.1_7', 'Q5.1_8', 'Q5.1_9', 'Q5.1_10', 'Q5.1_11', 'Q5.1_12', 'Q5.1_13', 'Q5.1_14', 'Q5.1_15', 'Q5.1_16', 'Q5.1_17', 'Q5.1_18', 'Q5.1_18_TEXT', 'Q5.2', 'Q6.1', 'Q6.1_1_TEXT', 'Q6.2', 'Q6.2_5_TEXT', 'Q6.3', 'Q6.4', 'Q6.5', 'Q6.6', 'Q6.7', 'Q6.8', 'Q6.9', 'Q6.10', 'Q6.11', 'Q6.12', 'Q7.1', 'Q7.2', 'Q7.3', 'Q7.4', 'Q7.5_1', 'Q7.5_2', 'Q7.5_3'

  data = data.fillna(0)


Statistics for All Participants:

All Participants:
Individualism (e.g., Independence):
 Not at all Important: 0 (3.2%), Slightly Important: 0 (9.7%), Moderately Important: 0 (18.1%), Very Important: 0 (38.1%), Extremely Important: 0 (30.3%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (9.0%), Slightly Important: 0 (30.3%), Moderately Important: 0 (32.9%), Very Important: 0 (23.2%), Extremely Important: 0 (3.9%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (12.3%), Slightly Important: 0 (21.4%), Moderately Important: 0 (29.9%), Very Important: 0 (24.7%), Extremely Important: 0 (11.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (5.8%), Slightly Important: 0 (16.1%), Moderately Important: 0 (28.4%), Very Important: 0 (33.5%), Extremely Important: 0 (15.5%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (29.4%), Slightly Important: 0 (21.6%), Moderately Important: 0 (23.5%)

  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)


Statistics for location:

asia, europe, north america:
Individualism (e.g., Independence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Importan

  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values


Statistics for Q13.1:

What is your age?:
Individualism (e.g., Independence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), 

  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)


Statistics for Q13.2:

What is your gender?:
Individualism (e.g., Independence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%

  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)


Statistics for Q13.3:

What is your highest level of education? - Selected Choice:
Individualism (e.g., Independence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Import

  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)


Statistics for Q13.4:

If you have obtained a degree, which fields did you study? (Select all that apply) - Selected Choice:
Individualism (e.g., Independence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (0.0%), Sligh

  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)


Statistics for Q13.5:

What is your religious affliation? - Selected Choice:
Individualism (e.g., Independence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0

  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)
  data = data.fillna(0)


Statistics for Q13.6:

How Important is religion in your daily life? - Selected Choice:
Individualism (e.g., Independence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Collectivism (e.g., Community influence):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Power Distance (e.g., Income inequality):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Uncertainty Avoidance (e.g., How well you handle the unknown):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately Important: 0 (0.0%), Very Important: 0 (0.0%), Extremely Important: 0 (0.0%),
Masculinity (e.g., Focus on achieving goals):
 Not at all Important: 0 (0.0%), Slightly Important: 0 (0.0%), Moderately I

### C Familiarity with Ethics Initiatives 


In [None]:
def read_and_clean_csv(file_path):
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"Successfully read {file_path}.")
        return df
    except UnicodeDecodeError:
        print(f"Failed to read {file_path} with utf-8 encoding.")
        return None

def create_chart(data, category, demographic):
    if data.empty:
        print(f"No data available for {category} in {demographic}")
        return

    fig, ax = plt.subplots(figsize=(18, 6))  # Increase figure size for larger output

    y_pos = np.arange(len(data.index)) * 1.2  # Increase spacing between bars
    bar_height = 0.8  # Bar height

    cumulative = np.zeros(len(data.index))

    for i, level in enumerate(familiarity_levels):
        values = data[level].values
        ax.barh(y_pos, values, left=cumulative, height=bar_height, label=level, color=colors[i])
        cumulative += values

    ax.set_xlabel('', fontweight='bold', fontname='Times New Roman')
    ax.set_title(f'Familiarity with AI Governance Initiatives - {demographic}: {category}',
                 fontweight='bold', fontsize=22, fontname='Times New Roman', pad=20)
    ax.set_xlim(0, 100)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    for i, container in enumerate(ax.containers):
        text_color = 'black' if i < 3 else 'white'
        ax.bar_label(container, label_type='center', fmt='%.1f%%', fontname='Times New Roman',
                     fontweight='bold', fontsize=12, padding=4, color=text_color)

    ax.set_yticks(y_pos)
    ax.set_yticklabels(data.index, fontweight='bold', fontname='Times New Roman', fontsize=14)
    ax.set_ylim(y_pos.min() - bar_height/2, y_pos.max() + bar_height/2)

    legend_patches = [mpatches.Patch(color=color, label=level) for color, level in zip(colors, familiarity_levels)]
    fig.legend(handles=legend_patches, loc='lower center', bbox_to_anchor=(0.5, -0.15),
               ncol=5, fontsize=12, frameon=False)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.25)

    # Save the chart as an image file
    safe_category = re.sub(r'[<>:"/\\|?*]', '_', category)
    safe_demographic = re.sub(r'[<>:"/\\|?*]', '_', demographic)
    plt.savefig(f'figures/familiarity_analysis/governance_familiarity/C.2_{safe_demographic}_{safe_category}.png', dpi=300, bbox_inches='tight')
    plt.close()

def create_combined_chart(all_data, demographic, graphs_per_row=3, categories_to_exclude=None):
    if categories_to_exclude is None:
        categories_to_exclude = []

    filtered_data = {k: v for k, v in all_data.items() if k not in categories_to_exclude}
    num_categories = len(filtered_data)

    if num_categories == 0:
        print(f"No categories to plot for {demographic} after excluding {categories_to_exclude}")
        return

    num_rows = (num_categories + graphs_per_row - 1) // graphs_per_row
    num_categories_adjusted = num_rows * graphs_per_row if num_categories % graphs_per_row != 0 else num_categories

    fig, axes = plt.subplots(num_rows, graphs_per_row, figsize=(8 * graphs_per_row, 6 * num_rows), sharey=False)
    fig.suptitle(f'Familiarity with AI Governance Initiatives - {demographic}', 
                 fontweight='bold', fontsize=22, fontname='Times New Roman', y=0.98 if num_rows > 1 else 1.05)

    category_items = list(filtered_data.items())
    for idx, (category, data) in enumerate(category_items):
        if idx >= num_categories_adjusted:
            break

        row = idx // graphs_per_row
        col = idx % graphs_per_row
        ax = axes[row, col] if num_rows > 1 else (axes[col] if graphs_per_row > 1 else axes)

        y_pos = np.arange(len(data.index)) * 1.2
        bar_height = 0.8
        cumulative = np.zeros(len(data.index))

        for i, level in enumerate(familiarity_levels):
            values = data[level].values
            ax.barh(y_pos, values, left=cumulative, height=bar_height, label=level, color=colors[i])
            cumulative += values

        ax.set_xlabel('', fontweight='bold', fontname='Times New Roman')
        ax.set_title(category, fontweight='bold', fontsize=18, fontname='Times New Roman', pad=20)
        ax.set_xlim(0, 100)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        if col == 0:
            ax.set_yticks(y_pos)
            ax.set_yticklabels(data.index, fontweight='bold', fontname='Times New Roman', fontsize=14)
        else:
            ax.set_yticks([])
            ax.set_yticklabels([])

        for i, container in enumerate(ax.containers):
            text_color = 'black' if i < 3 else 'white'
            ax.bar_label(container, label_type='center', fmt='%.1f%%', fontname='Times New Roman',
                         fontweight='bold', fontsize=12, padding=4, color=text_color)

        ax.set_ylim(y_pos.min() - bar_height/2, y_pos.max() + bar_height/2)

    # Remove any unused subplots
    for idx in range(num_categories_adjusted, num_rows * graphs_per_row):
        row = idx // graphs_per_row
        col = idx % graphs_per_row
        if num_rows > 1:
            fig.delaxes(axes[row, col])
        elif graphs_per_row > 1:
            fig.delaxes(axes[col])

    legend_patches = [mpatches.Patch(color=color, label=level) for color, level in zip(colors, familiarity_levels)]
    fig.legend(handles=legend_patches, loc='lower center', bbox_to_anchor=(0.5, -0.01),
               ncol=min(graphs_per_row, 5), fontsize=12, frameon=False)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.15 if num_rows > 1 else 0.25, hspace=0.6 if num_rows > 1 else 0.25, wspace=0.2)

    safe_demographic = re.sub(r'[<>:"/\\|?*]', '_', demographic)
    plt.savefig(f'figures/familiarity_analysis/governance_familiarity/C.2_{safe_demographic}_combined.png', dpi=300, bbox_inches='tight')
    plt.close()

def process_and_create_charts(df, demographic):
    all_data = {}

    if demographics[demographic]['mapping']:
        categories = set(demographics[demographic]['mapping'].values())
        df['GroupedCategory'] = df.iloc[:, demographics[demographic]['column']].map(demographics[demographic]['mapping'])
    else:
        categories = df.iloc[:, demographics[demographic]['column']].dropna().unique()
        df['GroupedCategory'] = df.iloc[:, demographics[demographic]['column']]

    for category in categories:
        df_category = df[df['GroupedCategory'] == category]
        data = pd.DataFrame(index=regulation_names, columns=familiarity_levels)

        for idx, initiative in enumerate(regulation_names):
            if idx < len(regulation_columns):
                column = regulation_columns[idx]
                if column < df_category.shape[1]:
                    valid_responses = df_category.iloc[:, column].dropna()
                    total_respondents = len(valid_responses)
                    if total_respondents > 0:
                        counts = valid_responses.value_counts()
                        for level, count in counts.items():
                            if level in familiarity_levels:
                                data.loc[initiative, level] = (count / total_respondents) * 100
                    else:
                        print(f"Column index {column} is out of range for {initiative} in {category}")

        data = data.fillna(0)
        data = data[familiarity_levels]
        all_data[category] = data
        create_chart(data, category, demographic)

    return all_data

def print_stats(all_data, demographic):
    stats_summary = f"Statistics for {demographic}:\n\n"
    for category, data in all_data.items():
        stats_summary += f"{category}:\n"
        total_respondents = data.iloc[0].sum() / 100
        for initiative in regulation_names:
            stats_summary += f"{initiative}:\n"
            for level in familiarity_levels:
                percentage = data.loc[initiative, level]
                count = round(percentage * total_respondents / 100)
                stats_summary += f" {level}: {count:.0f} ({percentage:.1f}%),"
            stats_summary += "\n"
        stats_summary += f"Total respondents: {total_respondents:.0f}\n" + "="*50 + "\n"

    # Instead of writing to a text file, we simply print the summary
    print(stats_summary)

def process_combined_data(df):
    all_data = {}
    data = pd.DataFrame(index=regulation_names, columns=familiarity_levels)

    for idx, initiative in enumerate(regulation_names):
        if idx < len(regulation_columns):
            column = regulation_columns[idx]
            if column < df.shape[1]:
                valid_responses = df.iloc[:, column].dropna()
                total_respondents = len(valid_responses)
                if total_respondents > 0:
                    counts = valid_responses.value_counts()
                    for level, count in counts.items():
                        if level in familiarity_levels:
                            data.loc[initiative, level] = (count / total_respondents) * 100

    data = data.fillna(0)
    data = data[familiarity_levels]
    all_data['All Participants'] = data
    create_chart(data, 'All Participants', 'Combined')
    return all_data

# Main execution
df = read_and_clean_csv(file_path)
if df is not None:
    print("Processing all participants...")
    combined_data = process_combined_data(df)
    print_stats(combined_data, 'All Participants')

    for demographic in demographics:
        print(f"Processing {demographic}...")
        all_data = process_and_create_charts(df, demographic)
        # Generate combined charts across different categories within a demographic, excluding 'Other' for 'Role'
        categories_to_exclude = ['Other'] if demographic == 'Role' else []
        create_combined_chart(all_data, demographic, graphs_per_row=3, categories_to_exclude=categories_to_exclude)
        print_stats(all_data, demographic)

    print("All graphs have been saved in the designated folder.")



### Rank Demographics Familiarity with Principles and Regultions (Requires above code to be run first), input demographic at bottom of cell


In [7]:

###################
# Rank demographics familiarities with principles
###################

def read_and_clean_csv(file_path):
    try:
        df = pd.read_csv(file_path, encoding='utf-8')
        print(f"Successfully read {file_path}.")
        return df
    except UnicodeDecodeError:
        print(f"Failed to read {file_path} with utf-8 encoding.")
        return None

def rank_demographics_by_principle(df, track, demographic_key):
    ranked_data = {}

    # Get the demographic details from the selected key
    demographic = demographics[demographic_key]
    print(f"\nRanking {demographic_key} demographics for Track {track}...\n")

    # Process each principle and demographic to calculate the percentage for "Extremely Familiar" and "Moderately Familiar"
    for principle_idx, principle in enumerate(principles):
        ranked_data[principle] = []

        demographic_data = {category: 0 for category in set(demographic['mapping'].values())}  # Initialize with 0 for all categories

        df['GroupedCategory'] = df.iloc[:, demographic['column']].map(demographic['mapping'])

        # Iterate over each category within the demographic (e.g., 1-5 Employees, 6-20 Employees, etc.)
        for category in set(demographic['mapping'].values()):
            df_category = df[df['GroupedCategory'] == category]
            valid_columns = principle_columns[track]
            df_track = df_category.dropna(subset=[df_category.columns[valid_columns[principle_idx]]])

            if not df_track.empty:
                # Count "Extremely Familiar" and "Moderately Familiar" responses
                counts = df_track.iloc[:, valid_columns[principle_idx]].value_counts()
                extremely_familiar = counts.get('Extremely Familiar', 0)
                moderately_familiar = counts.get('Moderately Familiar', 0)
                somewhat_familiar = counts.get('Somewhat Familiar', 0)
                total_respondents = len(df_track)

                total_familiar = ((extremely_familiar + moderately_familiar + somewhat_familiar) / total_respondents) * 100
                demographic_data[category] = total_familiar

        # Rank the categories for this demographic and principle
        ranked_categories = sorted(demographic_data.items(), key=lambda x: x[1], reverse=True)
        ranked_data[principle].append((demographic_key, ranked_categories))

    return ranked_data

def display_ranked_data(ranked_data):
    for principle, demographic_rankings in ranked_data.items():
        print(f"\n### {principle} Rankings ###\n")
        for demographic, rankings in demographic_rankings:
            print(f"{demographic}:")
            for category, percentage in rankings:
                if percentage > 0:  # Only display categories with responses
                    print(f"  {category}: {percentage:.1f}%")
            print("\n")

# Main execution
df = read_and_clean_csv(file_path)
if df is not None:
    # Select the demographic to rank (e.g., 'Company Size')
    selected_demographic = 'Location' # This can be changed to other demographic keys like 'Location', etc.

    for track in ['B']:
        print(f"Processing Track {track} for {selected_demographic}...")
        ranked_data = rank_demographics_by_principle(df, track, selected_demographic)
        display_ranked_data(ranked_data)


Successfully read c:\Users\Baldw\Desktop\UMO\PERC\qualtrics_analysis\data\survey_finalized.csv.
Processing Track B for Location...

Ranking Location demographics for Track B...


### Respect for Human Rights Rankings ###

Location:



### Data Protection and Right to Privacy Rankings ###

Location:



### Harm Prevention and Beneficence Rankings ###

Location:



### Non-Discrimination and Freedom of Privileges Rankings ###

Location:



### Fairness and Justice Rankings ###

Location:



### Transparency and Explainability of AI Systems Rankings ###

Location:



### Accountability and Responsibility Rankings ###

Location:



### Democracy and Rule of Law Rankings ###

Location:



### Environment and Social Responsibility Rankings ###

Location:




In [None]:
###################
# Rank demographics familiarities with regulations
###################

def rank_by_familiarity_and_average(df, demographic):
    # Ensure the demographic is valid
    if demographic not in demographics:
        print(f"Invalid demographic: {demographic}")
        return
    
    # Prepare to store the ranking data and average scores
    ranking_data = {}
    avg_familiarity_by_category = {}

    # Group by the selected demographic category
    df['GroupedCategory'] = df.iloc[:, demographics[demographic]['column']].map(demographics[demographic]['mapping'])

    for initiative in regulation_names:
        initiative_scores = {}

        # For each category within the demographic (e.g., US, Europe for Location)
        for category in set(demographics[demographic]['mapping'].values()):
            df_category = df[df['GroupedCategory'] == category]
            if df_category.empty:
                continue
            
            # Calculate the percentage of respondents for "Extremely Familiar", "Moderately Familiar", and "Somewhat Familiar"
            total_respondents = len(df_category)
            if total_respondents == 0:
                continue
            
            familiarity_count = df_category.iloc[:, regulation_names.index(initiative) + regulation_columns[0]].value_counts()
            at_least_somewhat_familiar = familiarity_count.get('Extremely Familiar', 0) + familiarity_count.get('Moderately Familiar', 0) + familiarity_count.get('Somewhat Familiar', 0)
            
            percentage = (at_least_somewhat_familiar / total_respondents) * 100
            initiative_scores[category] = percentage

            # Add score to the average familiarity for each category
            if category not in avg_familiarity_by_category:
                avg_familiarity_by_category[category] = []
            avg_familiarity_by_category[category].append(percentage)
        
        # Sort the categories for this regulation based on the percentage
        sorted_scores = sorted(initiative_scores.items(), key=lambda x: x[1], reverse=True)
        ranking_data[initiative] = sorted_scores

    # Print out the ranking results for each regulation
    for initiative, rankings in ranking_data.items():
        print(f"\nRanking for {initiative} based on '{demographic}':")
        for idx, (category, score) in enumerate(rankings, start=1):
            print(f"{idx}. {category}: {score:.2f}% at least somewhat familiar")

    # Calculate and print the average familiarity score for each category across all principles
    avg_familiarity_scores = {
        category: sum(scores) / len(scores) for category, scores in avg_familiarity_by_category.items()
    }

    # Sort the categories based on their average scores
    sorted_avg_scores = sorted(avg_familiarity_scores.items(), key=lambda x: x[1], reverse=True)

    print(f"\nAverage familiarity across all principles for {demographic}:")
    for idx, (category, avg_score) in enumerate(sorted_avg_scores, start=1):
        print(f"{idx}. {category}: {avg_score:.2f}% average familiarity")

# Example usage:
demographic_input = 'Location'  # Change to the desired demographic to analyze
rank_by_familiarity_and_average(df, demographic_input)


### Familiarity Heatmaps 



In [None]:
####################################################################
# Build heatmaps of the regions to their familiarity of principles and regulations
# Survey Questions (P8) -> (B.2.1 + A.1.1) combined and (P10) -> Regulations
###################################################################
# Load the CSV file into a DataFrame
df = pd.read_csv('accepted_maybe_responses.csv')

# Use the 28th column for the region (index 27 because Python uses 0-based indexing)
region_column = df.columns[28]

# Use columns CB through CJ (79:88) and AU through BC (46:55) for Likert scale responses (principles)
likert_columns_1 = df.columns[79:88]  # CB through CJ
likert_columns_2 = df.columns[46:55]  # AU through BC

# Use columns HM through HT (220:228) for Likert scale responses (regulations)
regulation_columns = df.columns[220:228]  # HM through HT

# Create a mapping for the Likert scale responses to numeric values
likert_mapping = {
    "Not Familiar At All": 1,
    "Slightly Familiar": 2,
    "Somewhat Familiar": 3,
    "Moderately Familiar": 4,
    "Extremely Familiar": 5
}

# Create a mapping for the column names to the principles
principle_mapping = {
    df.columns[79]: "Respect for Human Rights",
    df.columns[80]: "Data Protection and Right to Privacy",
    df.columns[81]: "Harm Prevention and Beneficence",
    df.columns[82]: "Non-Discrimination and Freedom of Privileges",
    df.columns[83]: "Fairness and Justice",
    df.columns[84]: "Transparency and Explainability of AI Systems",
    df.columns[85]: "Accountability and Responsibility",
    df.columns[86]: "Democracy and Rule of Law",
    df.columns[87]: "Environment and Social Responsibility"
}

# Create a mapping for the column names to the regulations
regulation_mapping = {
    df.columns[220]: "European Union Artificial Intelligence Act",
    df.columns[221]: "US Executive Order on Safe, Secure and Trustworthy AI",
    df.columns[222]: "US Algorithmic Accountability Act",
    df.columns[223]: "NIST Technical AI Standards",
    df.columns[224]: "NIST AI Risk Management Framework",
    df.columns[225]: "UN General Assembly's Resolution on AI Systems",
    df.columns[226]: "OECD Principles for Trustworthy AI",
    df.columns[227]: "G20 AI Principles"
}

# Apply the mapping to all Likert scale columns
for column in list(likert_columns_1) + list(likert_columns_2) + list(regulation_columns):
    df[column] = df[column].map(likert_mapping)

# Combine the two sets of Likert columns for principles
for i in range(9):
    combined_column = f'combined_principle_{i}'
    df[combined_column] = df[[likert_columns_1[i], likert_columns_2[i]]].mean(axis=1)

combined_principle_columns = [f'combined_principle_{i}' for i in range(9)]

# Function to create and display heatmap
def plot_heatmap(data, title, xlabel):
    # Remove rows with all NaN values
    data_clean = data.dropna(how='all')
    
    if data_clean.empty:
        print(f"No data to plot for: {title}")
        return
    
    plt.figure(figsize=(20, 12))
    sns.heatmap(data_clean, annot=True, cmap='coolwarm', linewidths=.5, fmt=".2f", cbar_kws={'label': 'Familiarity Level'})
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Region / Country')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Calculate and plot for principles
average_familiarity_by_region_principles = df.groupby(region_column)[combined_principle_columns].mean()
average_familiarity_by_region_principles.columns = [principle_mapping[col] for col in likert_columns_1]
plot_heatmap(average_familiarity_by_region_principles, 'Familiarity with AI Ethics Principles by Region (Combined)', 'AI Ethics Principles')

# Calculate and plot for regulations
average_familiarity_by_region_regulations = df.groupby(region_column)[regulation_columns].mean()
average_familiarity_by_region_regulations = average_familiarity_by_region_regulations.rename(columns=regulation_mapping)
plot_heatmap(average_familiarity_by_region_regulations, 'Familiarity with AI Regulations by Region', 'AI Regulations')

# Create a new DataFrame with grouped regions
def group_regions(region):
    if region in ['North America', 'EU/UK/EEA']:
        return region
    else:
        return 'Other'

grouped_df = df.copy()
grouped_df[region_column] = grouped_df[region_column].apply(group_regions)

# Calculate and plot for grouped principles
grouped_average_familiarity_principles = grouped_df.groupby(region_column)[combined_principle_columns].mean()
grouped_average_familiarity_principles.columns = [principle_mapping[col] for col in likert_columns_1]
plot_heatmap(grouped_average_familiarity_principles, 'Familiarity with AI Ethics Principles by Grouped Regions (Combined)', 'AI Ethics Principles')

# Calculate and plot for grouped regulations
grouped_average_familiarity_regulations = grouped_df.groupby(region_column)[regulation_columns].mean()
grouped_average_familiarity_regulations = grouped_average_familiarity_regulations.rename(columns=regulation_mapping)
plot_heatmap(grouped_average_familiarity_regulations, 'Familiarity with AI Regulations by Grouped Regions', 'AI Regulations')

# Save the data to CSV for further review
average_familiarity_by_region_principles.to_csv('familiarity_by_region_and_principle_combined.csv')
average_familiarity_by_region_regulations.to_csv('familiarity_by_region_and_regulation.csv')
grouped_average_familiarity_principles.to_csv('familiarity_by_grouped_region_and_principle_combined.csv')
grouped_average_familiarity_regulations.to_csv('familiarity_by_grouped_region_and_regulation.csv')

In [None]:
####################################################################
# Build heatmaps of the roles to their familiarity with principles and regulations
# Survey Questions (P9) -> (B.2.1 + A.1.1) and (P10) -> Regulations
####################################################################

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the CSV file into a DataFrame
df = pd.read_csv('accepted_maybe_responses.csv')

# Use the 31st column for the role (index 30 because Python uses 0-based indexing)
role_column = df.columns[30]

# Use columns CB through CJ (79:88) and AU through BC (46:55) for Likert scale responses (principles)
likert_columns_1 = df.columns[79:88]  # CB through CJ
likert_columns_2 = df.columns[46:55]  # AU through BC

# Use columns HM through HT (213:221) for Likert scale responses (regulations)
regulation_columns = df.columns[220:228]  # HM through HT

# Create a mapping for the Likert scale responses to numeric values
likert_mapping = {
    "Not Familiar At All": 1,
    "Slightly Familiar": 2,
    "Somewhat Familiar": 3,
    "Moderately Familiar": 4,
    "Extremely Familiar": 5
}

# Create a mapping for the column names to the principles
principle_mapping = {
    0: "Respect for Human Rights",
    1: "Data Protection and Right to Privacy",
    2: "Harm Prevention and Beneficence",
    3: "Non-Discrimination and Freedom of Privileges",
    4: "Fairness and Justice",
    5: "Transparency and Explainability of AI Systems",
    6: "Accountability and Responsibility",
    7: "Democracy and Rule of Law",
    8: "Environment and Social Responsibility"
}

# Create a mapping for the column names to the regulations
regulation_mapping = {
    df.columns[220]: "European Union Artificial Intelligence Act",
    df.columns[221]: "US Executive Order on Safe, Secure and Trustworthy AI",
    df.columns[222]: "US Algorithmic Accountability Act",
    df.columns[223]: "NIST Technical AI Standards",
    df.columns[224]: "NIST AI Risk Management Framework",
    df.columns[225]: "UN General Assembly's Resolution on AI Systems",
    df.columns[226]: "OECD Principles for Trustworthy AI",
    df.columns[227]: "G20 AI Principles"
}

# Apply the mapping to all Likert scale columns
for column in list(likert_columns_1) + list(likert_columns_2) + list(regulation_columns):
    df[column] = df[column].map(likert_mapping)

# Drop rows where the role is empty
df = df.dropna(subset=[role_column])

# Calculate the average of both sets of responses for principles
for i in range(9):
    df[f'combined_principle_{i}'] = df[[likert_columns_1[i], likert_columns_2[i]]].mean(axis=1)

# Calculate the mean familiarity for each role and principle
combined_principle_columns = [f'combined_principle_{i}' for i in range(9)]
average_familiarity_by_role_principles = df.groupby(role_column)[combined_principle_columns].mean()

# Rename the columns to the principles
average_familiarity_by_role_principles.columns = [principle_mapping[i] for i in range(9)]

# Calculate the mean familiarity for each role and regulation
average_familiarity_by_role_regulations = df.groupby(role_column)[regulation_columns].mean()

# Rename the columns to the regulations
average_familiarity_by_role_regulations = average_familiarity_by_role_regulations.rename(columns=regulation_mapping)

# Function to plot heatmap
def plot_heatmap(data, title, ylabel):
    if data.empty:
        print(f"No valid data to plot for {title}. Please check your input data.")
    else:
        # Remove any rows or columns that are all NaN
        data = data.dropna(how='all').dropna(axis=1, how='all')
        
        if data.empty:
            print(f"After removing NaN values, no data remains for {title}. Please check your input data.")
        else:
            plt.figure(figsize=(20, 12))  # Increased figure size
            sns.heatmap(data, annot=True, cmap='coolwarm', linewidths=.5, fmt=".2f", cbar_kws={'label': 'Average Familiarity Level'})
            plt.title(title)
            plt.xlabel('AI Ethics Principles / Regulations')
            plt.ylabel(ylabel)
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.show()

# Plot heatmap for principles
plot_heatmap(average_familiarity_by_role_principles, 'Average Familiarity with AI Ethics Principles by Role (Combined Responses)', 'Role')

# Plot heatmap for regulations
plot_heatmap(average_familiarity_by_role_regulations, 'Average Familiarity with AI Regulations by Role', 'Role')

# Save the pivoted data to CSV for further review
average_familiarity_by_role_principles.to_csv('familiarity_by_role_and_principle_combined.csv')
average_familiarity_by_role_regulations.to_csv('familiarity_by_role_and_regulation.csv')