In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from torchvision import datasets
import os
from itertools import chain, product
from development import data_handler

This notebook saves two Excel files with the following content:
1. Question metadata (question code, img_idx, img_label, model, xai, outcome)
2. The long-format version of the SoSci data specified in DATA_PATH, merged with the question metadata

# Data preparation

## Read questionnaire information from picking procedure

In [2]:
# Load the shuffled questionnaires and the ImageNet class-index labels through the
# project's data_handler helpers. `labels` is later indexed as labels[<class folder name>][1].
questionnaires = data_handler.get_questionnaires("../data/question_generation/questionaires_shuffled.pickle")
labels = data_handler.get_labels("../data/imagenet_class_index.json")
# NOTE: the notebook changes the path from .\data/imagenet_class_index.json to '.\\data/imagenet_class_index.json'

Using downloaded and verified file: .\../data/imagenet_class_index.json


In [3]:
# ImageFolder instances keyed by dataset root. Building an ImageFolder indexes the
# whole dataset directory, so it is created once per root instead of once per call.
# Re-running this cell resets the cache.
_IMAGE_FOLDER_CACHE = {}


def get_label_from_img_idx(img_idx, testset_path, labels):
    """Return the human-readable true label of the image at position ``img_idx``.

    Parameters
    ----------
    img_idx : int
        Position of the image in ``ImageFolder(testset_path).imgs``.
    testset_path : str
        Root directory of the test set; the name of the sub-folder that contains
        an image is used as the key into ``labels``.
    labels : mapping
        Maps the class folder name (a string) to a sequence whose element ``[1]``
        is the label name.

    Returns
    -------
    The label name stored at ``labels[<class folder name>][1]``.

    Raises
    ------
    IndexError
        If ``img_idx`` is outside the range of images found under ``testset_path``.
    KeyError
        If the class folder name is not a key of ``labels``.
    """
    img_folder = _IMAGE_FOLDER_CACHE.get(testset_path)
    if img_folder is None:
        img_folder = datasets.ImageFolder(root=testset_path)
        _IMAGE_FOLDER_CACHE[testset_path] = img_folder

    img_path = img_folder.imgs[img_idx][0]
    # The parent directory name is the class index. os.path handles mixed
    # separators, which a plain split on os.sep does not.
    class_idx_true_str = os.path.basename(os.path.dirname(img_path))
    img_label_true = labels[class_idx_true_str][1]
    return img_label_true

In [5]:
# Prepend the image label name to every question tuple, questionnaire by questionnaire.
questionnaires_2 = []
for questionnaire in tqdm(questionnaires):
    questionnaires_2.append([
        (get_label_from_img_idx(question[0], "../data/imagenetv2-matched-frequency-format-val", labels),) + question
        for question in questionnaire
    ])

100%|██████████| 12/12 [00:41<00:00,  3.44s/it]


## Load question codes used in SoSci

In [8]:
# Collect the SoSci question codes of every questionnaire file, in file order.
# The files are numbered 1..12 (one per questionnaire).
codes_list = []
for i in range(1, 12 + 1):
    # Each file is ';'-separated without a header; the second column holds the code text.
    codes = pd.read_csv(f"questionnaires_shuffle_order/questionnaire_{i}.txt", sep=";", names=[0, 1])[1]
    # Raw string so that \w and \d reach the regex engine instead of being treated
    # as (invalid) string escapes. A code is one word character followed by three digits.
    codes = codes.str.extract(r"(\w\d{3})", expand=False)
    codes = codes.tolist()
    codes_list.append(codes)

FileNotFoundError: [Errno 2] No such file or directory: '../questionnaire_forms_conducted_survey/questionnaire_1.txt'

## Create question metadata DataFrame

In [8]:
# Flatten the per-questionnaire lists into one row per question and attach the
# matching SoSci question code as the sixth column.
question_rows = list(chain.from_iterable(questionnaires_2))
question_codes = list(chain.from_iterable(codes_list))

df_quest_meta = pd.DataFrame(question_rows)
df_quest_meta[5] = question_codes
df_quest_meta.columns = [
    "label",
    "img_idx",
    "model",
    "method",
    "is_pred_correct",
    "question_code",
]

In [9]:
# Display the assembled question metadata (one row per question).
df_quest_meta

Unnamed: 0,label,img_idx,model,method,is_pred_correct,question_code
0,custard_apple,9521,vgg,LRP,True,B108
1,Boston_bull,1073,vgg,SHAP,True,B111
2,toilet_tissue,9992,alex,IntegratedGradients,False,B118
3,gas_pump,5253,vgg,ConfidenceScores,False,B123
4,mailbag,5976,vgg,IntegratedGradients,False,B124
...,...,...,...,...,...,...
283,custard_apple,9521,vgg,gradCAM,True,C307
284,coral_reef,9710,alex,gradCAM,True,C302
285,black_grouse,7798,alex,LRP,False,C319
286,steel_arch_bridge,8034,alex,SHAP,True,C317


## Load and transform questionnaire data

In [24]:
# Input files: the SoSci data export (Excel) and the answer-code table (tab-separated, UTF-16).
DATA_PATH = "data/data_tu-helpfulness-of-xai_2022-06-29_10-48.xlsx"
ANSER_CODES_PATH = "data/values_tu-helpfulness-of-xai_2022-06-28_11-48.csv"
df = pd.read_excel(DATA_PATH)
# Row 0 holds the column descriptions; override the description of DE09.
# A single .loc call writes into `df` itself. The chained form df["DE09"].loc[0] = ...
# may write to a temporary copy and is a no-op under pandas copy-on-write.
df.loc[0, "DE09"] = "ML Experience Usefulness"
# Index the answer codes by (variable, response) so a meaning can be looked up per value.
df_answer_codes = pd.read_csv(ANSER_CODES_PATH, sep='\t', encoding='utf-16').set_index(["VAR", "RESPONSE"])

In [27]:
def map_answer_codes_to_textual(s, df_answer_codes):
    """Replace the coded answers in a column by their textual meaning.

    Parameters
    ----------
    s : pandas.Series
        One questionnaire column; ``s.name`` is used as the variable name for the lookup.
    df_answer_codes : pandas.DataFrame
        Answer-code table looked up as ``df_answer_codes.loc[s.name, value]["MEANING"]``.

    Returns
    -------
    pandas.Series
        ``s`` with every value that has an entry in ``df_answer_codes`` replaced by
        its ``MEANING``; values without an entry are kept unchanged. If the mapping
        of the whole column fails, a message is printed and ``s`` is returned as is.
    """
    def map_(s_name, e, df_answer_codes):
        try:
            return df_answer_codes.loc[s_name, e]["MEANING"]
        except Exception:
            # No code entry for this (variable, value) pair: keep the raw value.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            return e

    try:
        return s.apply(lambda e: map_(s.name, e, df_answer_codes))
    except Exception:
        print("Error in mapping column", s.name)
        return s

In [28]:
def convert_q_data_from_wide_to_long(df):
    """Convert the wide SoSci export into one row per (case, answered question).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide export whose row with index label 0 holds the column descriptions.
        Must contain the columns ``CASE``, ``QUESTNNR``, ``TIME003`` and the
        contiguous column range ``DE02`` .. ``FB01_01``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``case``, the ``DE02`` .. ``FB01_01`` columns (renamed to their
        descriptions, answer codes mapped to text), ``QUESTNNR``, ``TIME003``,
        ``question_code`` and ``response`` (True where the raw answer equals 1).
        Rows are sorted by ``case``.

    Raises
    ------
    ValueError
        If ``CASE`` values are not unique, because the per-case columns could
        then not be attached unambiguously.

    Notes
    -----
    Reads the module-level ``df_answer_codes`` for the code-to-text mapping.
    """
    first_demo = df.columns.get_loc("DE02")
    last_demo = df.columns.get_loc("FB01_01")
    demo_columns = list(df.columns[first_demo:last_demo + 1])
    column_names_demographic = list(df.loc[0, demo_columns])

    # delete column descriptions
    df = df.drop(0)
    if not df["CASE"].is_unique:
        raise ValueError("CASE values must be unique to attach per-case columns")

    # Columns 6..293 are melted as question columns.
    # NOTE(review): presumably the 288 question items (12 questionnaires x 24) - confirm against the export.
    df_long = pd.melt(df, id_vars="CASE", value_vars=df.columns.values[6:294]).dropna()
    df_long.columns = ["case", "question_code", "response"]
    df_long = df_long.sort_values("case", kind="stable").reset_index(drop=True)
    # map response 1(Yes)/2(No) values to True/False
    df_long["response"] = df_long["response"] == 1

    df_demo = df[demo_columns + ["QUESTNNR", "TIME003"]]
    df_demo = df_demo.apply(lambda s: map_answer_codes_to_textual(s, df_answer_codes))
    df_demo.columns = column_names_demographic + ["QUESTNNR", "TIME003"]
    # Attach the per-case columns by case id rather than by repeating each row a
    # fixed number of times: this stays aligned when a participant answered fewer
    # questions or when the input is not sorted by CASE.
    df_demo.index = df["CASE"].to_numpy()
    df_demo = df_demo.loc[df_long["case"].to_numpy()].reset_index(drop=True)

    df_long = pd.concat([df_long["case"], df_demo, df_long[df_long.columns[1:]]], axis=1)

    return df_long

In [29]:
# Transform the wide SoSci export loaded above into long format.
df_long = convert_q_data_from_wide_to_long(df)

In [30]:
# Attach the question metadata to every answer. The join key is named explicitly so
# that an accidental second shared column name cannot silently become part of the key.
df_merged = df_long.merge(right=df_quest_meta, on="question_code")
df_merged = df_merged.sort_values("case")

## Save question meta data & prepared data to file

In [31]:
# Write the question metadata to an Excel file (the DataFrame index is written as well).
df_quest_meta.to_excel("data/question_meta_data.xlsx")

In [32]:
# Derive "<input file name>_PREPARED.<ext>" from DATA_PATH. os.path is used because
# DATA_PATH is written with "/" and splitting on os.sep leaves it unsplit on Windows;
# splitext also keeps file names that contain additional dots intact.
source_stem, source_ext = os.path.splitext(os.path.basename(DATA_PATH))
data_format = source_ext.lstrip(".")
filenname = f"{source_stem}_PREPARED.{data_format}"
OUTPUT_PATH = f"data/{filenname}"
df_merged.to_excel(OUTPUT_PATH)