# Purpose:

This notebook takes you on a tour of all the CSVs generated during the dataset audit process.

In [1]:
# Stretch the notebook's `.container` element to the full browser width by injecting a CSS rule.
# `display` and `HTML` come from the public `IPython.display` module: importing them from
# `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import os

In [2]:
# File names of the audit CSVs, grouped by topic.
# NOTE(review): none of the cells visible in this notebook reference these lists afterwards;
# they appear to serve as an inventory only.

# Census-related files.
list_census = [
    'df_insightface_stats.csv',
    'df_audit_age_gender_dex.csv',
    'df_nsfw.csv',
    'df_acc_classwise_resnet50.csv',
    'df_acc_classwise_NasNet_mobile.csv',
    'df_imagenet_names_umap.csv',
    'df_census_imagenet_61.csv',
    'df_census_columns_interpretation.csv',
    'df_hand_survey.csv',
]

# Tiny-images file.
list_tiny = ['df_classes_tiny_images_3.csv']

# Dog-analysis files.
list_dog = [
    'df_dog_analysis.csv',
    'dogs_imagenet.csv',
    'df_dog_groups.csv',
]

# Miscellaneous ImageNet files ('df_imagenet_names_umap.csv' is also listed in list_census).
list_misc = [
    'df_imagenet_stats.csv',
    'df_imagenet_census.csv',
    'df_imagenet_classes.csv',
    'df_imagenet_renamed.csv',
    'df_imagenet_names_umap.csv',
    'df_comb_umap.csv',
]

In [3]:
# Relative path of the directory that holds all the audit CSVs (DATA/csv, one level up).
# Built with os.path.join so the platform's own separator is used: a literal with backslashes
# only resolves on Windows and depends on the invalid escape sequences '\D' and '\c' being
# passed through unchanged (newer Python versions warn about them).
csv_dir = os.path.join('..', 'DATA', 'csv')

# 1: Census files

'df_insightface_stats.csv',
'df_audit_age_gender_dex.csv',
'df_nsfw.csv',
'df_acc_classwise_resnet50.csv',
'df_acc_classwise_NasNet_mobile.csv',
'df_imagenet_names_umap.csv',
'df_census_imagenet_61.csv',
'df_census_columns_interpretation.csv',
'df_hand_survey.csv' 

(The meaning of each parameter in the columns of these dataframes is explained in 'df_census_columns_interpretation.csv')

In [4]:
# 1: Class-wise statistics (24 parameters) obtained by running the InsightFace model on the ImageNet dataset.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_insightface_stats.csv'))
print(df.shape)
df.head(n=2)

(1000, 30)


Unnamed: 0,class_number,wordnet_id,label,uri,n_train,n_val,n_humans_train,n_humans_val,Nunique_humans_train,gender_skewness_train,...,Nunique_humans_val,gender_skewness_val,n_women_val,mean_age_women_val,std_age_women_val,n_men_val,mean_age_men_val,std_age_men_val,age_min,age_max
0,0,n01440764,"tench, Tinca tinca",http://wordnet-rdf.princeton.edu/wn30/01440764-n,1300,50,779,22,794,-2.187542,...,22,-1.649916,4.0,24.25,6.849574,18.0,38.611111,8.513926,18.0,59.0
1,1,n01443537,"goldfish, Carassius auratus",http://wordnet-rdf.princeton.edu/wn30/01443537-n,1300,50,8,1,8,-0.516398,...,1,0.0,,,,1.0,24.0,,14.0,41.0


In [5]:
# 2: Class-wise statistics (11 parameters, ordered by WordNet ID) obtained from the json files of the DEX paper:
# Dulhanty C, Wong A. Auditing imagenet: Towards a model-driven framework for annotating demographic
# attributes of large-scale image datasets. arXiv preprint arXiv:1905.01347. 2019 May 3.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_audit_age_gender_dex.csv'))
print(df.shape)
df.head(n=2)

(1000, 12)


Unnamed: 0,wordnet_id,n_faces_raw_audit,n_faceswithages_audit,n_train_audit,mean_age_audit,std_age_audit,skew_age_audit,n_faceswithages_audit_2,n_train_audit_2,mean_gender_audit,std_gender_audit,skew_gender_audit
0,n01440764,2824,789,1300,35.576593,13.375839,0.09684,789,1300,0.91187,0.241729,-2.977989
1,n01443537,3307,10,1300,29.425552,15.6903,0.962499,10,1300,0.573201,0.364169,-0.208323


In [6]:
# 3: Per-class mean and std of the NSFW scores of the train and val images.
# (The 'Unnamed: 0' column holds the WordNet ID of the class.)
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_nsfw.csv'))
print(df.shape)
df.head(n=2)

(1000, 5)


Unnamed: 0.1,Unnamed: 0,mean_nsfw_train,mean_nsfw_val,std_nsfw_train,std_nsfw_val
0,n02025239,0.011244,0.007649,0.042218,0.021405
1,n03832673,0.061136,0.042006,0.161923,0.1267


In [7]:
# 4: Class-wise accuracy metrics obtained by running the ResNet50 model on the ImageNet train and val sets.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_acc_classwise_resnet50.csv'))
print(df.shape)
df.head(n=2)

(1000, 7)


Unnamed: 0.1,Unnamed: 0,mean_top1_train,mean_top1_val,mean_top5_train,mean_top5_val,pred_train,pred_val
0,n02025239,0.940769,0.96,0.983077,0.98,[139 139 139 ... 139 139 139],[140 139 139 139 139 139 139 139 139 139 139 1...
1,n03832673,0.53,0.12,0.870769,0.78,[620 620 620 ... 284 532 681],[592 620 620 508 620 633 982 681 662 620 620 6...


In [8]:
# 5: Class-wise accuracy metrics (and the image-level predictions) obtained by running the NasNet model
# on the ImageNet train and val sets.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_acc_classwise_NasNet_mobile.csv'))
print(df.shape)
df.head(n=2)

(1000, 7)


Unnamed: 0.1,Unnamed: 0,mean_top1_train,mean_top1_val,mean_top5_train,mean_top5_val,pred_train,pred_val
0,n02025239,0.946154,0.98,0.986923,0.98,[139 139 139 ... 139 139 139],[139 139 139 139 139 139 139 139 139 139 139 1...
1,n03832673,0.559231,0.38,0.901538,0.8,[681 620 681 ... 332 681 681],[662 620 681 508 620 620 534 681 662 681 681 6...


In [9]:
# 6: Dataframe with the 2D UMAP embeddings of the Glove vectors of the ImageNet class names.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_imagenet_names_umap.csv'))
print(df.shape)
df.head(n=2)

(1000, 5)


Unnamed: 0,class_number,wordnet_id,label,umap_x,umap_y
0,0,n01440764,"tench, Tinca tinca",1.277791,8.856661
1,1,n01443537,"goldfish, Carassius auratus",1.314007,8.63837


In [10]:
# 7: The MAIN census dataframe: class-wise metrics across 61 parameters, all of which are explained
# in df_census_columns_interpretation.csv.
# Kept under its own name (df_census) because later cells compare other frames against it.
df_census = pd.read_csv(os.path.join(csv_dir, 'df_census_imagenet_61.csv'))
print(df_census.shape)
df_census.head(n=2)

(1000, 61)


Unnamed: 0,class_number,wordnet_id,label,uri,n_train,n_val,n_humans_train,n_humans_val,Nunique_humans_train,gender_skewness_train,...,mean_top1_train_nnm,mean_top1_val_nnm,mean_top5_train_nnm,mean_top5_val_nnm,pred_train_nnm,pred_val_nnm,class_number_umap,label_renamed_glove,umap_x,umap_y
0,0,n01440764,"tench, Tinca tinca",http://wordnet-rdf.princeton.edu/wn30/01440764-n,1300,50,779,22,794,-2.187542,...,0.922308,0.86,0.982308,0.92,[ 0 389 0 ... 0 391 0],[ 0 0 0 0 0 30 0 391 0 0 0 ...,0,"tench, Tinca tinca",1.277791,8.856661
1,1,n01443537,"goldfish, Carassius auratus",http://wordnet-rdf.princeton.edu/wn30/01443537-n,1300,50,8,1,8,-0.516398,...,0.943077,0.78,0.976154,0.92,[392 1 1 ... 1 1 1],[ 1 983 1 1 1 1 1 328 1 1 88 ...,1,"goldfish, Carassius auratus",1.314007,8.63837


In [11]:
# 8: The interpretations of the 61 metrics of the census dataframe above.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_census_columns_interpretation.csv'))
print(df.shape)
df.head(n=2)

(61, 2)


Unnamed: 0,col_names_census,Interpretation
0,class_number,ImageNet class number (0-990)
1,wordnet_id,To uniquely identify a synset ImageNet uses Wo...


In [12]:
# Show that the rows of df_census_columns_interpretation line up with the 61 columns of df_census:
# the element-wise comparison yields booleans, so a mean of 1.0 means every name matches in order.
# Relies on `df` still holding the interpretation dataframe loaded in the previous cell.
(df['col_names_census'] == df_census.columns).mean()

1.0

In [13]:
# 9: Dataframe containing the details of the 61 images unearthed via the hand survey.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_hand_survey.csv'))
print(df.shape)
df.head(n=2)

(61, 3)


Unnamed: 0,wordnet_id,category,file_names
0,n03710637,beach_voyeur,ILSVRC2012_val_00021081.JPEG
1,n02837789,beach_voyeur,n02837789_11383.JPEG


# 2: Tiny images

In [14]:
# Dataframe containing class_ind, class_name (WordNet noun) and n_images for each class.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_classes_tiny_images_3.csv'))
print(df.shape)
df.head(n=2)

(75846, 3)


Unnamed: 0,class_ind,class_name,n_images
0,0,a-bomb,2426
1,1,a-horizon,1866


# 3: Dog analysis

In [15]:
# Dataframe containing breed, gender_ratio and the survey result from the paper
# 'Breed differences in canine aggression'.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_dog_analysis.csv'))
print(df.shape)
df.head(n=2)

(7, 4)


Unnamed: 0,Breed,label,gender_ratio,Dog_breedClub
0,EnglishSpringerSpaniel,Welsh springer spaniel,2.958333,0.79
1,GoldenRetriever,golden retriever,1.318182,0.49


In [16]:
# Dog breeds of the 120 dog classes in the ImageNet dataset (synset ID -> breed name).
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'dogs_imagenet.csv'))
print(df.shape)
df.head(n=2)

(120, 2)


Unnamed: 0,ImageNet_Synset,Dog_breed
0,n02110627,Affenpinscher
1,n02088094,Afghan hound


In [17]:
# Mapping from dog breed to AKC dog group for 206 breeds of dogs.
# Load the CSV, report its (rows, columns) shape and preview the first two rows.
df = pd.read_csv(os.path.join(csv_dir, 'df_dog_groups.csv'))
print(df.shape)
df.head(n=2)

(206, 3)


Unnamed: 0.1,Unnamed: 0,breed,dog_group
0,0,Blue Lacy,Herding
1,1,Queensland Heeler,Herding


# 4: Pertaining to the semantics of the imagenet classes:

In [18]:
# ImageNet class labels, hand-renamed for the 8 classes that were repeats or didn't have Glove embeddings.
# (The remaining 992 classes remain the same.)
# Besides the shape, print how many labels differ from those in df_census (loaded in an earlier cell).
df = pd.read_csv(os.path.join(csv_dir, 'df_imagenet_renamed.csv'))
print(df.shape, (df['label'] != df_census['label']).sum())
df.head(n=2)

(1000, 3) 8


Unnamed: 0,class_number,label,wordnet_id
0,0,"tench, Tinca tinca",n01440764
1,1,"goldfish, Carassius auratus",n01443537


In [19]:
# Dataframe with the renamed class names and the associated 2D UMAP embeddings of the 300-D Glove vectors.
# Besides the shape, print how many labels differ from those in df_census (loaded in an earlier cell).
df = pd.read_csv(os.path.join(csv_dir, 'df_imagenet_names_umap.csv'))
print(df.shape, (df['label'] != df_census['label']).sum())
df.head(n=2)

(1000, 5) 8


Unnamed: 0,class_number,wordnet_id,label,umap_x,umap_y
0,0,n01440764,"tench, Tinca tinca",1.277791,8.856661
1,1,n01443537,"goldfish, Carassius auratus",1.314007,8.63837


In [20]:
# A temporary dataframe with the renamed labels, their UMAP locations, class-wise accuracies and NSFW scores.
# Besides the shape, print how many labels differ from those in df_census (loaded in an earlier cell).
df = pd.read_csv(os.path.join(csv_dir, 'df_comb_umap.csv'))
print(df.shape, (df['label'] != df_census['label']).sum())
df.head(n=2)

(1000, 9) 8


Unnamed: 0,class_number,label,wordnet_id,umap_x,umap_y,mean_top1_train_resnet50,mean_top1_val_resnet50,mean_nsfw_train,mean_nsfw_val
0,0,"tench, Tinca tinca",n01440764,1.916009,9.327968,0.906923,0.82,0.073776,0.124612
1,1,"goldfish, Carassius auratus",n01443537,1.882166,8.996173,0.932308,0.8,0.030087,0.052769
