In [1]:
import os
import random 
import pandas as pd
import warnings
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from tqdm import tqdm

# local imports
sys.path.append(r"./utils")
from utils import utils

random.seed(1234)
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
COLAB = False
DEBUG = True
DATA_DIR = "./data/full/"
results_file_loc = "results/ethnicity comparison/ethinicity_groups_results.csv"

if DEBUG:
    DATA_DIR = "./data/sample/"

DATA_DROP_COLS = \
           ['Unnamed: 0', # index columns
           'Unnamed: 0.1', # TODO might need to change this for bigger dataset. 
            'gender', # self reported and filtered already
            'country', # self reported and filtered already
            'userid', # index equivalent column 
            'pol_dat_us', # redundant columns with label
            'pol_dat_ca', # redundant columns with label
            'pol_dat_uk', # redundant columns with label
            'pol_fb_us', # redundant columns with label
            'database', # filtered already 
            'ethnicity.value' # filtered already
            ]

In [3]:
if COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
    
  #TODO: untested for COLAB == True, more might be needed here
  DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [4]:
# List of data file paths

folders = os.listdir(DATA_DIR)
dataset_paths = []
for folder in tqdm(folders):
  csv_files = os.listdir(DATA_DIR + folder)
  for csv in csv_files:
    if '.csv' in csv:
      if DEBUG: print(DATA_DIR + folder + "/" + csv)
      dataset_paths.append(DATA_DIR + folder + "/" + csv)

100%|██████████| 9/9 [00:00<00:00, 4039.46it/s]

./data/sample/UK_0_dating/segment_united kingdom_0_dating_black.csv
./data/sample/UK_0_dating/segment_united kingdom_0_dating_asian.csv
./data/sample/UK_0_dating/segment_united kingdom_0_dating_india.csv
./data/sample/UK_0_dating/segment_united kingdom_0_dating_white.csv
./data/sample/Canada_1_dating/segment_canada_1_dating_asian.csv
./data/sample/Canada_1_dating/segment_canada_1_dating_india.csv
./data/sample/Canada_1_dating/segment_canada_1_dating_white.csv
./data/sample/Canada_1_dating/segment_canada_1_dating_black.csv
./data/sample/US_0_dating/segment_united states_0_dating_india.csv
./data/sample/US_0_dating/segment_united states_0_dating_asian.csv
./data/sample/US_0_dating/segment_united states_0_dating_white.csv
./data/sample/US_0_dating/segment_united states_0_dating_black.csv
./data/sample/Canada_0_dating/segment_canada_0_dating_asian.csv
./data/sample/Canada_0_dating/segment_canada_0_dating_india.csv
./data/sample/Canada_0_dating/segment_canada_0_dating_white.csv
./data/sampl




In [6]:
results = []
problematic_data = []
for c, dataset_path in tqdm(enumerate(dataset_paths)):
  data = utils.get_clean_data(dataset_path, DATA_DROP_COLS)

  # which group it is being processed on 
  group_name = utils.get_dataframe_name(dataset_path)
  
  only_image_cols = list(map(str, range(1,2049))) + ['pol']
  image_self_reported = only_image_cols + ['age']

  
  try:
    dataframe_dict = {
    "Only Image Features" : data[only_image_cols],
    "Image and Self Reported Features" : data[image_self_reported],
    "Image and extracted Features":data.drop("age",axis = 1),
    "Image, Self-reported and Extracted Features": data
  }
  
    for data_name, data_set in dataframe_dict.items():
      for model_name in ["NN","LR"]:
        auc, acc = utils.fit_and_get_metrics(data_set, model_name)
        results.append([group_name,model_name,data_name,auc,acc])
  except:
    problematic_data.append([dataset_path,group_name]) # DEBUG to check if the dataframe is off somewhere. 
    

results_df = pd.DataFrame(results, columns = ["Group_Name","Model","feature_set","Test AUC","Test ACC"])

results_df.to_csv(results_file_loc, index=False)
print(" Segment Results Saved !!")

32it [10:32, 19.75s/it]

 Segment Results Saved !!





In [14]:
df = pd.read_csv(results_file_loc)

df.head()

Unnamed: 0,Group_Name,Model,feature_set,Test AUC,Test ACC
0,united kingdom_0_dating_black,NN,Only Image Features,64.25,62.0
1,united kingdom_0_dating_black,LR,Only Image Features,61.62,58.0
2,united kingdom_0_dating_black,NN,Image and Self Reported Features,65.88,68.0
3,united kingdom_0_dating_black,LR,Image and Self Reported Features,63.46,64.0
4,united kingdom_0_dating_black,NN,Image and extracted Features,55.92,61.5
