In [1]:
import os
import random 
import pandas as pd
import warnings
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from tqdm import tqdm

# local imports
sys.path.append(r"./utils")
from utils import utils

# Seed Python's built-in `random` module with a fixed value.
# NOTE(review): this call does not seed NumPy, scikit-learn or the TensorFlow
# backend - confirm whether `utils` seeds those before relying on repeatable results.
random.seed(1234)
# Install a global filter that suppresses every warning raised after this point.
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# ---- Run configuration ----
COLAB = False  # True when the notebook runs on Google Colab
DEBUG = True   # True -> use the small sample dataset instead of the full one

# Destination CSV for the metrics produced by this notebook.
results_file_loc = "results/Model results/whole_dataset.csv"

# Root folder of the input data; the sample folder is used while debugging.
DATA_DIR = "./data/sample/" if DEBUG else "./data/full/"

# Columns removed before modelling.
# Deliberately NOT dropped: 'pol' (label), 'age' (self-reported), and
# 'gender', 'country', 'database', 'ethnicity.value' (already filtered on).
DATA_DROP_COLS = [
    # index / index-equivalent columns
    'Unnamed: 0',
    'Unnamed: 0.1',  # TODO might need to change this for bigger dataset.
    'userid',
    # columns redundant with the label
    'pol_dat_us',
    'pol_dat_ca',
    'pol_dat_uk',
    'pol_fb_us',
]

In [3]:
if COLAB:
    # Colab-only setup: mount Google Drive and point DATA_DIR at the shared-drive
    # data folder.
    # TODO: untested for COLAB == True, more might be needed here
    from google.colab import drive

    drive.mount('/content/drive')
    DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [4]:
# Load every CSV found one level below DATA_DIR and stack them row-wise into `data`.

# os.listdir returns entries in arbitrary order, so sort the listing to get the
# same row order on every run; skip anything in DATA_DIR that is not a directory
# (a stray file there would otherwise make the inner os.listdir raise).
folders = sorted(
    entry for entry in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, entry))
)
dataset_paths = []  # still defined as before; intentionally left unpopulated
dataframes = []
for folder in tqdm(folders):
    folder_path = os.path.join(DATA_DIR, folder)
    for file_name in sorted(os.listdir(folder_path)):
        # Match on the extension itself so names such as "x.csv.bak" are not read.
        if not file_name.endswith('.csv'):
            continue
        csv_path = os.path.join(folder_path, file_name)
        if DEBUG:
            print(csv_path)
        dataframes.append(pd.read_csv(csv_path))

# Fail with a message that names the folder instead of pandas' generic
# "No objects to concatenate" (same exception type as before).
if not dataframes:
    raise ValueError("No .csv files found under {!r}".format(DATA_DIR))

data = pd.concat(dataframes, axis=0)
print(data.shape)

  0%|          | 0/9 [00:00<?, ?it/s]

./data/sample/UK_0_dating/segment_united kingdom_0_dating_black.csv
./data/sample/UK_0_dating/segment_united kingdom_0_dating_asian.csv
./data/sample/UK_0_dating/segment_united kingdom_0_dating_india.csv
./data/sample/UK_0_dating/segment_united kingdom_0_dating_white.csv


 11%|█         | 1/9 [00:01<00:14,  1.87s/it]

./data/sample/Canada_1_dating/segment_canada_1_dating_asian.csv
./data/sample/Canada_1_dating/segment_canada_1_dating_india.csv
./data/sample/Canada_1_dating/segment_canada_1_dating_white.csv
./data/sample/Canada_1_dating/segment_canada_1_dating_black.csv


 22%|██▏       | 2/9 [00:03<00:13,  1.89s/it]

./data/sample/US_0_dating/segment_united states_0_dating_india.csv
./data/sample/US_0_dating/segment_united states_0_dating_asian.csv
./data/sample/US_0_dating/segment_united states_0_dating_white.csv
./data/sample/US_0_dating/segment_united states_0_dating_black.csv


 33%|███▎      | 3/9 [00:05<00:11,  1.97s/it]

./data/sample/Canada_0_dating/segment_canada_0_dating_asian.csv
./data/sample/Canada_0_dating/segment_canada_0_dating_india.csv
./data/sample/Canada_0_dating/segment_canada_0_dating_white.csv
./data/sample/Canada_0_dating/segment_canada_0_dating_black.csv


 56%|█████▌    | 5/9 [00:07<00:05,  1.38s/it]

./data/sample/UK_1_dating/segment_united kingdom_1_dating_black.csv
./data/sample/UK_1_dating/segment_united kingdom_1_dating_asian.csv
./data/sample/UK_1_dating/segment_united kingdom_1_dating_india.csv
./data/sample/UK_1_dating/segment_united kingdom_1_dating_white.csv


 67%|██████▋   | 6/9 [00:09<00:04,  1.49s/it]

./data/sample/US_1_FB/segment_united states_1_fb_white.csv
./data/sample/US_1_FB/segment_united states_1_fb_india.csv
./data/sample/US_1_FB/segment_united states_1_fb_asian.csv
./data/sample/US_1_FB/segment_united states_1_fb_black.csv


 78%|███████▊  | 7/9 [00:11<00:03,  1.67s/it]

./data/sample/US_1_dating/segment_united states_1_dating_india.csv
./data/sample/US_1_dating/segment_united states_1_dating_asian.csv
./data/sample/US_1_dating/segment_united states_1_dating_white.csv
./data/sample/US_1_dating/segment_united states_1_dating_black.csv


 89%|████████▉ | 8/9 [00:13<00:01,  1.80s/it]

./data/sample/US_0_FB/segment_united states_0_fb_india.csv
./data/sample/US_0_FB/segment_united states_0_fb_asian.csv
./data/sample/US_0_FB/segment_united states_0_fb_white.csv
./data/sample/US_0_FB/segment_united states_0_fb_black.csv


100%|██████████| 9/9 [00:15<00:00,  1.75s/it]


(31742, 2092)


In [5]:
# Fit every model on every feature subset of the whole dataset and save the metrics.
results = []
# One entry per failed step: [group_name, data_name, model_name, repr(error)].
problematic_data = []

group_name = "Whole_dataset"

# Columns "1".."2048" plus the label column 'pol'.
only_image_cols = list(map(str, range(1, 2049))) + ['pol']
image_self_reported = only_image_cols + ['age']

# Drop only the configured columns that are still present, so re-running this
# cell (or loading a dataset without one of them) does not raise KeyError.
# Absent columns are reported rather than silently ignored.
present_drop_cols = [col for col in DATA_DROP_COLS if col in data.columns]
missing_drop_cols = [col for col in DATA_DROP_COLS if col not in data.columns]
if missing_drop_cols:
    print("Columns not found, nothing to drop: {}".format(missing_drop_cols))
data = data.drop(present_drop_cols, axis=1)

# NOTE(review): get_clean_data is applied again if this cell is re-run - confirm
# that it is safe to call on already-cleaned data.
data = utils.get_clean_data(data)

# Build the feature subsets. Still best-effort, but a failure is now recorded
# and printed instead of being swallowed.
dataframe_dict = {}
try:
    dataframe_dict = {
        "Only Image Features": data[only_image_cols],
        "Image and Self Reported Features": data[image_self_reported],
        "Image and extracted Features": data.drop("age", axis=1),
        "Image, Self-reported and Extracted Features": data,
    }
except Exception as err:
    problematic_data.append([group_name, "feature selection", None, repr(err)])
    print("Could not build feature subsets: {!r}".format(err))

for data_name, data_set in dataframe_dict.items():
    for model_name in ["NN", "LR"]:
        # Guard each fit on its own so one failing combination does not discard
        # the remaining ones. `Exception` (not a bare except) lets
        # KeyboardInterrupt stop the run.
        try:
            auc, acc = utils.fit_and_get_metrics(data_set, model_name)
        except Exception as err:
            problematic_data.append([group_name, data_name, model_name, repr(err)])
            print("FAILED {} / {}: {!r}".format(data_name, model_name, err))
            continue
        results.append([group_name, model_name, data_name, auc, acc])

if problematic_data:
    print("{} step(s) failed; see problematic_data".format(len(problematic_data)))

utils.save_results(results_array=results, location=results_file_loc)
