In [1]:
import os
import random 
import pandas as pd
import warnings
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from tqdm import tqdm
from loguru import logger

# local imports
sys.path.append(r"./utils")
from utils import utils

random.seed(1234)
warnings.filterwarnings('ignore')

In [2]:
COLAB = False
SAMPLE = True
DRY_RUN = False
DATA_DIR = "./data/full/"
GROUP_NAME = "whole_dataset"
MODEL_LIST = ["NN", "LR"]
RESULTS_DIR = "./results/"
RESULTS_STATS_FILENAME = GROUP_NAME + '.csv'
RESULTS_MODEL_FILENAME_PREFIX = GROUP_NAME
logger.debug(f"Script for {GROUP_NAME}")


if SAMPLE:
  DATA_DIR = "./data/sample/"

DATA_DROP_COLS = \
           ['Unnamed: 0', # index columns
           'Unnamed: 0.1', # TODO: might need to change this for bigger dataset. ,
              # 'pol',  # label column
              # 'gender', # self reported and filtered already
              # 'age', # self-reported  
              # 'country', # self reported and filtered already
              'userid', # index equivalent column 
              'pol_dat_us', # redundant columns with label
              'pol_dat_ca', # redundant columns with label
              'pol_dat_uk', # redundant columns with label
              'pol_fb_us', # redundant columns with label
              # 'database', # filtered already 
              # 'ethnicity.value' # filtered already
              ]

2021-08-14 21:32:52.917 | DEBUG    | __main__:<module>:10 - Script for whole_dataset


In [3]:
if COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
    
  #TODO: untested for COLAB == True, more might be needed here
  DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [4]:
# Read datasets

folders = os.listdir(DATA_DIR)
dataframes = []
for folder in tqdm(folders):
  logger.debug(f"In folder {folder}.")
  csv_files = os.listdir(DATA_DIR + folder)
  for csv in csv_files:
    if '.csv' in csv:
      logger.debug(DATA_DIR + folder + "/" + csv)
      df = pd.read_csv(DATA_DIR + folder + "/" + csv)
      dataframes.append(df)

data = pd.concat(dataframes, axis = 0)
print(data.shape)

  0%|          | 0/9 [00:00<?, ?it/s]2021-08-14 21:32:52.934 | DEBUG    | __main__:<module>:6 - In folder NO FILES.
2021-08-14 21:32:52.935 | DEBUG    | __main__:<module>:6 - In folder US_1_FB.
2021-08-14 21:32:52.935 | DEBUG    | __main__:<module>:10 - ./data/sample/US_1_FB/segment_united states_1_fb_white.csv
2021-08-14 21:32:53.335 | DEBUG    | __main__:<module>:10 - ./data/sample/US_1_FB/segment_united states_1_fb_asian.csv
2021-08-14 21:32:53.704 | DEBUG    | __main__:<module>:10 - ./data/sample/US_1_FB/segment_united states_1_fb_india.csv
2021-08-14 21:32:54.084 | DEBUG    | __main__:<module>:10 - ./data/sample/US_1_FB/segment_united states_1_fb_black.csv
 22%|██▏       | 2/9 [00:01<00:05,  1.32it/s]2021-08-14 21:32:54.452 | DEBUG    | __main__:<module>:6 - In folder UK_1_dating.
2021-08-14 21:32:54.453 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_1_dating/segment_united kingdom_1_dating_white.csv
2021-08-14 21:32:54.827 | DEBUG    | __main__:<module>:10 - ./data/sample/U

(31742, 2092)


In [None]:
# Fit models and log results

# Define features
only_image_cols = list(map(str, range(1, 2049))) + ['pol']
image_self_reported = only_image_cols + ['age']

# Clean the data
try:
  data = data.drop(DATA_DROP_COLS, axis=1)
except:
  pass
data = utils.get_clean_data(data)

# Fit models
results = []
try:
  dataframe_dict = {
  "Image Features" : data[only_image_cols],
  "Image and Self Reported Features" : data[image_self_reported],
  "Image and Extracted Features": data.drop("age",axis = 1),
  "Image, Self-reported and Extracted Features": data
  }

  for data_name, data_set in tqdm(dataframe_dict.items()):
    for model_name in MODEL_LIST:
      auc, acc, model = utils.fit_and_get_metrics(data_set, model_name, dry_run = DRY_RUN)
      utils.save_model(model, model_name, RESULTS_DIR \
                       + RESULTS_MODEL_FILENAME_PREFIX \
                       + "_" + model_name \
                       + "_" + data_name.replace(" ","_").replace(",","").replace("-","_") \
                       + '.mdl')
      results.append([GROUP_NAME, model_name, data_name, auc, acc])
      logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}, {auc}, {acc}")
except:
  pass
  
utils.save_results(results_array = results, location = RESULTS_DIR + RESULTS_STATS_FILENAME)
logger.debug(f"Script for {GROUP_NAME} finished.")

  0%|          | 0/4 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: ./results/whole_dataset_NN_Image_Features.mdl/assets


2021-08-14 21:33:53.620 | DEBUG    | __main__:<module>:33 - whole_dataset, NN, Image Features, 68.55, 68.5
