In [1]:
import os
import random 
import pandas as pd
import warnings
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from tqdm import tqdm
from loguru import logger

# local imports
sys.path.append(r"./utils")
from utils import utils

random.seed(1234)
warnings.filterwarnings('ignore')

In [2]:
COLAB = False
SAMPLE = False
DRY_RUN = False
DATA_DIR = "./data/full/"
GROUP_NAME = "whole_dataset"
MODEL_LIST = ["NN", "LR"]
RESULTS_DIR = "./results/"
RESULTS_STATS_FILENAME = GROUP_NAME + '.csv'
RESULTS_MODEL_FILENAME_PREFIX = GROUP_NAME

logger.debug(f"Started the script for{GROUP_NAME}.")

if SAMPLE:
  DATA_DIR = "./data/sample/"

DATA_DROP_COLS = \
           ['Unnamed: 0', # index columns
           'Unnamed: 0.1', # TODO: might need to change this for bigger dataset. ,
              # 'pol',  # label column
              # 'gender', # self reported and filtered already
              # 'age', # self-reported  
              # 'country', # self reported and filtered already
              'userid', # index equivalent column 
              'pol_dat_us', # redundant columns with label
              'pol_dat_ca', # redundant columns with label
              'pol_dat_uk', # redundant columns with label
              'pol_fb_us', # redundant columns with label
              # 'database', # filtered already 
              # 'ethnicity.value' # filtered already
              ]

2021-08-14 22:23:23.414 | DEBUG    | __main__:<module>:11 - Started the script forwhole_dataset.


In [3]:
if COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
    
  #TODO: untested for COLAB == True, more might be needed here
  DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [4]:
# Read datasets

folders = os.listdir(DATA_DIR)
dataframes = []
for folder in tqdm(folders):
  logger.debug(f"In folder {folder}.")
  csv_files = os.listdir(DATA_DIR + folder)
  for csv in csv_files:
    if '.csv' in csv:
      logger.debug(DATA_DIR + folder + "/" + csv)
      df = pd.read_csv(DATA_DIR + folder + "/" + csv)
      dataframes.append(df)

data = pd.concat(dataframes, axis = 0)
del df, dataframes
print(data.shape)

  0%|          | 0/9 [00:00<?, ?it/s]2021-08-14 22:23:23.425 | DEBUG    | __main__:<module>:6 - In folder NO FILES.
2021-08-14 22:23:23.426 | DEBUG    | __main__:<module>:6 - In folder US_1_FB.
2021-08-14 22:23:23.426 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_white.csv
2021-08-14 22:23:37.866 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_asian.csv
2021-08-14 22:23:39.492 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_india.csv
2021-08-14 22:23:40.457 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_black.csv
 22%|██▏       | 2/9 [00:21<01:15, 10.84s/it]2021-08-14 22:23:45.107 | DEBUG    | __main__:<module>:6 - In folder UK_1_dating.
2021-08-14 22:23:45.108 | DEBUG    | __main__:<module>:10 - ./data/full/UK_1_dating/segment_united kingdom_1_dating_white.csv
2021-08-14 22:23:50.969 | DEBUG    | __main__:<module>:10 - ./data/full/UK_1_dating/s

(1085179, 2091)


In [None]:
# Fit models and log results

# Clean the data
try:
  data = data.drop(DATA_DROP_COLS, axis=1)
except:
  pass
data = utils.get_clean_data(data)

# Define features for the various settings
image_cols = list(map(str, range(1, 2049))) + ['pol']
image_and_self_reported_cols = image_cols + ['age']
image_and_extracted_cols = [x for x in data.columns if x != "age"]
image_and_self_reported_and_extracted_cols = data.columns

data_dict = {
"Image Features" : image_cols,
"Image and Self Reported Features" : image_and_self_reported_cols,
"Image and Extracted Features": image_and_extracted_cols,
"Image, Self-reported and Extracted Features": image_and_self_reported_and_extracted_cols
}

# Fit models
results = []
for data_name, data_set_features in tqdm(data_dict.items()):
  for model_name in MODEL_LIST:
    try:
      logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training started.")
      auc, acc, model = utils.fit_and_get_metrics(data[data_set_features], model_name, dry_run = DRY_RUN)
      utils.save_model(model, model_name, RESULTS_DIR \
                       + RESULTS_MODEL_FILENAME_PREFIX \
                       + "_" + model_name \
                       + "_" + data_name.replace(" ","_").replace(",","").replace("-","_") \
                       + '.mdl')
      results.append([GROUP_NAME, model_name, data_name, auc, acc])
      logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training ended. AUC: {auc}, accuracy: {acc}")
    except:
      logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: Error occured!")
  
utils.save_results(results_array = results, location = RESULTS_DIR + RESULTS_STATS_FILENAME)
logger.debug(f"Script for {GROUP_NAME} finished.")

  0%|          | 0/4 [00:00<?, ?it/s]2021-08-14 22:29:07.156 | DEBUG    | __main__:<module>:28 - whole_dataset, NN, Image Features: model training started.
