In [None]:
# Imports

import os
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from loguru import logger
from pathlib import Path

# Local imports
sys.path.append(r"./utils")
from utils import utils

In [None]:
# Constants

# Run-mode switches.
#   COLAB   - run the Google Drive setup cell and read data from the shared drive.
#   SAMPLE  - use the sample data/results directories and check the loaded shape.
#   DRY_RUN - forwarded to utils.fit_and_get_metrics as `dry_run`.
#   DEBUG   - replace the model hyper-parameters with minimal values (see below).
COLAB = False
SAMPLE = False
DRY_RUN = False
DEBUG = True

GROUP_NAME = "whole_dataset"

# Model name -> keyword parameters handed to utils.fit_and_get_metrics.
MODEL_DICT = {
  "NN": {'epochs': 25},
  "LR": {'max_iter': 100},
}

# Columns treated as self-reported features.
SELF_REPORTED_COLS = [
  'age',
  'country_canada',
  'country_united kingdom',
  'country_united states',
  'database_dating',
  'database_fb',
  'gender',
]

# Columns removed before cleaning. Deliberately NOT listed here: 'pol' (the
# label), 'gender', 'age', 'country', 'database' and 'ethnicity.value'
# (self-reported and/or already filtered).
DATA_DROP_COLS = [
  'Unnamed: 0',    # index column
  'Unnamed: 0.1',  # index column. TODO: might need to change this for bigger dataset.
  'userid',        # index equivalent column
  'pol_dat_us',    # redundant with the label
  'pol_dat_ca',    # redundant with the label
  'pol_dat_uk',    # redundant with the label
  'pol_fb_us',     # redundant with the label
]

# Header of the results table, one row per (model, feature set).
RESULTS_COLS = ["Group Name", "Model", "Feature Set", "Test AUC", "Test Accuracy"]

RESULTS_STATS_FILENAME = f"{GROUP_NAME}.csv"
RESULTS_MODEL_FILENAME_PREFIX = GROUP_NAME

# Input/output locations depend on whether the sample or the full data is used.
if SAMPLE:
  DATA_DIR = "./data/sample/"
  RESULTS_DIR = "./results/sample/"
  # Expected (rows, columns) of the sample data, asserted after loading.
  DATA_SHAPE_0, DATA_SHAPE_1 = 31742, 2092
else:
  DATA_DIR = "./data/full/"
  RESULTS_DIR = "./results/full/"

# In debug mode every model gets a single epoch / iteration.
if DEBUG:
  MODEL_DICT = {"NN": {'epochs': 1}, "LR": {'max_iter': 1}}

# Record the start of the run for this group in the log.
logger.debug(f"Started the script for {GROUP_NAME}.")

In [None]:
# Optional Google Colab setup: mount Google Drive and point DATA_DIR at the
# shared-drive copy of the data. Skipped entirely unless COLAB is set.
if COLAB:
  # Imported here so the import is only attempted when COLAB is enabled.
  from google.colab import drive

  drive.mount('/content/drive')

  # TODO: untested for COLAB == True, more might be needed here
  DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [None]:
# Read datasets
#
# Loads every CSV file found one level below DATA_DIR (DATA_DIR/<folder>/<file>.csv)
# and stacks the resulting frames row-wise into `data`.

# Only descend into directories: a stray file directly under DATA_DIR would
# otherwise make os.listdir() raise. Sorting makes the row order of `data`
# independent of the arbitrary order os.listdir() returns entries in.
folders = sorted(
  name for name in os.listdir(DATA_DIR)
  if os.path.isdir(os.path.join(DATA_DIR, name))
)
dataframes = []
for folder in tqdm(folders):
  logger.debug(f"In folder {folder}.")
  folder_path = os.path.join(DATA_DIR, folder)
  for csv_name in sorted(os.listdir(folder_path)):
    # Match on the extension rather than on a substring, so names such as
    # "x.csv.bak" are not read.
    if csv_name.endswith('.csv'):
      csv_path = os.path.join(folder_path, csv_name)
      logger.debug(csv_path)
      dataframes.append(pd.read_csv(csv_path))

# Fail with a clear message instead of the generic error pd.concat raises on
# an empty list.
if not dataframes:
  raise FileNotFoundError(f"No .csv files found in the sub-folders of {DATA_DIR}")

# NOTE(review): the original row labels of each file are kept (no ignore_index),
# so index values may repeat across files - confirm utils.get_clean_data is fine with that.
data = pd.concat(dataframes, axis = 0)
del dataframes  # free the per-file frames; only the concatenated frame is needed
logger.debug(f"Data size is {data.shape}")
if SAMPLE:
  # The sample data has a known shape; anything else means the wrong files were read.
  assert (data.shape[0] == DATA_SHAPE_0) and (data.shape[1] == DATA_SHAPE_1), "Error: data shape is not correct!"

In [None]:
# Clean the data
#
# Drops the index / label-duplicate columns, runs the project cleaning step,
# separates the label and makes an 80/20 train/test split.

# Seed for the train/test split, so that metrics computed post-hoc refer to
# the same test rows as the saved models.
SPLIT_SEED = 42

# errors="ignore" drops every listed column that is present. The previous
# try/except around drop() was all-or-nothing: a single missing column (e.g.
# 'Unnamed: 0.1') meant that no column was dropped, silently leaving 'userid'
# and the label-duplicate 'pol_*' columns in the feature set.
data = data.drop(columns=DATA_DROP_COLS, errors="ignore")
data = utils.get_clean_data(data)

# Encode the label: liberal -> 1, conservative -> 0.
data_y = data['pol'].replace({"liberal": 1, "conservative": 0})
data = data.drop('pol', axis = 1)
all_features = data.columns

X_train, X_test, y_train, y_test = train_test_split(data, data_y,
                                                    test_size = 0.2,
                                                    random_state = SPLIT_SEED)
del data, data_y  # the full frame is no longer needed once the split exists
# See https://stackoverflow.com/questions/53182821/scikit-learn-train-test-split-not-reproducible
# for why the split needs an explicit random_state to be repeatable.


# Feature subsets for the various experimental settings

# Image feature columns are the ones named "1" through "2048".
image_cols = [str(col_number) for col_number in range(1, 2049)]

# Image features followed by the self-reported columns.
image_and_self_reported_cols = [*image_cols, *SELF_REPORTED_COLS]

# Every cleaned column except the self-reported ones.
image_and_extracted_cols = list(
  filter(lambda feature: feature not in SELF_REPORTED_COLS, all_features)
)

# Every cleaned column.
image_and_self_reported_and_extracted_cols = all_features

# Display name of each setting -> columns used as model input.
data_dict = dict([
  ("Image Features", image_cols),
  ("Image and Self Reported Features", image_and_self_reported_cols),
  ("Image and Extracted Features", image_and_extracted_cols),
  ("Image, Self-reported and Extracted Features", image_and_self_reported_and_extracted_cols),
])



In [None]:
# DELETE
#
# The training loop that used to live in this cell has been removed; this cell
# can be deleted from the notebook.
#
# It was a copy of the loop in the "Fit models and log results" cell without
# that cell's try/except. Running the notebook top to bottom trained and saved
# every model here; the next cell then reset `results`, found every model file
# already present (assuming utils.save_model writes to the path that is
# checked), skipped all of them and had no metrics left to save.

In [None]:
# Fit models and log results
#
# For every (feature set, model) pair: skip it when a saved model already
# exists, otherwise fit, save the model and collect its test AUC / accuracy.
# The collected rows are written to the stats file at the end.

# Create the output directory up front so a missing directory is noticed
# before any training time is spent (harmless if it already exists).
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

results = []
for data_name, data_set_features in tqdm(data_dict.items()):
  for model_name, model_params in MODEL_DICT.items():
    # A failure for one combination is logged and must not stop the others.
    try:
      # File-system friendly version of the feature-set display name.
      feature_set_slug = data_name.replace(" ","_").replace(",","").replace("-","_")
      save_model_filepath = Path(f"{RESULTS_DIR}{RESULTS_MODEL_FILENAME_PREFIX}"
                                 f"_{model_name}_{feature_set_slug}.mdl")
      # The saved model may be a file or a directory, so both are checked.
      if save_model_filepath.is_file() or save_model_filepath.is_dir():
        logger.debug("Model already exists. TODO: manually append results.")
      else:
        logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training started.")
        auc, acc, model = utils.fit_and_get_metrics(model_name,
                                                    X_train[data_set_features],
                                                    y_train,
                                                    X_test[data_set_features],
                                                    y_test,
                                                    model_params = model_params,
                                                    dry_run = DRY_RUN)
        utils.save_model(model, model_name, save_model_filepath)
        results.append([GROUP_NAME, model_name, data_name, auc, acc])
        logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training ended. AUC: {auc}, accuracy: {acc}")
    except Exception as error:
      logger.exception(error)
      logger.error(f"{GROUP_NAME}, {model_name}, {data_name}: Error occured!")

save_results_filepath = Path(RESULTS_DIR + RESULTS_STATS_FILENAME)
if not results:
  # Nothing was trained in this run. Writing an empty stats file here would
  # block every later run from saving real results (see the branch below).
  logger.debug("No new results to save.")
elif not save_results_filepath.is_file():
  utils.save_results(results_array = results,
                     location = save_results_filepath,
                     columns = RESULTS_COLS)
else:
  # An existing stats file is never overwritten; show the new rows instead so
  # they can be merged by hand.
  logger.warning("IMPORTANT: Results NOT SAVED. Save the list manually!")
  print(pd.DataFrame(results,columns = RESULTS_COLS))
logger.debug(f"Script for {GROUP_NAME} finished.")