In [None]:
import os
import random
import sys
import warnings

import numpy as np
import pandas as pd
from loguru import logger
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# local imports
sys.path.append(r"./utils")
from utils import utils

# Seed every global RNG this notebook can reach so that Restart & Run All
# reproduces the same numbers. random.seed() only covers Python's stdlib RNG;
# scikit-learn falls back to NumPy's global RNG whenever random_state is None,
# so NumPy has to be seeded separately.
random.seed(1234)
np.random.seed(1234)

# Silences *all* warnings for the rest of the session (including any
# convergence or deprecation warnings raised during model fitting).
warnings.filterwarnings('ignore')

In [None]:
# ---- Run switches --------------------------------------------------------
COLAB = False    # run inside Google Colab (mounts Drive in a later cell)
SAMPLE = False   # read from / write to the "sample" directories instead of "full"
DRY_RUN = False  # forwarded to utils.fit_and_get_metrics as dry_run

# ---- Experiment identity -------------------------------------------------
GROUP_NAME = "whole_dataset"
MODEL_LIST = ["NN", "LR"]

# ---- Locations -----------------------------------------------------------
# SAMPLE switches both the input and the output directory together.
DATA_DIR = "./data/sample/" if SAMPLE else "./data/full/"
RESULTS_DIR = "./results/sample/" if SAMPLE else "./results/full/"
RESULTS_STATS_FILENAME = GROUP_NAME + '.csv'
RESULTS_MODEL_FILENAME_PREFIX = GROUP_NAME

logger.debug(f"Started the script for {GROUP_NAME}.")

# ---- Columns removed before modelling ------------------------------------
# Deliberately NOT listed here (i.e. kept in the frame at this stage):
#   'pol'              - label column
#   'gender'           - self reported and filtered already
#   'age'              - self-reported
#   'country'          - self reported and filtered already
#   'database'         - filtered already
#   'ethnicity.value'  - filtered already
DATA_DROP_COLS = [
  'Unnamed: 0',    # index column
  'Unnamed: 0.1',  # index column. TODO: might need to change this for bigger dataset.
  'userid',        # index equivalent column
  'pol_dat_us',    # redundant with the label
  'pol_dat_ca',    # redundant with the label
  'pol_dat_uk',    # redundant with the label
  'pol_fb_us',     # redundant with the label
]

In [None]:
# Google Colab only: mount the user's Google Drive and read the data from the
# shared drive instead of the local ./data directory.
# NOTE(review): this reassigns DATA_DIR after the SAMPLE switch has been
# applied and leaves RESULTS_DIR untouched, so with COLAB and SAMPLE both True
# the data comes from this path while results still go to the sample results
# directory - confirm that is intended.
if COLAB:
  # Imported lazily because google.colab only exists inside a Colab runtime.
  from google.colab import drive
  drive.mount('/content/drive')
    
  #TODO: untested for COLAB == True, more might be needed here
  DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [None]:
# Read datasets
#
# Expected layout: DATA_DIR/<folder>/<file>.csv. Every CSV found one level
# below DATA_DIR is loaded and the frames are stacked row-wise into `data`.

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# `data` (and therefore any seeded train/test split) stable across machines.
folders = sorted(os.listdir(DATA_DIR))
dataframes = []
for folder in tqdm(folders):
  folder_path = os.path.join(DATA_DIR, folder)
  if not os.path.isdir(folder_path):
    # Stray files directly under DATA_DIR (e.g. .DS_Store, a README) would
    # otherwise make os.listdir() below raise NotADirectoryError.
    logger.debug(f"Skipping non-directory entry {folder}.")
    continue
  logger.debug(f"In folder {folder}.")
  csv_files = sorted(os.listdir(folder_path))
  for csv_name in csv_files:
    if '.csv' in csv_name:
      csv_path = os.path.join(folder_path, csv_name)
      logger.debug(csv_path)
      dataframes.append(pd.read_csv(csv_path))

# Fail with a clear message instead of the opaque "No objects to concatenate"
# that pd.concat raises on an empty list.
if not dataframes:
  raise FileNotFoundError(f"No .csv files found in the sub-folders of {DATA_DIR}")

data = pd.concat(dataframes, axis = 0)
del dataframes  # release the per-file frames; only the concatenated copy is needed
print(data.shape)

In [None]:
# Clean the data
#
# Drop the index-like and label-redundant columns. errors='ignore' removes
# every listed column that is present and skips the ones that are not. The
# previous try/except dropped *nothing* as soon as a single column was missing,
# which silently left the label-redundant pol_* columns in the feature set.
data = data.drop(columns=DATA_DROP_COLS, errors='ignore')
data = utils.get_clean_data(data)

# Binary target: liberal -> 1, conservative -> 0.
data_y = data['pol'].replace({"liberal": 1, "conservative": 0})
data = data.drop('pol', axis = 1)
all_features = data.columns

# A fixed random_state makes the split reproducible, so metrics computed
# post-hoc refer to the same train/test rows. The value matches the seed passed
# to random.seed() at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    data_y,
                                                    test_size = 0.2,
                                                    random_state = 1234)

# The full frame is no longer needed once the split exists.
del data, data_y


# Feature groups for each experimental setting.
# Image feature columns are the ones named "1" .. "2048"; 'age' is the only
# self-reported feature; every remaining column counts as "extracted".
image_cols = [str(idx) for idx in range(1, 2049)]
image_and_self_reported_cols = image_cols + ['age']
image_and_extracted_cols = [col for col in all_features if col != "age"]
image_and_self_reported_and_extracted_cols = all_features

# Setting name -> columns used to train and evaluate in that setting.
data_dict = {
  "Image Features": image_cols,
  "Image and Self Reported Features": image_and_self_reported_cols,
  "Image and Extracted Features": image_and_extracted_cols,
  "Image, Self-reported and Extracted Features": image_and_self_reported_and_extracted_cols,
}

# Sanity check: show the non-image (non-numeric-named) columns of each setting
# so the differences between settings are visible at a glance.
for data_name, data_set_features in tqdm(data_dict.items()):
  print([col for col in data_set_features if not col.isnumeric()])


In [None]:
# Fit models and log results
#
# For every (feature setting, model type) pair: train on the training split,
# evaluate on the test split, persist the fitted model and collect one
# [group, model, setting, AUC, accuracy] row. A failing pair is logged and
# skipped so the remaining pairs still run.

# Make sure the output directory exists up front; otherwise every save below
# would fail and the final save_results() would crash after all training is done.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Fit models
results = []
for data_name, data_set_features in tqdm(data_dict.items()):
  # File-name-safe version of the setting name (spaces/hyphens -> "_", commas removed).
  data_tag = data_name.replace(" ","_").replace(",","").replace("-","_")
  for model_name in MODEL_LIST:
    try:
      logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training started.")
      auc, acc, model = utils.fit_and_get_metrics(model_name,
                                                  X_train[data_set_features],
                                                  y_train,
                                                  X_test[data_set_features],
                                                  y_test,
                                                  dry_run = DRY_RUN)
      model_path = RESULTS_DIR \
                   + RESULTS_MODEL_FILENAME_PREFIX \
                   + "_" + model_name \
                   + "_" + data_tag \
                   + '.mdl'
      utils.save_model(model, model_name, model_path)
      results.append([GROUP_NAME, model_name, data_name, auc, acc])
      logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training ended. AUC: {auc}, accuracy: {acc}")
    except Exception:
      # `except Exception` lets KeyboardInterrupt/SystemExit stop the run, and
      # logger.exception records the traceback so the failure can be diagnosed.
      logger.exception(f"{GROUP_NAME}, {model_name}, {data_name}: Error occured!")

utils.save_results(results_array = results, location = RESULTS_DIR + RESULTS_STATS_FILENAME)
logger.debug(f"Script for {GROUP_NAME} finished.")