In [None]:
import os
import random 
import pandas as pd
import warnings
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from loguru import logger

# Local imports: put the ./utils directory (relative to the working directory)
# on the module search path before importing the project's helper module.
sys.path.append(r"./utils")
from utils import utils

# Seed Python's built-in `random` module so its draws are repeatable.
# NOTE(review): only `random` is seeded in this cell; NumPy and the NN backend
# are not seeded here — confirm whether `utils` takes care of that.
random.seed(1234)
# Blanket filter: suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# ---- Run configuration -------------------------------------------------------
# Everything a reader might want to tune lives in this cell.

# When True, a later cell mounts Google Drive and reads the data from there.
COLAB = False
# When True, DATA_DIR / RESULTS_DIR are redirected to the sample folders below.
SAMPLE = True
# Forwarded to utils.fit_and_get_metrics as `dry_run`.
# NOTE(review): exact effect is defined in utils — presumably a shortened fit.
DRY_RUN = False

# Root folder that contains one sub-folder per segment.
DATA_DIR = "./data/full/"
# Name of the segment group; used for log messages and output file names.
GROUP_NAME = "Self_Reported_Segments"
# Model identifiers passed to utils.fit_and_get_metrics / utils.save_model.
MODEL_LIST = ["NN", "LR"]

# Folder the fitted models are written to. The leading "./" was previously
# missing (".results/..."), which pointed at a hidden directory and disagreed
# with the sample path and with MODEL_RESULTS.
RESULTS_DIR = "./results/full/" + GROUP_NAME + "/"
# File name of the CSV holding the collected metrics.
RESULTS_STATS_FILENAME = GROUP_NAME + '.csv'
# NOTE(review): not referenced by any cell visible in this notebook.
RESULTS_MODEL_FILENAME_PREFIX = GROUP_NAME
# Accumulates one [segment, model, feature set, AUC, accuracy] row per fit.
RESULTS = []

# Folder the metrics CSV is saved into.
MODEL_RESULTS = './results/Model results/'

# Announce which segment group this notebook run is for.
logger.debug(f"Started the script for {GROUP_NAME}.")

# A sample run reads from, and writes to, the dedicated sample folders.
if SAMPLE:
  DATA_DIR, RESULTS_DIR = (
      "./data/sample/",
      "./results/sample/Self_Reported_Segments/",
  )

# Columns removed from every segment dataframe before modelling.
# Deliberately NOT listed: 'pol' (the label column) and 'age' (self-reported,
# kept as a feature).
DATA_DROP_COLS = (
    # index columns
    # TODO: 'Unnamed: 0.1' might need to change for the bigger dataset.
    ["Unnamed: 0", "Unnamed: 0.1"]
    # self reported and filtered already
    + ["gender", "country"]
    # index equivalent column
    + ["userid"]
    # redundant columns with label
    + ["pol_dat_us", "pol_dat_ca", "pol_dat_uk", "pol_fb_us"]
    # filtered already
    + ["database"]
)

In [None]:
# Optional Google Colab setup: when COLAB is True, mount Google Drive at
# /content/drive and read the data from the shared drive folder below.
if COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
    
  #TODO: untested for COLAB == True, more might be needed here
  # NOTE(review): this assignment replaces whatever DATA_DIR the configuration
  # cell chose, including the SAMPLE override — confirm that is intended.
  DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [4]:
# Train and evaluate every model in MODEL_LIST on every feature set, for each
# segment folder found under DATA_DIR. Each fitted model is saved under
# RESULTS_DIR and its metrics are collected in RESULTS, which is written to
# MODEL_RESULTS + RESULTS_STATS_FILENAME once all segments are done.

# Feature groups that are identical for every segment (hoisted out of the loop).
image_cols = list(map(str, range(1, 2049)))  # column names "1" .. "2048"
self_reported_cols = ['age']
image_and_self_reported_cols = image_cols + self_reported_cols

# Start from an empty list so that re-running this cell does not append a
# second copy of every row to the results saved at the end.
RESULTS.clear()

# List of data folders. os.listdir returns entries in arbitrary order, so sort
# them to get the same processing (and result-row) order on every run.
folders = sorted(os.listdir(DATA_DIR))

for folder in tqdm(folders):

  # "NO FILES" is skipped explicitly — presumably a placeholder entry; verify.
  if folder == "NO FILES":
    continue

  data = utils.get_segment_dataframe(data_dir = DATA_DIR, segment_to_run = folder)

  # Clean the data. errors="ignore" means a column that is absent from this
  # segment does not prevent the remaining ones (notably the label-redundant
  # pol_* columns) from being dropped.
  data = data.drop(columns=DATA_DROP_COLS, errors="ignore")

  data = utils.get_clean_data(data)
  # Binary label: liberal -> 1, conservative -> 0.
  data_y = data['pol'].replace({"liberal": 1, "conservative": 0})
  data = data.drop('pol', axis = 1)
  all_features = data.columns

  X_train, X_test, y_train, y_test = train_test_split(data, data_y, test_size = 0.2, random_state=1234)
  # TODO: check reproducibility of splits so that additional metrics computed post-hoc
  # are consistent. See https://stackoverflow.com/questions/53182821/scikit-learn-train-test-split-not-reproducible

  # The full frame is no longer needed once the split exists.
  del data, data_y

  # Segment-dependent feature groups: everything except the self-reported
  # columns, and every remaining column.
  image_and_extracted_cols = [x for x in all_features if x not in self_reported_cols]
  image_and_self_reported_and_extracted_cols = all_features

  # which group it is being processed on
  SEGMENT_NAME = folder
  logger.debug(f"Started the script for {SEGMENT_NAME}.")

  # Display name of each experimental setting -> columns it trains on.
  data_dict = {
              "Image Features" : image_cols,
              "Image and Self Reported Features" : image_and_self_reported_cols,
              "Image and Extracted Features": image_and_extracted_cols,
              "Image, Self-reported and Extracted Features": image_and_self_reported_and_extracted_cols
              }

  # Fit models
  for data_name, data_set_features in tqdm(data_dict.items()):
    # File-name-safe form of the setting name (spaces/dashes -> "_", commas removed).
    data_tag = data_name.replace(" ","_").replace(",","").replace("-","_")

    for model_name in MODEL_LIST:
      try:
        logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model training started.")
        auc, acc, model = utils.fit_and_get_metrics(model_name,
                                                    X_train[data_set_features],
                                                    y_train,
                                                    X_test[data_set_features],
                                                    y_test,
                                                    dry_run = DRY_RUN)
        model_path = f"{RESULTS_DIR}{SEGMENT_NAME}_{model_name}_{data_tag}.mdl"
        utils.save_model(model, model_name, model_path)
        RESULTS.append([SEGMENT_NAME, model_name, data_name, auc, acc])
        logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model training ended. AUC: {auc}, accuracy: {acc}")
      except Exception:
        # Best effort: one failing combination must not stop the remaining
        # ones, but the traceback is logged so the failure can be diagnosed.
        # KeyboardInterrupt is not an Exception, so the run can still be stopped.
        logger.exception(f"{SEGMENT_NAME}, {model_name}, {data_name}: Error occured!")

utils.save_results(results_array = RESULTS, location = MODEL_RESULTS + RESULTS_STATS_FILENAME)
logger.debug(f"Script for {GROUP_NAME} finished.")
    

Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 15:59:48.849 | DEBUG    | __main__:<module>:63 - US_1_FB, NN, Image and Extracted Features: model training ended. AUC: 62.28, accuracy: 70.38
2021-08-15 15:59:48.849 | DEBUG    | __main__:<module>:50 - US_1_FB, LR, Image and Extracted Features: model training started.
2021-08-15 15:59:54.635 | DEBUG    | __main__:<module>:63 - US_1_FB, LR, Image and Extracted Features: model training ended. AUC: 70.14, accuracy: 74.12
2021-08-15 15:59:54.637 | DEBUG    | __main__:<module>:50 - US_1_FB, NN, Image, Self-reported and Extracted Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:00:04.351 | DEBUG    | __main__:<module>:63 - US_1_FB, NN, Image, Self-reported and Extracted Features: model training ended. AUC: 59.24, accuracy: 74.5
2021-08-15 16:00:04.352 | DEBUG    | __main__:<module>:50 - US_1_FB, LR, Image, Self-reported and Extracted Features: model training started.
2021-08-15 16:00:10.284 | DEBUG    | __main__:<module>:63 - US_1_FB, LR, Image, Self-reported and Extracted Features: model training ended. AUC: 70.39, accuracy: 74.25
100%|██████████| 4/4 [01:04<00:00, 16.13s/it]
 78%|███████▊  | 7/9 [08:58<02:19, 69.76s/it]2021-08-15 16:00:12.332 | DEBUG    | __main__:<module>:37 - Started the script for US_1_dating.
2021-08-15 16:00:12.335 | DEBUG    | __main__:<module>:50 - US_1_dating, NN, Image Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:00:21.964 | DEBUG    | __main__:<module>:63 - US_1_dating, NN, Image Features: model training ended. AUC: 61.75, accuracy: 59.75
2021-08-15 16:00:21.965 | DEBUG    | __main__:<module>:50 - US_1_dating, LR, Image Features: model training started.
2021-08-15 16:00:28.228 | DEBUG    | __main__:<module>:63 - US_1_dating, LR, Image Features: model training ended. AUC: 58.99, accuracy: 57.88
2021-08-15 16:00:28.230 | DEBUG    | __main__:<module>:50 - US_1_dating, NN, Image and Self Reported Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:00:37.493 | DEBUG    | __main__:<module>:63 - US_1_dating, NN, Image and Self Reported Features: model training ended. AUC: 64.94, accuracy: 62.75
2021-08-15 16:00:37.494 | DEBUG    | __main__:<module>:50 - US_1_dating, LR, Image and Self Reported Features: model training started.
2021-08-15 16:00:43.633 | DEBUG    | __main__:<module>:63 - US_1_dating, LR, Image and Self Reported Features: model training ended. AUC: 59.91, accuracy: 58.13
2021-08-15 16:00:43.634 | DEBUG    | __main__:<module>:50 - US_1_dating, NN, Image and Extracted Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:00:53.186 | DEBUG    | __main__:<module>:63 - US_1_dating, NN, Image and Extracted Features: model training ended. AUC: 62.82, accuracy: 59.75
2021-08-15 16:00:53.187 | DEBUG    | __main__:<module>:50 - US_1_dating, LR, Image and Extracted Features: model training started.
2021-08-15 16:00:58.905 | DEBUG    | __main__:<module>:63 - US_1_dating, LR, Image and Extracted Features: model training ended. AUC: 63.49, accuracy: 60.0
2021-08-15 16:00:58.907 | DEBUG    | __main__:<module>:50 - US_1_dating, NN, Image, Self-reported and Extracted Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:01:09.033 | DEBUG    | __main__:<module>:63 - US_1_dating, NN, Image, Self-reported and Extracted Features: model training ended. AUC: 61.41, accuracy: 55.87
2021-08-15 16:01:09.033 | DEBUG    | __main__:<module>:50 - US_1_dating, LR, Image, Self-reported and Extracted Features: model training started.
2021-08-15 16:01:14.732 | DEBUG    | __main__:<module>:63 - US_1_dating, LR, Image, Self-reported and Extracted Features: model training ended. AUC: 63.5, accuracy: 60.0
100%|██████████| 4/4 [01:02<00:00, 15.60s/it]
 89%|████████▉ | 8/9 [10:03<01:08, 68.20s/it]2021-08-15 16:01:16.735 | DEBUG    | __main__:<module>:37 - Started the script for US_0_FB.
2021-08-15 16:01:16.736 | DEBUG    | __main__:<module>:50 - US_0_FB, NN, Image Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:01:26.577 | DEBUG    | __main__:<module>:63 - US_0_FB, NN, Image Features: model training ended. AUC: 69.87, accuracy: 70.88
2021-08-15 16:01:26.578 | DEBUG    | __main__:<module>:50 - US_0_FB, LR, Image Features: model training started.
2021-08-15 16:01:32.835 | DEBUG    | __main__:<module>:63 - US_0_FB, LR, Image Features: model training ended. AUC: 63.54, accuracy: 66.0
2021-08-15 16:01:32.837 | DEBUG    | __main__:<module>:50 - US_0_FB, NN, Image and Self Reported Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:01:42.361 | DEBUG    | __main__:<module>:63 - US_0_FB, NN, Image and Self Reported Features: model training ended. AUC: 69.73, accuracy: 70.63
2021-08-15 16:01:42.361 | DEBUG    | __main__:<module>:50 - US_0_FB, LR, Image and Self Reported Features: model training started.
2021-08-15 16:01:48.711 | DEBUG    | __main__:<module>:63 - US_0_FB, LR, Image and Self Reported Features: model training ended. AUC: 63.98, accuracy: 66.25
2021-08-15 16:01:48.713 | DEBUG    | __main__:<module>:50 - US_0_FB, NN, Image and Extracted Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:01:58.660 | DEBUG    | __main__:<module>:63 - US_0_FB, NN, Image and Extracted Features: model training ended. AUC: 63.0, accuracy: 70.25
2021-08-15 16:01:58.661 | DEBUG    | __main__:<module>:50 - US_0_FB, LR, Image and Extracted Features: model training started.
2021-08-15 16:02:04.501 | DEBUG    | __main__:<module>:63 - US_0_FB, LR, Image and Extracted Features: model training ended. AUC: 72.1, accuracy: 71.5
2021-08-15 16:02:04.503 | DEBUG    | __main__:<module>:50 - US_0_FB, NN, Image, Self-reported and Extracted Features: model training started.


Train on 2560 samples, validate on 640 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


2021-08-15 16:02:15.180 | DEBUG    | __main__:<module>:63 - US_0_FB, NN, Image, Self-reported and Extracted Features: model training ended. AUC: 62.42, accuracy: 70.13
2021-08-15 16:02:15.181 | DEBUG    | __main__:<module>:50 - US_0_FB, LR, Image, Self-reported and Extracted Features: model training started.
2021-08-15 16:02:20.934 | DEBUG    | __main__:<module>:63 - US_0_FB, LR, Image, Self-reported and Extracted Features: model training ended. AUC: 72.18, accuracy: 71.88
100%|██████████| 4/4 [01:04<00:00, 16.05s/it]
100%|██████████| 9/9 [11:09<00:00, 74.36s/it]
2021-08-15 16:02:20.951 | DEBUG    | utils.utils:save_results:144 - Results Saved.
2021-08-15 16:02:20.952 | DEBUG    | __main__:<module>:68 - Script for Self_Reported_Segments finished.
