In [1]:
# Imports

import os
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from loguru import logger
from pathlib import Path

# Local imports
sys.path.append(r"./utils")
from utils import utils

Using TensorFlow backend.


In [2]:
# Constants

SAMPLE_DATA = True
DEBUG_MODEL = True

GROUP_NAME = "ethnicity_groups"  # TODO : Changed this for ethnicity segments. 
MODEL_DICT = \
            {"NN": {'epochs': 25} , 
             "LR": {'max_iter': 100}
            }
SELF_REPORTED_COLS = \
           ['age',
            # 'country_canada', 
            # 'country_united kingdom',
            # 'country_united states',
            # 'database_dating',
            # 'database_fb',
            # 'gender',
           ]
DATA_DROP_COLS = \
           ['Unnamed: 0', # index columns
            # 'pol',  # label column
            'gender', # self reported and filtered already
            # 'age', # self-reported  
            'country', # self reported and filtered already
            'userid', # index equivalent column 
            'pol_dat_us', # redundant columns with label
            'pol_dat_ca', # redundant columns with label
            'pol_dat_uk', # redundant columns with label
            'pol_fb_us', # redundant columns with label
            'database', # filtered already 
            'ethnicity.value' # filtered already
            ]
RESULTS_COLS = \
            ["Group Name", 
             "Model", 
             "Feature Set", 
             "Test AUC", 
             "Test Accuracy",
            ]
DATA_DIR = "./data/full/"
RESULTS_DIR = f"./results/full/{GROUP_NAME}/" #TODO: ensure that the folder exists
RESULTS_STATS_FILENAME = GROUP_NAME + '.csv'
RESULTS_MODEL_FILENAME_PREFIX = GROUP_NAME

if SAMPLE_DATA:
  DATA_DIR = "./data/sample/"
  RESULTS_DIR = f"./results/sample/{GROUP_NAME}/"
  ASSERT_DATA_SHAPE_0 = 31742
  ASSERT_DATA_SHAPE_1 = 2092
  DATA_DROP_COLS = DATA_DROP_COLS \
                  + ['Unnamed: 0.1'] #TODO: Regenerate sample with index=False and remove this

if DEBUG_MODEL:
  MODEL_DICT = \
            {"NN": {'epochs': 1} , 
             "LR": {'max_iter': 1}
            }

logger.debug(f"Started the script for {GROUP_NAME}.")

2021-08-18 18:52:28.308 | DEBUG    | __main__:<module>:60 - Started the script for ethnicity_groups.


In [3]:
# Read datasets

folders = os.listdir(DATA_DIR)
dataset_paths = []
for folder in tqdm(folders):
  logger.debug(f"In folder {folder}.")
  csv_files = os.listdir(DATA_DIR + folder)
  for csv in csv_files:
    if '.csv' in csv:
      logger.debug(DATA_DIR + folder + "/" + csv)
      dataset_paths.append(DATA_DIR + folder + "/" + csv)


  0%|          | 0/9 [00:00<?, ?it/s]2021-08-18 18:52:28.343 | DEBUG    | __main__:<module>:6 - In folder UK_0_dating.
2021-08-18 18:52:28.343 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_black.csv
2021-08-18 18:52:28.344 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_asian.csv
2021-08-18 18:52:28.345 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_india.csv
2021-08-18 18:52:28.345 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_white.csv
2021-08-18 18:52:28.345 | DEBUG    | __main__:<module>:6 - In folder Canada_1_dating.
2021-08-18 18:52:28.346 | DEBUG    | __main__:<module>:10 - ./data/sample/Canada_1_dating/segment_canada_1_dating_asian.csv
2021-08-18 18:52:28.346 | DEBUG    | __main__:<module>:10 - ./data/sample/Canada_1_dating/segment_canada_1_dating_india.csv
2021-08-18 18:52:28.347 | DEBUG    

In [4]:
results = []

for dataset_path in dataset_paths:
  data = pd.read_csv(dataset_path)
  
  # Clean the data
  data = data.drop(DATA_DROP_COLS, axis=1)
  data = utils.get_clean_data(data)
  logger.debug(f"Data size after cleaning is {data.shape}")
  data_y = data['pol'].replace({"liberal": 1, "conservative": 0})
  data = data.drop('pol', axis = 1)
  all_features = data.columns

  # Split the data
  X_train, X_test, y_train, y_test = train_test_split(data, data_y, test_size = 0.2, random_state = 2021) 
  del data, data_y

  # which group it is being processed on 
  SEGMENT_NAME = utils.get_dataframe_name(dataset_path)
  logger.debug(f"Started the script for {SEGMENT_NAME}.")
  
  ## Define features for the various settings
  image_cols = list(map(str, range(1, 2049)))
  image_and_self_reported_cols = image_cols + SELF_REPORTED_COLS
  image_and_extracted_cols = [x for x in all_features if x not in SELF_REPORTED_COLS]
  image_and_self_reported_and_extracted_cols = all_features

  if SEGMENT_NAME == "canada_1_dating_india": # this segment does not have any "age" values associated with it.
    ## Define features for the various settings
    image_cols = list(map(str, range(1, 2049)))
    image_and_self_reported_cols = image_cols # + SELF_REPORTED_COLS 
    image_and_extracted_cols = [x for x in all_features if x not in SELF_REPORTED_COLS]
    image_and_self_reported_and_extracted_cols = all_features

  
  data_dict = {
              "Image Features" : image_cols,
              "Image and Self Reported Features" : image_and_self_reported_cols,
              "Image and Extracted Features": image_and_extracted_cols,
              "Image, Self-reported and Extracted Features": image_and_self_reported_and_extracted_cols
              }


  # Fit models and log results
  for data_name, data_set_features in tqdm(data_dict.items()):
    for model_name, model_params in MODEL_DICT.items():
      try:
        save_model_filepath = Path(RESULTS_DIR \
                        + SEGMENT_NAME\
                        + "_" + model_name \
                        + "_" + data_name.replace(" ","_").replace(",","").replace("-","_") \
                        + '.mdl')

        if save_model_filepath.is_file() or save_model_filepath.is_dir():
          logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model already exists.")
          model = utils.read_model(model_name, save_model_filepath)
          logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model read from disk.")
          
        else:
          logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model training started.")
          model = utils.fit_model(model_name,
                                X_train[data_set_features],
                                y_train,
                                model_params = model_params)

          utils.save_model(model, model_name, save_model_filepath)
          logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training ended and model saved.")
        
        auc, acc = utils.get_metrics(model_name,
                                     model,
                                     X_test[data_set_features],
                                     y_test)
                                     
        results.append([GROUP_NAME, model_name, data_name, auc, acc])
        logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training ended. AUC: {auc}, accuracy: {acc}")
      
      except Exception as error:
        logger.exception(error)
        logger.error(f"{SEGMENT_NAME}, {model_name}, {data_name}: Error occured!")

2021-08-18 18:52:29.467 | DEBUG    | __main__:<module>:9 - Data size after cleaning is (1000, 2076)
2021-08-18 18:52:29.474 | DEBUG    | __main__:<module>:20 - Started the script for united kingdom_0_dating_black.
  0%|          | 0/4 [00:00<?, ?it/s]2021-08-18 18:52:29.476 | DEBUG    | __main__:<module>:56 - united kingdom_0_dating_black, NN, Image Features: model already exists.


united kingdom_0_dating_black


2021-08-18 18:52:29.847 | DEBUG    | __main__:<module>:58 - united kingdom_0_dating_black, NN, Image Features: model read from disk.
2021-08-18 18:52:30.098 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image Features: model training ended. AUC: 58.07, accuracy: 63.5
2021-08-18 18:52:30.099 | DEBUG    | __main__:<module>:56 - united kingdom_0_dating_black, LR, Image Features: model already exists.
2021-08-18 18:52:30.101 | DEBUG    | __main__:<module>:58 - united kingdom_0_dating_black, LR, Image Features: model read from disk.
2021-08-18 18:52:30.127 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image Features: model training ended. AUC: 57.8, accuracy: 58.0
 25%|██▌       | 1/4 [00:00<00:01,  1.53it/s]2021-08-18 18:52:30.128 | DEBUG    | __main__:<module>:56 - united kingdom_0_dating_black, NN, Image and Self Reported Features: model already exists.
2021-08-18 18:52:30.427 | DEBUG    | __main__:<module>:58 - united kingdom_0_dating_black, NN, Image and Self Re

united kingdom_0_dating_asian


2021-08-18 18:52:32.654 | DEBUG    | __main__:<module>:58 - united kingdom_0_dating_asian, NN, Image Features: model read from disk.
2021-08-18 18:52:32.887 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image Features: model training ended. AUC: 54.63, accuracy: 49.0
2021-08-18 18:52:32.887 | DEBUG    | __main__:<module>:56 - united kingdom_0_dating_asian, LR, Image Features: model already exists.
2021-08-18 18:52:32.889 | DEBUG    | __main__:<module>:58 - united kingdom_0_dating_asian, LR, Image Features: model read from disk.
2021-08-18 18:52:32.911 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image Features: model training ended. AUC: 64.87, accuracy: 62.0
 25%|██▌       | 1/4 [00:00<00:01,  1.73it/s]2021-08-18 18:52:32.912 | DEBUG    | __main__:<module>:56 - united kingdom_0_dating_asian, NN, Image and Self Reported Features: model already exists.
2021-08-18 18:52:33.297 | DEBUG    | __main__:<module>:58 - united kingdom_0_dating_asian, NN, Image and Self R

united kingdom_0_dating_india
Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 18:52:36.337 | DEBUG    | __main__:<module>:68 - ethnicity_groups, NN, Image Features: model training ended and model saved.
2021-08-18 18:52:36.455 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image Features: model training ended. AUC: 58.88, accuracy: 72.0
2021-08-18 18:52:36.455 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_india, LR, Image Features: model training started.
2021-08-18 18:52:36.578 | DEBUG    | __main__:<module>:68 - ethnicity_groups, LR, Image Features: model training ended and model saved.
2021-08-18 18:52:36.603 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image Features: model training ended. AUC: 67.68, accuracy: 71.0
 25%|██▌       | 1/4 [00:01<00:03,  1.21s/it]2021-08-18 18:52:36.604 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_india, NN, Image and Self Reported Features: model training started.


Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 18:52:37.609 | DEBUG    | __main__:<module>:68 - ethnicity_groups, NN, Image and Self Reported Features: model training ended and model saved.
2021-08-18 18:52:37.719 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image and Self Reported Features: model training ended. AUC: 52.62, accuracy: 72.0
2021-08-18 18:52:37.720 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_india, LR, Image and Self Reported Features: model training started.
2021-08-18 18:52:37.842 | DEBUG    | __main__:<module>:68 - ethnicity_groups, LR, Image and Self Reported Features: model training ended and model saved.
2021-08-18 18:52:37.866 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image and Self Reported Features: model training ended. AUC: 65.05, accuracy: 70.0
 50%|█████     | 2/4 [00:02<00:02,  1.24s/it]2021-08-18 18:52:37.867 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_india, NN, Image and Extracted Features: model training started.


Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 18:52:38.813 | DEBUG    | __main__:<module>:68 - ethnicity_groups, NN, Image and Extracted Features: model training ended and model saved.
2021-08-18 18:52:38.924 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image and Extracted Features: model training ended. AUC: 55.0, accuracy: 72.0
2021-08-18 18:52:38.925 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_india, LR, Image and Extracted Features: model training started.
2021-08-18 18:52:39.058 | DEBUG    | __main__:<module>:68 - ethnicity_groups, LR, Image and Extracted Features: model training ended and model saved.
2021-08-18 18:52:39.082 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image and Extracted Features: model training ended. AUC: 63.67, accuracy: 72.5
 75%|███████▌  | 3/4 [00:03<00:01,  1.23s/it]2021-08-18 18:52:39.083 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_india, NN, Image, Self-reported and Extracted Features: model training started.


Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 18:52:40.044 | DEBUG    | __main__:<module>:68 - ethnicity_groups, NN, Image, Self-reported and Extracted Features: model training ended and model saved.
2021-08-18 18:52:40.157 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image, Self-reported and Extracted Features: model training ended. AUC: 47.3, accuracy: 72.0
2021-08-18 18:52:40.157 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_india, LR, Image, Self-reported and Extracted Features: model training started.
2021-08-18 18:52:40.283 | DEBUG    | __main__:<module>:68 - ethnicity_groups, LR, Image, Self-reported and Extracted Features: model training ended and model saved.
2021-08-18 18:52:40.310 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image, Self-reported and Extracted Features: model training ended. AUC: 63.01, accuracy: 71.5
100%|██████████| 4/4 [00:04<00:00,  1.23s/it]
2021-08-18 18:52:40.801 | DEBUG    | __main__:<module>:9 - Data size after cleaning is (1000, 2076)
2021-08-1

united kingdom_0_dating_white
Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 18:52:41.881 | DEBUG    | __main__:<module>:68 - ethnicity_groups, NN, Image Features: model training ended and model saved.
2021-08-18 18:52:41.999 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image Features: model training ended. AUC: 54.45, accuracy: 54.5
2021-08-18 18:52:42.000 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_white, LR, Image Features: model training started.
2021-08-18 18:52:42.124 | DEBUG    | __main__:<module>:68 - ethnicity_groups, LR, Image Features: model training ended and model saved.
2021-08-18 18:52:42.148 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image Features: model training ended. AUC: 63.17, accuracy: 62.5
 25%|██▌       | 1/4 [00:01<00:04,  1.34s/it]2021-08-18 18:52:42.149 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_white, NN, Image and Self Reported Features: model training started.


Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 18:52:43.200 | DEBUG    | __main__:<module>:68 - ethnicity_groups, NN, Image and Self Reported Features: model training ended and model saved.
2021-08-18 18:52:43.343 | DEBUG    | __main__:<module>:76 - ethnicity_groups, NN, Image and Self Reported Features: model training ended. AUC: 53.79, accuracy: 59.5
2021-08-18 18:52:43.343 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_white, LR, Image and Self Reported Features: model training started.
2021-08-18 18:52:43.466 | DEBUG    | __main__:<module>:68 - ethnicity_groups, LR, Image and Self Reported Features: model training ended and model saved.
2021-08-18 18:52:43.491 | DEBUG    | __main__:<module>:76 - ethnicity_groups, LR, Image and Self Reported Features: model training ended. AUC: 64.96, accuracy: 65.0
 50%|█████     | 2/4 [00:02<00:02,  1.34s/it]2021-08-18 18:52:43.492 | DEBUG    | __main__:<module>:61 - united kingdom_0_dating_white, NN, Image and Extracted Features: model training started.


In [None]:
# Save results summary to disk

save_results_filepath = Path(RESULTS_DIR + RESULTS_STATS_FILENAME)
utils.save_results(results_array = results, 
                     location = save_results_filepath,
                     columns = RESULTS_COLS)
print(pd.DataFrame(results,columns = RESULTS_COLS))
logger.debug(f"Script for {GROUP_NAME} finished.")