In [1]:
# Imports

import os
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from loguru import logger
from pathlib import Path

# Local imports
sys.path.append(r"./utils")
from utils import utils

Using TensorFlow backend.


In [2]:
# Constants

SAMPLE_DATA = True
DEBUG_MODEL = True

GROUP_NAME = "ethnicity_groups"  # TODO : Changed this for ethnicity segments. 
MODEL_DICT = \
            {"NN": {'epochs': 25} , 
             "LR": {'max_iter': 100}
            }
SELF_REPORTED_COLS = \
           ['age',
            # 'country_canada', 
            # 'country_united kingdom',
            # 'country_united states',
            # 'database_dating',
            # 'database_fb',
            # 'gender',
           ]
DATA_DROP_COLS = \
           ['Unnamed: 0', # index columns
            # 'pol',  # label column
            'gender', # self reported and filtered already
            # 'age', # self-reported  
            'country', # self reported and filtered already
            'userid', # index equivalent column 
            'pol_dat_us', # redundant columns with label
            'pol_dat_ca', # redundant columns with label
            'pol_dat_uk', # redundant columns with label
            'pol_fb_us', # redundant columns with label
            'database', # filtered already 
            'ethnicity.value' # filtered already
            ]
RESULTS_COLS = \
            ["Group Name", 
             "Model", 
             "Feature Set", 
             "Test AUC", 
             "Test Accuracy",
            ]
DATA_DIR = "./data/full/"
RESULTS_DIR = f"./results/full/{GROUP_NAME}/" #TODO: ensure that the folder exists
RESULTS_STATS_FILENAME = GROUP_NAME + '.csv'
RESULTS_MODEL_FILENAME_PREFIX = GROUP_NAME

if SAMPLE_DATA:
  DATA_DIR = "./data/sample/"
  RESULTS_DIR = f"./results/sample/{GROUP_NAME}/"
  ASSERT_DATA_SHAPE_0 = 31742
  ASSERT_DATA_SHAPE_1 = 2092
  DATA_DROP_COLS = DATA_DROP_COLS \
                  + ['Unnamed: 0.1'] #TODO: Regenerate sample with index=False and remove this

if DEBUG_MODEL:
  MODEL_DICT = \
            {"NN": {'epochs': 1} , 
             "LR": {'max_iter': 1}
            }

logger.debug(f"Started the script for {GROUP_NAME}.")

2021-08-18 19:04:47.713 | DEBUG    | __main__:<module>:60 - Started the script for ethnicity_groups.


In [3]:
# Read datasets

folders = os.listdir(DATA_DIR)
dataset_paths = []
for folder in tqdm(folders):
  logger.debug(f"In folder {folder}.")
  csv_files = os.listdir(DATA_DIR + folder)
  for csv in csv_files:
    if '.csv' in csv:
      logger.debug(DATA_DIR + folder + "/" + csv)
      dataset_paths.append(DATA_DIR + folder + "/" + csv)


  0%|          | 0/9 [00:00<?, ?it/s]2021-08-18 19:04:47.747 | DEBUG    | __main__:<module>:6 - In folder UK_0_dating.
2021-08-18 19:04:47.748 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_black.csv
2021-08-18 19:04:47.748 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_asian.csv
2021-08-18 19:04:47.749 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_india.csv
2021-08-18 19:04:47.749 | DEBUG    | __main__:<module>:10 - ./data/sample/UK_0_dating/segment_united kingdom_0_dating_white.csv
2021-08-18 19:04:47.749 | DEBUG    | __main__:<module>:6 - In folder Canada_1_dating.
2021-08-18 19:04:47.750 | DEBUG    | __main__:<module>:10 - ./data/sample/Canada_1_dating/segment_canada_1_dating_asian.csv
2021-08-18 19:04:47.750 | DEBUG    | __main__:<module>:10 - ./data/sample/Canada_1_dating/segment_canada_1_dating_india.csv
2021-08-18 19:04:47.751 | DEBUG    

In [4]:
results = []

for dataset_path in dataset_paths:
  data = pd.read_csv(dataset_path)
  
  # Clean the data
  data = data.drop(DATA_DROP_COLS, axis=1)
  data = utils.get_clean_data(data)
  logger.debug(f"Data size after cleaning is {data.shape}")
  data_y = data['pol'].replace({"liberal": 1, "conservative": 0})
  data = data.drop('pol', axis = 1)
  all_features = data.columns

  # Split the data
  X_train, X_test, y_train, y_test = train_test_split(data, data_y, test_size = 0.2, random_state = 2021) 
  del data, data_y

  # which group it is being processed on 
  SEGMENT_NAME = utils.get_dataframe_name(dataset_path)
  logger.debug(f"Started the script for {SEGMENT_NAME}.")
  
  ## Define features for the various settings
  image_cols = list(map(str, range(1, 2049)))
  image_and_self_reported_cols = image_cols + SELF_REPORTED_COLS
  image_and_extracted_cols = [x for x in all_features if x not in SELF_REPORTED_COLS]
  image_and_self_reported_and_extracted_cols = all_features

  data_dict = {
                "Image Features" : image_cols,
                "Image and Self Reported Features" : image_and_self_reported_cols,
                "Image and Extracted Features": image_and_extracted_cols,
                "Image, Self-reported and Extracted Features": image_and_self_reported_and_extracted_cols
                }

  if SEGMENT_NAME == "canada_1_dating_india": # this segment does not have any "age" values associated with it.
    ## Define features for the various settings
    image_cols = list(map(str, range(1, 2049)))
    # image_and_self_reported_cols = image_cols + SELF_REPORTED_COLS 
    image_and_extracted_cols = [x for x in all_features if x not in SELF_REPORTED_COLS]
    image_and_self_reported_and_extracted_cols = all_features

  
    data_dict = {
                "Image Features" : image_cols,
                # "Image and Self Reported Features" : image_and_self_reported_cols,
                "Image and Extracted Features": image_and_extracted_cols,
                "Image, Self-reported and Extracted Features": image_and_self_reported_and_extracted_cols
                }


  # Fit models and log results
  for data_name, data_set_features in tqdm(data_dict.items()):
    for model_name, model_params in MODEL_DICT.items():
      try:
        save_model_filepath = Path(RESULTS_DIR \
                        + SEGMENT_NAME\
                        + "_" + model_name \
                        + "_" + data_name.replace(" ","_").replace(",","").replace("-","_") \
                        + '.mdl')

        if save_model_filepath.is_file() or save_model_filepath.is_dir():
          logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model already exists.")
          model = utils.read_model(model_name, save_model_filepath)
          logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model read from disk.")
          
        else:
          logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model training started.")
          model = utils.fit_model(model_name,
                                X_train[data_set_features],
                                y_train,
                                model_params = model_params)

          utils.save_model(model, model_name, save_model_filepath)
          logger.debug(f"{GROUP_NAME}, {model_name}, {data_name}: model training ended and model saved.")
        
        auc, acc = utils.get_metrics(model_name,
                                     model,
                                     X_test[data_set_features],
                                     y_test)
                                     
        results.append([SEGMENT_NAME, model_name, data_name, auc, acc])
        logger.debug(f"{SEGMENT_NAME}, {model_name}, {data_name}: model training ended. AUC: {auc}, accuracy: {acc}")
      
      except Exception as error:
        logger.exception(error)
        logger.error(f"{SEGMENT_NAME}, {model_name}, {data_name}: Error occured!")

2021-08-18 19:04:48.851 | DEBUG    | __main__:<module>:9 - Data size after cleaning is (1000, 2076)
2021-08-18 19:04:48.857 | DEBUG    | __main__:<module>:20 - Started the script for united kingdom_0_dating_black.
  0%|          | 0/4 [00:00<?, ?it/s]2021-08-18 19:04:48.859 | DEBUG    | __main__:<module>:62 - united kingdom_0_dating_black, NN, Image Features: model already exists.
2021-08-18 19:04:49.229 | DEBUG    | __main__:<module>:64 - united kingdom_0_dating_black, NN, Image Features: model read from disk.
2021-08-18 19:04:49.475 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_black, NN, Image Features: model training ended. AUC: 51.74, accuracy: 61.5
2021-08-18 19:04:49.476 | DEBUG    | __main__:<module>:62 - united kingdom_0_dating_black, LR, Image Features: model already exists.
2021-08-18 19:04:49.478 | DEBUG    | __main__:<module>:64 - united kingdom_0_dating_black, LR, Image Features: model read from disk.
2021-08-18 19:04:49.504 | DEBUG    | __main__:<module>:82

Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 19:04:50.476 | DEBUG    | __main__:<module>:74 - ethnicity_groups, NN, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:04:50.599 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_black, NN, Image and Self Reported Features: model training ended. AUC: 55.12, accuracy: 61.5
2021-08-18 19:04:50.600 | DEBUG    | __main__:<module>:67 - united kingdom_0_dating_black, LR, Image and Self Reported Features: model training started.
2021-08-18 19:04:50.727 | DEBUG    | __main__:<module>:74 - ethnicity_groups, LR, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:04:50.751 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_black, LR, Image and Self Reported Features: model training ended. AUC: 68.2, accuracy: 64.5
 50%|█████     | 2/4 [00:01<00:02,  1.00s/it]2021-08-18 19:04:50.753 | DEBUG    | __main__:<module>:62 - united kingdom_0_dating_black, NN, Image and Extracted Features: model already 

Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 19:04:54.554 | DEBUG    | __main__:<module>:74 - ethnicity_groups, NN, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:04:54.674 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_asian, NN, Image and Self Reported Features: model training ended. AUC: 58.81, accuracy: 55.0
2021-08-18 19:04:54.675 | DEBUG    | __main__:<module>:67 - united kingdom_0_dating_asian, LR, Image and Self Reported Features: model training started.
2021-08-18 19:04:54.793 | DEBUG    | __main__:<module>:74 - ethnicity_groups, LR, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:04:54.817 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_asian, LR, Image and Self Reported Features: model training ended. AUC: 67.14, accuracy: 62.5
 50%|█████     | 2/4 [00:02<00:02,  1.17s/it]2021-08-18 19:04:54.818 | DEBUG    | __main__:<module>:62 - united kingdom_0_dating_asian, NN, Image and Extracted Features: model already

Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 19:04:58.108 | DEBUG    | __main__:<module>:74 - ethnicity_groups, NN, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:04:58.223 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_india, NN, Image and Self Reported Features: model training ended. AUC: 48.18, accuracy: 71.5
2021-08-18 19:04:58.224 | DEBUG    | __main__:<module>:67 - united kingdom_0_dating_india, LR, Image and Self Reported Features: model training started.
2021-08-18 19:04:58.346 | DEBUG    | __main__:<module>:74 - ethnicity_groups, LR, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:04:58.369 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_india, LR, Image and Self Reported Features: model training ended. AUC: 66.56, accuracy: 69.5
 50%|█████     | 2/4 [00:01<00:01,  1.03it/s]2021-08-18 19:04:58.370 | DEBUG    | __main__:<module>:62 - united kingdom_0_dating_india, NN, Image and Extracted Features: model already

Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 19:05:01.550 | DEBUG    | __main__:<module>:74 - ethnicity_groups, NN, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:05:01.657 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_white, NN, Image and Self Reported Features: model training ended. AUC: 53.26, accuracy: 54.5
2021-08-18 19:05:01.657 | DEBUG    | __main__:<module>:67 - united kingdom_0_dating_white, LR, Image and Self Reported Features: model training started.
2021-08-18 19:05:01.781 | DEBUG    | __main__:<module>:74 - ethnicity_groups, LR, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:05:01.803 | DEBUG    | __main__:<module>:82 - united kingdom_0_dating_white, LR, Image and Self Reported Features: model training ended. AUC: 62.72, accuracy: 62.5
 50%|█████     | 2/4 [00:01<00:01,  1.03it/s]2021-08-18 19:05:01.804 | DEBUG    | __main__:<module>:62 - united kingdom_0_dating_white, NN, Image and Extracted Features: model already

Train on 640 samples, validate on 160 samples
Epoch 1/1


2021-08-18 19:05:05.086 | DEBUG    | __main__:<module>:74 - ethnicity_groups, NN, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:05:05.200 | DEBUG    | __main__:<module>:82 - canada_1_dating_asian, NN, Image and Self Reported Features: model training ended. AUC: 45.09, accuracy: 68.5
2021-08-18 19:05:05.200 | DEBUG    | __main__:<module>:67 - canada_1_dating_asian, LR, Image and Self Reported Features: model training started.
2021-08-18 19:05:05.322 | DEBUG    | __main__:<module>:74 - ethnicity_groups, LR, Image and Self Reported Features: model training ended and model saved.
2021-08-18 19:05:05.345 | DEBUG    | __main__:<module>:82 - canada_1_dating_asian, LR, Image and Self Reported Features: model training ended. AUC: 51.35, accuracy: 67.0
 50%|█████     | 2/4 [00:01<00:02,  1.01s/it]2021-08-18 19:05:05.346 | DEBUG    | __main__:<module>:62 - canada_1_dating_asian, NN, Image and Extracted Features: model already exists.
2021-08-18 19:05:05.659

In [None]:
# Save results summary to disk

save_results_filepath = Path(RESULTS_DIR + RESULTS_STATS_FILENAME)
utils.save_results(results_array = results, 
                     location = save_results_filepath,
                     columns = RESULTS_COLS)
print(pd.DataFrame(results,columns = RESULTS_COLS))
logger.debug(f"Script for {GROUP_NAME} finished.")

2021-08-18 19:02:33.705 | DEBUG    | utils.utils:save_results:263 - Results Saved.
2021-08-18 19:02:33.710 | DEBUG    | __main__:<module>:8 - Script for ethnicity_groups finished.


                        Group Name Model  \
0    united kingdom_0_dating_black    NN   
1    united kingdom_0_dating_black    LR   
2    united kingdom_0_dating_black    NN   
3    united kingdom_0_dating_black    LR   
4    united kingdom_0_dating_black    NN   
..                             ...   ...   
187       united states_0_fb_black    LR   
188       united states_0_fb_black    NN   
189       united states_0_fb_black    LR   
190       united states_0_fb_black    NN   
191       united states_0_fb_black    LR   

                                     Feature Set  Test AUC  Test Accuracy  
0                                 Image Features     51.74           61.5  
1                                 Image Features     65.86           62.5  
2                   Image and Extracted Features     37.27           61.0  
3                   Image and Extracted Features     52.51           59.5  
4    Image, Self-reported and Extracted Features     53.53           42.5  
..             