In [1]:
import os
import random 
import pandas as pd
import warnings
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from tqdm import tqdm

# local imports
sys.path.append(r"../utils")
import utils

# Seed Python's built-in `random` module so draws made through it are repeatable.
# NOTE(review): only the stdlib generator is seeded here; if numpy or
# scikit-learn are the source of randomness they need their own seed — confirm.
random.seed(1234)
# Install a global "ignore" filter: no warnings are shown from here on.
# NOTE(review): this also hides any warnings raised while models are fitted.
warnings.filterwarnings('ignore')

In [2]:
# --- Run-time switches -------------------------------------------------------
COLAB = False   # set to True when the notebook runs on Google Colab
DEBUG = True    # set to True to work on the small sample data set

# DEBUG runs read the sample data; otherwise the full data set is used.
DATA_DIR = "../data/sample/" if DEBUG else "../data/full/"

# Columns removed from every data set before modelling.
# 'pol' (the label) and 'age' (self-reported) are deliberately NOT listed here.
DATA_DROP_COLS = [
    'Unnamed: 0',       # index column
    'gender',           # self reported and filtered already
    'country',          # self reported and filtered already
    'userid',           # index-equivalent column
    'pol_dat_us',       # redundant with the label
    'pol_dat_ca',       # redundant with the label
    'pol_dat_uk',       # redundant with the label
    'pol_fb_us',        # redundant with the label
    'database',         # filtered already
    'ethnicity.value',  # filtered already
]

In [4]:
if COLAB:
    # The Colab helper is imported only on this branch, so a local run
    # (COLAB == False) never needs the google.colab package.
    from google.colab import drive

    # Make Google Drive available under /content/drive.
    drive.mount('/content/drive')

    # TODO: untested for COLAB == True, more might be needed here
    # Point the data directory at the mounted drive.
    DATA_DIR = "/content/drive/Shareddrives/Facial Recognition/data/"

In [5]:
# List of data files
#
# Collect the path of every CSV file located one level below DATA_DIR
# (DATA_DIR/<folder>/<file>.csv) into `dataset_paths`.

folders = os.listdir(DATA_DIR)
dataset_paths = []
for folder in tqdm(folders):
    folder_path = os.path.join(DATA_DIR, folder)

    # A stray file next to the segment folders would make os.listdir()
    # raise NotADirectoryError, so only real directories are scanned.
    if not os.path.isdir(folder_path):
        continue

    # The loop variable keeps the name `csv` because it stays defined after
    # the loop and other cells may read it.
    for csv in os.listdir(folder_path):
        # Match on the extension itself; a substring test would also accept
        # names such as "x.csv.bak".
        if not csv.endswith('.csv'):
            continue

        csv_path = os.path.join(folder_path, csv)
        if DEBUG:
            print(csv_path)
        dataset_paths.append(csv_path)

100%|██████████| 9/9 [00:00<00:00, 2986.21it/s]

../data/sample/US_1_FB/segment_united states_1_fb_white.csv
../data/sample/US_1_FB/segment_united states_1_fb_asian.csv
../data/sample/US_1_FB/segment_united states_1_fb_india.csv
../data/sample/US_1_FB/segment_united states_1_fb_black.csv
../data/sample/UK_1_dating/segment_united kingdom_1_dating_white.csv
../data/sample/UK_1_dating/segment_united kingdom_1_dating_black.csv
../data/sample/UK_1_dating/segment_united kingdom_1_dating_india.csv
../data/sample/UK_1_dating/segment_united kingdom_1_dating_asian.csv
../data/sample/Canada_1_dating/segment_canada_1_dating_india.csv
../data/sample/Canada_1_dating/segment_canada_1_dating_asian.csv
../data/sample/Canada_1_dating/segment_canada_1_dating_black.csv
../data/sample/Canada_1_dating/segment_canada_1_dating_white.csv
../data/sample/Canada_0_dating/segment_canada_0_dating_asian.csv
../data/sample/Canada_0_dating/segment_canada_0_dating_india.csv
../data/sample/Canada_0_dating/segment_canada_0_dating_white.csv
../data/sample/Canada_0_datin




In [2]:
# Train and evaluate an L1-regularised logistic regression on four feature
# sets for every data set in `dataset_paths`. One row
# [description, auc, accuracy, segment] is appended to `results` per feature
# set, and the accumulated rows are saved after each data set.
#
# NOTE(review): get_accuracy and save_segment_results are not defined in the
# cells visible here; they must be in scope before this cell runs — confirm
# where they come from.
class_distribution = []
results = []

# The image feature columns are named '1' .. '2048'; the list does not depend
# on the data set, so it is built once.
image_cols = list(map(str, range(1, 2049)))

# tqdm wraps the list itself (not an enumerate object) so it knows the total
# and can show real progress; no manual counter is needed.
for dataset_path in tqdm(dataset_paths, desc="segments"):
    data = utils.clean_data(dataset_path, DATA_DROP_COLS)

    # Label the rows with the segment of THIS data set, taken from its own
    # file name: 'segment_canada_0_dating_white.csv' -> 'canada_0_dating_white'
    # (drops the 8-character 'segment_' prefix and the '.csv' suffix).
    segment = os.path.basename(dataset_path)[8:-4]

    label_df = data['pol']
    SR_df = data[['age']]  # self-reported feature

    # label + image features
    img_df = pd.concat([label_df, data[image_cols]], axis=1)

    # label + image features + self-reported feature
    img_SR_df = pd.concat([img_df, SR_df], axis=1)

    # label + every non-self-reported column, with the object-typed columns
    # one-hot encoded
    features_df = data.drop(['age', 'pol'], axis=1)
    cat_cols = features_df.select_dtypes(include=['object']).columns
    num_df = pd.get_dummies(features_df[list(cat_cols)])
    extracted_df = pd.concat(
        [label_df, features_df.drop(cat_cols, axis=1), num_df], axis=1
    )

    # self-reported feature + label + extracted features (whole data set)
    # NOTE(review): unlike the other three frames, the label is not the first
    # column here — confirm get_accuracy does not rely on column position.
    EX_SR_df = pd.concat([SR_df, extracted_df], axis=1)

    # The 'saga' solver shuffles the data; random.seed() does not reach
    # scikit-learn, so the seed is passed explicitly to make runs repeatable.
    lr = LogisticRegression(penalty='l1', solver="saga", random_state=1234)

    feature_sets = [
        ("Only Image Features", img_df),
        ("Image and Self-reported Features", img_SR_df),
        ("Image and Extracted Features", extracted_df),
        ("Image, SR and Extracted Features", EX_SR_df),
    ]
    for description, frame in feature_sets:
        # The third return value (sample count) is not used.
        feature_auc, feature_acc, _ = get_accuracy(data=frame, model=lr)
        results.append([description, feature_auc, feature_acc, segment])

    # Save everything gathered so far after each data set.
    save_segment_results(results)