In [None]:
# Mount Google Drive so the shared-drive paths used below are reachable
# from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the shared project folder. This happens
# before `from utils import ...` below; presumably `utils.py` lives in this
# folder and is resolved from the working directory -- TODO confirm.
import os 
os.chdir("/content/drive/Shareddrives/Facial Recognition/whole dataset/")

# Project-local helper; called later as
# `get_accuracy(data=..., model=...)` and unpacked into three values.
from utils import get_accuracy

from sklearn.linear_model import LogisticRegression
# NOTE(review): roc_curve is not used in any cell visible here.
from sklearn.metrics import roc_curve

# NOTE(review): `os` is already imported above; this second import is a no-op.
import os
import random 
# Seeds only Python's built-in `random` module (not NumPy / scikit-learn).
random.seed(1234)
import pandas as pd

# Silences every warning for the rest of the session, including any
# convergence warnings a model might emit.
import warnings
warnings.filterwarnings('ignore')


Mounted at /content/drive


# Getting the data

In [None]:
def clean_data(data):
  """Drop bookkeeping/leakage columns and impute missing values.

  Steps:
    1. Remove index-like columns and the columns that are redundant with
       the label. The label ('pol'), the self-reported columns ('age',
       'gender', 'country') and the already-filtered columns ('database',
       'ethnicity.value') are deliberately kept.
    2. Fill missing values: text columns (e.g. gender) get the placeholder
       "<unk>"; every other column gets its own mean.
    3. Drop any column that still contains NaN afterwards (for example a
       completely empty numeric column, whose mean is itself NaN).

  Args:
    data: pandas DataFrame with the raw records. It is not modified.

  Returns:
    A new DataFrame without the dropped columns and without missing values.
  """
  drop_col = ['Unnamed: 0',  # index column written by a previous to_csv
              'userid',      # index-equivalent column
              'pol_dat_us',  # redundant with the label
              'pol_dat_ca',  # redundant with the label
              'pol_dat_uk',  # redundant with the label
              'pol_fb_us',   # redundant with the label
              ]

  # errors="ignore": a file saved without its index (no 'Unnamed: 0'), or an
  # already-cleaned frame, must not abort the whole pipeline with a KeyError.
  data = data.drop(columns=drop_col, errors="ignore")

  # Resolve the list of incomplete columns once, before any of them changes.
  for col in data.columns[data.isna().any()]:
    column = data[col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
      # A mean is undefined for text (Series.mean() raises TypeError there),
      # so categorical gaps become an explicit "unknown" category instead.
      data[col] = column.fillna("<unk>")
    else:
      data[col] = column.fillna(column.mean())

  # Whatever is still incomplete could not be imputed; remove those columns.
  data = data.drop(columns=data.columns[data.isna().any()].tolist())

  return data

In [None]:
# Default destination of the logistic-regression results table.
DEFAULT_RESULTS_FILE = "/content/drive/Shareddrives/Facial Recognition/whole dataset/results/LR_complete.csv"


def save_results(arr, results_file_loc=DEFAULT_RESULTS_FILE):
  """Write the collected evaluation rows to a CSV file.

  Args:
    arr: iterable of rows, each `[feature-set name, test AUC, test accuracy]`.
    results_file_loc: destination CSV path. Defaults to the shared-drive
      location used by this notebook, so existing calls keep working.

  Returns:
    None. The table is written to disk and a confirmation is printed.
  """
  results_df = pd.DataFrame(arr, columns=["Features", "Test AUC", "Test Accuracy"])

  # to_csv does not create missing folders; make sure the target one exists
  # so a finished (and possibly long) run is not lost at the very last step.
  out_dir = os.path.dirname(results_file_loc)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  results_df.to_csv(results_file_loc, index=False)
  print("Results Saved !!")


In [None]:
data_directory = "/content/drive/Shareddrives/Facial Recognition/data/"

# Read every CSV stored one level below `data_directory` (one sub-folder per
# batch) and stack them into a single frame.
frames = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined frame identical from run to run.
for folder in sorted(os.listdir(data_directory)):
  folder_path = os.path.join(data_directory, folder)
  if not os.path.isdir(folder_path):
    # stray files next to the sub-folders cannot be listed; skip them
    continue
  for csv_name in sorted(os.listdir(folder_path)):
    if csv_name.endswith(".csv"):
      df = pd.read_csv(os.path.join(folder_path, csv_name))
      frames.append(df)

if not frames:
  raise FileNotFoundError("No CSV files found below " + data_directory)

# ignore_index: every file restarts its row index at 0, so without it the
# combined frame would carry duplicated index labels.
data = pd.concat(frames, axis=0, ignore_index=True)

In [None]:
# drop columns and missing value handling
# Clean the concatenated frame `data`; `df` only holds the last CSV read by
# the loading loop, so cleaning `df` would discard every other file.
data = clean_data(data)

In [None]:
# Number of image-embedding columns; they are named "1" ... "2048" in `data`.
N_IMAGE_FEATURES = 2048


def _one_hot_encode(frame):
  """Return `frame` with its object-dtype columns replaced by dummy columns."""
  cat_cols = frame.select_dtypes(include=['object']).columns
  dummies = pd.get_dummies(frame[list(cat_cols)])
  return pd.concat([frame.drop(cat_cols, axis=1), dummies], axis=1)


# label
label_df = data['pol']

# 1) label + image features only
image_cols = [str(i) for i in range(1, N_IMAGE_FEATURES + 1)]
image_feature_df = data[image_cols]
img_df = pd.concat([label_df, image_feature_df], axis=1)

# 2) label + image features + self-reported features (one-hot encoded)
SR_df = _one_hot_encode(data[['age', 'gender', 'country']])
img_SR_df = pd.concat([img_df, SR_df], axis=1)

# 3) label + every column that is neither self-reported nor the label
#    (this still contains the image columns), one-hot encoded
extracted_df = pd.concat(
    [label_df, _one_hot_encode(data.drop(['age', 'gender', 'country', 'pol'], axis=1))],
    axis=1)

# 4) whole dataset: label + extracted + self-reported features.
# The label is kept as the first column, as in the three frames above.
# NOTE(review): get_accuracy lives in utils and is not visible here; if it
# locates the label by position, this ordering matters -- confirm.
EX_SR_df = pd.concat([extracted_df, SR_df], axis=1)

In [None]:
# L1-penalised logistic regression. The saga solver shuffles the data, so it
# needs its own random_state to be reproducible: random.seed(1234) at the top
# of the notebook only seeds Python's `random` module, not scikit-learn.
lr = LogisticRegression(penalty='l1', solver="saga", random_state=1234)

In [None]:
# Collects one [feature-set name, test AUC, test accuracy] row per evaluation.
results = []

In [None]:
# Evaluate the same classifier on each feature set and save the scores.
feature_sets = [
    ("Only Image Features", img_df),
    ("Image and Self-reported Features", img_SR_df),
    ("Image and Extracted Features", extracted_df),
    ("Image, SR and Extracted Features", EX_SR_df),
]

# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every row to the saved table.
results = []
for feature_name, feature_df in feature_sets:
  # getting accuracies
  feature_auc, feature_acc, feature_samples = get_accuracy(data=feature_df, model=lr)
  print(feature_auc, feature_acc, feature_samples, sep=" | ")
  results.append([feature_name, feature_auc, feature_acc])

# saving the results
save_results(results)

64.0 | 60.0 | 16992
64.0 | 59.0 | 16992
68.0 | 63.0 | 16992
63.0 | 60.0 | 16992
Results Saved !!
