In [28]:
# Mount Google Drive (Colab only) so the shared-drive project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

import os 
# Work from the experiment folder on the shared drive. `utils` is imported right after
# this chdir -- presumably the module lives in this folder (TODO confirm).
os.chdir("/content/drive/Shareddrives/Facial Recognition/exp_variation_self_report_vs_segments")

# Project-local helpers: data loading and model evaluation.
from utils import get_accuracy, get_dataframe

from sklearn.linear_model import LogisticRegression
# NOTE(review): roc_curve is not used in the cells visible here.
from sklearn.metrics import roc_curve

# NOTE(review): `os` is already imported above; this second import is redundant.
import os
import random 
# Seeds only Python's built-in `random` module; NumPy / scikit-learn randomness is
# not affected by this call.
random.seed(1234)
import pandas as pd

import warnings
# Silences every warning category for the rest of the session, which includes
# scikit-learn convergence warnings and pandas deprecation/performance warnings.
warnings.filterwarnings('ignore')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Getting the data

In [29]:
""" 
Format country_gender_database
"""

# Known segment identifiers, formatted as <country>_<gender>_<database>.
segments = [
    'Canada_1_dating',
    'Canada_0_dating',
    'UK_0_dating',
    'UK_1_dating',
    'US_1_FB',
    'US_1_dating',  # The biggest segments
    'US_0_FB',
    'US_0_dating',  # The biggest segments
    # 'NO FILES'
]

# Segment evaluated by this run. This single variable drives both the data that is
# loaded below and the "Segment" value / file name of the saved results, so change
# it here only.
segment_to_run = "Canada_0_dating"

# Fail early on a typo instead of loading nothing (or the wrong thing) later.
if segment_to_run not in segments:
    raise ValueError(
        f"Unknown segment {segment_to_run!r}; expected one of {segments}"
    )

# Join the data frames in the folder for the selected segment.
# (Previously the segment name was hard-coded a second time here, so editing
# `segment_to_run` would have mislabeled the results.)
data = get_dataframe(segment_to_run=segment_to_run)

# CLEAN DATA

In [30]:
def clean_data(data):
    """Drop non-feature columns and remove missing values from a segment frame.

    Steps:
      1. Drop index-like columns, columns the segment was already filtered on,
         and columns that are redundant with the label.
      2. Fill missing values in numeric columns with that column's mean.
      3. Drop every column that still contains a missing value (for example
         columns that are entirely NaN, or non-numeric columns with gaps, for
         which a mean is undefined).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw segment frame. It must contain every column listed in ``drop_col``;
        a missing one raises ``KeyError`` (unchanged from the original behaviour).

    Returns
    -------
    pandas.DataFrame
        A new frame without missing values. The input frame is not modified.
    """
    # 'pol' (label), 'age' (self-reported) and 'ethnicity.value' are deliberately kept.
    drop_col = ['Unnamed: 0',  # index columns
                'gender',      # self reported and filtered already
                'country',     # self reported and filtered already
                'userid',      # index equivalent column
                'pol_dat_us',  # redundant columns with label
                'pol_dat_ca',  # redundant columns with label
                'pol_dat_uk',  # redundant columns with label
                'pol_fb_us',   # redundant columns with label
                'database',    # filtered already
                ]

    data = data.drop(columns=drop_col)

    # Mean imputation, restricted to numeric columns: calling .mean() on a string
    # column raises TypeError, which previously aborted the whole cleaning step.
    # A single vectorised fillna also avoids thousands of per-column assignments.
    cols_with_na = data.columns[data.isna().any()]
    numeric_means = data[cols_with_na].mean(numeric_only=True)
    data = data.fillna(numeric_means)

    # Whatever still has gaps could not be imputed (all-NaN columns have a NaN
    # mean; non-numeric columns have no mean), so drop those columns entirely.
    data = data.dropna(axis=1, how="any")

    return data

In [31]:
# dropping unnecessary columns and imputing mean values 
# NOTE: this rebinds `data` to the cleaned frame, so the cell cannot be re-run on its
# own: a second run asks clean_data to drop columns that are already gone, which
# raises KeyError. Re-run the data-loading cell first.
data = clean_data(data)

# Segment Preparation

In [32]:
# Binary target: liberal -> 1, conservative -> 0.
label_df = data['pol'].replace({'liberal': 1, 'conservative': 0})

# Image features: the columns named '1' ... '2048'.
image_feature_df = data[[str(i) for i in range(1, 2049)]]
img_df = pd.concat([label_df, image_feature_df], axis=1)

# Image and self-reported features ('age' is the only self-reported column used).
SR_df = data[['age']]
img_SR_df = pd.concat([img_df, SR_df], axis=1)

# Image and extracted features: every remaining column except the label and 'age'.
extracted_df = data.drop(['age', 'pol'], axis=1)
cat_cols = extracted_df.select_dtypes(include=['object']).columns
if len(cat_cols):
    # One-hot encode the categorical (object-dtype) columns.
    num_df = pd.get_dummies(extracted_df[list(cat_cols)])
else:
    # No categorical columns: use an empty frame so the concat below still works.
    # Previously `num_df` was left undefined here (NameError on a fresh kernel, or
    # silently a stale frame from an earlier run).
    num_df = pd.DataFrame(index=extracted_df.index)
extracted_df = pd.concat(
    [label_df, extracted_df.drop(cat_cols, axis=1), num_df], axis=1
)

# Extracted features and self-reported (whole dataset).
# The label column is kept first, exactly as in img_df / img_SR_df / extracted_df,
# so all four frames share one layout in case get_accuracy locates the label by
# position -- TODO confirm against utils.get_accuracy.
EX_SR_df = pd.concat([extracted_df, SR_df], axis=1)

In [33]:
"""# training the models and getting the values """
# instantiate models
# L1-penalised logistic regression fitted with the stochastic 'saga' solver.
# 'saga' shuffles the data, so random_state is pinned to make runs repeatable;
# the random.seed(1234) call in the setup cell only seeds Python's `random`
# module, which scikit-learn does not use.
# NOTE(review): max_iter is left at its default; convergence warnings are hidden
# by the global warnings filter, so convergence has not been verified here.
lr = LogisticRegression(penalty='l1', solver="saga", random_state=1234)

In [34]:
# Collects one [feature set, AUC, accuracy, segment] row per experiment cell below.
results = list()

In [35]:
# Experiment 1 - image features only.
# The first two values returned by get_accuracy are recorded as test AUC and test
# accuracy; the third is not needed here.
feature_auc, feature_acc, _ = get_accuracy(model=lr, data=img_df)
res = ["Only Image Features", feature_auc, feature_acc, segment_to_run]
results.append(res)

In [36]:
# Show the rows collected so far.
results

[['Only Image Features', 68.0, 63.0, 'Canada_0_dating']]

In [37]:
# Experiment 2 - image features plus the self-reported column(s).
# The first two values returned by get_accuracy are recorded as test AUC and test
# accuracy; the third is not needed here.
feature_auc, feature_acc, _ = get_accuracy(model=lr, data=img_SR_df)
res = ["Image and Self-reported Features", feature_auc, feature_acc, segment_to_run]
results.append(res)

In [38]:
# Experiment 3 - image features plus the extracted features.
# The first two values returned by get_accuracy are recorded as test AUC and test
# accuracy; the third is not needed here.
feature_auc, feature_acc, _ = get_accuracy(model=lr, data=extracted_df)
res = ["Image and Extracted Features", feature_auc, feature_acc, segment_to_run]
results.append(res)

In [39]:
# Experiment 4 - image, self-reported and extracted features together.
# The first two values returned by get_accuracy are recorded as test AUC and test
# accuracy; the third is kept in `feature_samples`.
feature_auc, feature_acc, feature_samples = get_accuracy(model=lr, data=EX_SR_df)
res = ["Image, SR and Extracted Features", feature_auc, feature_acc, segment_to_run]
results.append(res)

In [40]:
# Saving the results: one row per feature set evaluated above.
results_df = pd.DataFrame(
    results, columns=["Features", "Test AUC", "Test Accuracy", "Segment"]
)
model_name = "LR_"
results_dir = "/content/drive/Shareddrives/Facial Recognition/exp_variation_self_report_vs_segments/results"
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(results_dir, exist_ok=True)
# File name pattern: <model_name><segment>.csv, e.g. LR_Canada_0_dating.csv
results_file_loc = os.path.join(results_dir, model_name + segment_to_run + ".csv")
results_df.to_csv(results_file_loc, index=False)

In [41]:
# Display the results table that was just written to disk.
results_df

Unnamed: 0,Features,Test AUC,Test Accuracy,Segment
0,Only Image Features,68.0,63.0,Canada_0_dating
1,Image and Self-reported Features,69.0,64.0,Canada_0_dating
2,Image and Extracted Features,71.0,66.0,Canada_0_dating
3,"Image, SR and Extracted Features",70.0,65.0,Canada_0_dating
