In [70]:
from google.colab import drive
drive.mount('/content/drive')

import os 
os.chdir("/content/drive/Shareddrives/Facial Recognition/exp_variation_self_report_vs_segments")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
import warnings
warnings.filterwarnings('ignore')

In [72]:
from utils import get_dataframe
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense

import pandas as pd

from sklearn import metrics

import random 
random.seed(1234)

# Preparation of Data 

In [73]:
# merging the segments

""" 
Format country_gender_database
"""

segments = ['Canada_1_dating',
 'Canada_0_dating',
 'UK_0_dating',
 'UK_1_dating',
 'US_1_FB',
 'US_1_dating',
 'US_0_FB',
 'US_0_dating',
#  'NO FILES'
 ]

segment_to_run = "Canada_0_dating"


# Get the data frame 

In [74]:
data = get_dataframe(segment_to_run = "Canada_0_dating")

# Prepare Data 

In [75]:
def clean_data(data):
    # data planning 

    drop_col = ['Unnamed: 0', # index columns
                # 'pol',  # label column
                'gender', # self reported and filtered already
                # 'age', # self-reported  
                'country', # self reported and filtered already
                'userid', # index equivalent column 
                'pol_dat_us', # redundant columns with label
                'pol_dat_ca', # redundant columns with label
                'pol_dat_uk', # redundant columns with label
                'pol_fb_us', # redundant columns with label
                'database', # filtered already 
                # 'ethnicity.value' # filtered already
                ]

    data = data.drop(drop_col,axis =1)

    """# data cleaning """

    # gender replacing with <unk> 
    # replacing other columns with mean values of the age and 5 big personality traits 
    
    # mean value imputing 
    for col in data.loc[:, data.isna().any()].columns:
      data[col] = data[col].fillna(data[col].mean())

    # in case there are columns which has all NAN values still 
    for col in data.loc[:, data.isna().any()].columns:
      data = data.drop(col,axis =1)

    return data

In [76]:
# dropping unnecessary columns 
data = clean_data(data)

# Preparing dataframes 

In [77]:
label_df = data['pol']

# image features 
image_feature_df = data[list(map(str, range(1,2049)))]
img_df = pd.concat([label_df,image_feature_df], axis =1)

# image and self reported 
SR_df = data[['age']]
img_SR_df = pd.concat([img_df, SR_df ], axis =1)

# image and extracted features 
extracted_df = data.drop([ 'age','pol'], axis =1)
cat_cols = extracted_df.select_dtypes(include=['object']).columns
if len(cat_cols):
  num_df = pd.get_dummies(extracted_df[list(cat_cols)])
extracted_df = pd.concat([label_df,extracted_df.drop(cat_cols,axis =1),num_df], axis =1)

# extracted features and self_reported (whole dataset)
EX_SR_df = pd.concat([ SR_df, extracted_df ], axis =1)

# instantiate model / Define 

In [78]:
def get_model(dimension_input):
  model = Sequential()
  model.add(Dense(1024, input_dim=dimension_input, activation='relu'))
  model.add(Dense(512, input_dim=1024, activation='relu'))
  model.add(Dense(256, input_dim=512, activation='relu'))
  model.add(Dense(128, input_dim=256, activation='relu'))
  model.add(Dense(60, input_dim=128, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])

  return model

In [79]:
def get_accuracy(data):
  y = data['pol'].replace({'liberal':1,'conservative':0})
  X = data.drop('pol', axis = 1)

  X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2) 

  model = get_model(X.shape[1])
  results = model.fit(epochs=25,x=X_train,y=y_train,batch_size=1000, verbose=0, validation_split=0.2)

  y_pred = model.predict_proba(X_test)
  auc = round(metrics.roc_auc_score(y_test,y_pred),2)*100

  _, acc = model.evaluate(X_test, y_test,batch_size=1000, verbose=0)

  return auc, round(acc*100,2), data.shape[0]

In [80]:
results = [] # instantiate list to store results 

In [81]:
# getting accuracies 
feature_auc, feature_acc, feature_samples = get_accuracy(data=img_df)
print(feature_auc, feature_acc, feature_samples, sep = " | ")
res = ["Only Image Features",feature_auc, feature_acc, feature_samples]
results.append(res)

70.0 | 64.87 | 25890


In [82]:
results

[['Only Image Features', 70.0, 64.87, 25890]]

In [83]:

# getting accuracies 
feature_auc, feature_acc, feature_samples = get_accuracy(data =img_SR_df)
print(feature_auc, feature_acc, feature_samples, sep = " | ")
res = ["Image and Self-reported Features",feature_auc, feature_acc, feature_samples]
results.append(res)

72.0 | 65.31 | 25890


In [84]:
# getting accuracies 
feature_auc, feature_acc, feature_samples = get_accuracy(data =extracted_df)
print(feature_auc, feature_acc, feature_samples, sep = " | ")
res = ["Image and Extracted Features",feature_auc, feature_acc, feature_samples]
results.append(res)

69.0 | 63.65 | 25890


In [85]:
# getting accuracies 
feature_auc, feature_acc, feature_samples = get_accuracy(data =EX_SR_df)
print(feature_auc, feature_acc, feature_samples, sep = " | ")
res = ["Image, SR and Extracted Features",feature_auc, feature_acc, feature_samples]
results.append(res)

68.0 | 59.48 | 25890


In [86]:
results_df = pd.DataFrame(results, columns = ["Features","Test AUC","Test Accuracy","Samples"])
model_name = "NN_"
results_file_loc = "/content/drive/Shareddrives/Facial Recognition/exp_variation_self_report_vs_segments/results/" + model_name + segment_to_run + ".csv"
results_df.to_csv(results_file_loc, index=False)

In [87]:
results_df

Unnamed: 0,Features,Test AUC,Test Accuracy,Samples
0,Only Image Features,70.0,64.87,25890
1,Image and Self-reported Features,72.0,65.31,25890
2,Image and Extracted Features,69.0,63.65,25890
3,"Image, SR and Extracted Features",68.0,59.48,25890
