In [None]:
# Mount Google Drive so the shared-drive paths used in this notebook are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')
# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Run from the experiment folder on the shared drive.
import os 
os.chdir("/content/drive/Shareddrives/Facial Recognition/exp_variation_ethinicity_vs_segments")

# Keras building blocks used by get_model.
from keras.models import Sequential
from keras.layers import Dense

# pandas for the feature tables; scikit-learn for the train/test split and the AUC metric.
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split

# NOTE(review): this seeds only Python's built-in `random` module; nothing in this cell
# seeds NumPy or the Keras backend -- confirm whether run-to-run reproducibility is expected.
import random 
random.seed(1234)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def clean_data(data):
    """Drop bookkeeping columns and return a table without missing values.

    Steps:
      1. Remove the columns listed in ``drop_col``.
      2. Fill gaps in numeric columns with that column's mean.
      3. Drop every column that still contains a missing value (non-numeric
         columns with gaps, or numeric columns with no observed value at all).

    Args:
        data: raw DataFrame; must contain every column named in ``drop_col``.

    Returns:
        A new DataFrame with no missing values. The input frame is not modified.

    Raises:
        KeyError: if one of the columns in ``drop_col`` is absent from ``data``.
    """
    drop_col = ['Unnamed: 0',  # index columns
                # 'pol' is kept: label column
                'gender',  # self reported and filtered already
                # 'age' is kept: self-reported feature
                'country',  # self reported and filtered already
                'userid',  # index equivalent column
                'pol_dat_us',  # redundant columns with label
                'pol_dat_ca',  # redundant columns with label
                'pol_dat_uk',  # redundant columns with label
                'pol_fb_us',  # redundant columns with label
                'database',  # filtered already
                'ethnicity.value'  # filtered already
                ]

    data = data.drop(drop_col, axis=1)

    # Mean-impute numeric columns only. Calling .mean() on a text column raises
    # TypeError, so such columns are left alone here and removed by the drop below.
    data = data.fillna(data.mean(numeric_only=True))

    # Whatever still has gaps cannot be imputed with a mean: drop those columns.
    data = data.drop(data.columns[data.isna().any()].tolist(), axis=1)

    return data

In [None]:
def get_model(dimension_input):
  """Build and compile the fully connected binary classifier.

  The network stacks five ReLU layers (1024, 512, 256, 128, 60 units) followed by a
  single sigmoid unit, and is compiled with binary cross-entropy loss, the 'SGD'
  optimizer and accuracy as the tracked metric.

  Args:
    dimension_input: number of input features, passed as ``input_dim`` of the first layer.

  Returns:
    The compiled Sequential model.
  """
  hidden_units = (1024, 512, 256, 128, 60)
  # Every hidden layer is declared with the width of the layer that feeds it;
  # the first one takes the feature count supplied by the caller.
  incoming_dims = (dimension_input,) + hidden_units[:-1]

  net = Sequential()
  for units, incoming in zip(hidden_units, incoming_dims):
    net.add(Dense(units, input_dim=incoming, activation='relu'))
  net.add(Dense(1, activation='sigmoid'))

  net.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])
  return net

In [None]:
def get_accuracy(data, random_state=None):
  """Train the classifier on one feature table and score it on a held-out split.

  Args:
    data: DataFrame with a 'pol' label column ('liberal' / 'conservative') and one
      column per input feature.
    random_state: optional seed forwarded to ``train_test_split`` so the 80/20 split
      can be repeated. ``None`` (the default) draws a fresh random split on each call.

  Returns:
    A tuple ``(auc, accuracy, n_rows)``: test ROC AUC as a percentage (AUC rounded to
    two decimals, then scaled by 100), test accuracy as a percentage rounded to two
    decimals, and the number of rows in ``data``.

  Raises:
    ValueError: if 'pol' holds a value that cannot be turned into an integer label.
  """
  # Explicit cast: do not rely on `replace` silently downcasting the object column.
  y = data['pol'].replace({'liberal': 1, 'conservative': 0}).astype(int)
  X = data.drop('pol', axis=1)

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=random_state)

  model = get_model(X.shape[1])
  # The training history is not used, so the return value of fit() is discarded.
  model.fit(epochs=25, x=X_train, y=y_train, batch_size=128, verbose=0, validation_split=0.2)

  # The output layer is a sigmoid, so predict() already yields class-1 probabilities.
  # (Sequential.predict_proba was removed from recent TensorFlow/Keras releases.)
  y_pred = model.predict(X_test, verbose=0)

  # round(x, 2) * 100 can produce values such as 56.99999999999999; the outer round
  # removes that floating-point noise without changing the reported precision.
  auc = round(round(metrics.roc_auc_score(y_test, y_pred), 2) * 100, 2)

  _, acc = model.evaluate(X_test, y_test, batch_size=1000, verbose=0)

  return auc, round(acc * 100, 2), data.shape[0]

In [None]:
def save_segment_results(arr, results_file_loc=None):
  """Write the accumulated result rows to a CSV file, overwriting any previous file.

  Args:
    arr: sequence of rows, each ``[feature-set name, test AUC, test accuracy, segment]``.
    results_file_loc: destination CSV path. ``None`` (the default) writes to the
      experiment's results file on the shared drive.
  """
  if results_file_loc is None:
    results_file_loc = "/content/drive/Shareddrives/Facial Recognition/exp_variation_ethinicity_vs_segments/results/NN_ethinicity_vs_segments.csv"

  results_df = pd.DataFrame(arr, columns = ["Features","Test AUC","Test Accuracy","Segment"])

  # Create the destination folder if needed so a missing folder cannot fail the write.
  target_dir = os.path.dirname(results_file_loc)
  if target_dir:
    os.makedirs(target_dir, exist_ok=True)

  results_df.to_csv(results_file_loc, index=False)
  print(" Segment Results Saved !!")

In [None]:
# Root folder: one sub-folder per group, each holding the per-segment CSV files.
data_directory = "/content/drive/Shareddrives/Facial Recognition/data/"

# Column names of the image embedding ('1' .. '2048'); identical for every file.
image_cols = list(map(str, range(1,2049)))

# os.listdir returns entries in arbitrary order; sorting makes the run order (and the
# row order of the saved results) repeatable.
folders = sorted(os.listdir(data_directory))
results = []
for folder in folders:
  folder_path = os.path.join(data_directory, folder)
  if not os.path.isdir(folder_path):
    # A stray file next to the segment folders would make os.listdir raise.
    continue

  csv_files = sorted(name for name in os.listdir(folder_path) if name.lower().endswith(".csv"))
  for csv_name in csv_files:
    file = os.path.join(folder_path, csv_name)
    df = pd.read_csv(file)
    data = clean_data(df)

    label_df = data['pol']

    # image features
    image_feature_df = data[image_cols]
    img_df = pd.concat([label_df,image_feature_df], axis =1)

    # image and self reported
    SR_df = data[['age']]
    img_SR_df = pd.concat([img_df, SR_df ], axis =1)

    # image and extracted features (text columns are one-hot encoded)
    extracted_df = data.drop([ 'age','pol'], axis =1)
    cat_cols = extracted_df.select_dtypes(include=['object']).columns
    num_df = pd.get_dummies(extracted_df[list(cat_cols)])
    extracted_df = pd.concat([label_df,extracted_df.drop(cat_cols,axis =1),num_df], axis =1)

    # extracted features and self_reported (whole dataset)
    EX_SR_df = pd.concat([ SR_df, extracted_df ], axis =1)

    # Segment label taken from the file name.
    # NOTE(review): presumably drops a fixed 8-character prefix and the 4-character
    # extension -- verify against the actual file names in the data folder.
    segment = csv_name[8:-4]

    # One training/evaluation run per feature set, in the order they are reported.
    feature_sets = [
      ("Only Image Features", img_df),
      ("Image and Self-reported Features", img_SR_df),
      ("Image and Extracted Features", extracted_df),
      ("Image, SR and Extracted Features", EX_SR_df),
    ]
    for feature_name, feature_df in feature_sets:
      feature_auc, feature_acc, feature_samples = get_accuracy(data=feature_df)
      print(feature_auc, feature_acc, feature_samples, sep = " | ")
      results.append([feature_name, feature_auc, feature_acc, segment])

    # Save after every file so an interrupted run keeps the segments finished so far.
    save_segment_results(results)

62.0 | 74.82 | 6272
61.0 | 76.89 | 6272
63.0 | 76.57 | 6272
62.0 | 76.02 | 6272
 Segment Results Saved !!
64.0 | 78.97 | 1352


KeyboardInterrupt: ignored