In [1]:
# Imports

import os
import pandas as pd 
import numpy as np
import sys
from tqdm import tqdm
from loguru import logger
from pathlib import Path
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder

# Local imports
sys.path.append(r"./utils")
from utils import utils

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
# Constants

SAMPLE_DATA = False

GROUP_NAME = "EDA"  # TODO : Changed this for ethnicity segments. 
 
DATA_DROP_COLS = \
           ['Unnamed: 0', # index columns
           'userid', # index equivalent column 
            'pol_dat_us', # redundant columns with label
            'pol_dat_ca', # redundant columns with label
            'pol_dat_uk', # redundant columns with label
            'pol_fb_us', # redundant columns with label
            ]

DATA_DIR = "./data/full/"
RESULTS_DIR = f"./results/full/{GROUP_NAME}/" #TODO: ensure that the folder exists

if SAMPLE_DATA:
  DATA_DIR = "./data/sample/"
  RESULTS_DIR = f"./results/sample/{GROUP_NAME}/"
  ASSERT_DATA_SHAPE_0 = 31742
  ASSERT_DATA_SHAPE_1 = 2092
  DATA_DROP_COLS = DATA_DROP_COLS \
                  + ['Unnamed: 0.1'] #TODO: Regenerate sample with index=False and remove this

logger.debug(f"Started the script for {GROUP_NAME}.")

2021-08-18 19:56:20.752 | DEBUG    | __main__:<module>:27 - Started the script for EDA.


In [3]:
# Read datasets

folders = os.listdir(DATA_DIR)
dataframes = []
for folder in tqdm(folders):
  logger.debug(f"In folder {folder}.")
  csv_files = os.listdir(DATA_DIR + folder)
  for csv in csv_files:
    if '.csv' in csv:
      logger.debug(DATA_DIR + folder + "/" + csv)
      df = pd.read_csv(DATA_DIR + folder + "/" + csv)
      dataframes.append(df)

data = pd.concat(dataframes, axis = 0)
del df, dataframes
logger.debug(f"Data size is {data.shape}")
if SAMPLE_DATA: 
  assert (data.shape[0] == ASSERT_DATA_SHAPE_0) and (data.shape[1] == ASSERT_DATA_SHAPE_1), "ERROR: data shape is not correct."

  0%|          | 0/9 [00:00<?, ?it/s]2021-08-18 19:56:20.760 | DEBUG    | __main__:<module>:6 - In folder NO FILES.
2021-08-18 19:56:20.761 | DEBUG    | __main__:<module>:6 - In folder US_1_FB.
2021-08-18 19:56:20.762 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_white.csv
2021-08-18 19:56:35.347 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_asian.csv
2021-08-18 19:56:36.960 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_india.csv
2021-08-18 19:56:37.903 | DEBUG    | __main__:<module>:10 - ./data/full/US_1_FB/segment_united states_1_fb_black.csv
 22%|██▏       | 2/9 [00:21<01:16, 10.88s/it]2021-08-18 19:56:42.514 | DEBUG    | __main__:<module>:6 - In folder UK_1_dating.
2021-08-18 19:56:42.514 | DEBUG    | __main__:<module>:10 - ./data/full/UK_1_dating/segment_united kingdom_1_dating_white.csv
2021-08-18 19:56:48.342 | DEBUG    | __main__:<module>:10 - ./data/full/UK_1_dating/s

In [4]:
# Clean the data
data = data.drop(DATA_DROP_COLS, axis=1)

In [5]:
# converting gender to appropriate label { 0 : " FEMALE", 1 : "MALE"}

data['gender'] = data['gender'].replace({ 0 : " Female", 1 : "Male"})
data['gender.value'] = data['gender.value'].replace({ 0 : " Female", 1 : "male"})


In [21]:
data['ethnicity.value'].isna().any()

False

# Variable Analysis

## For continuous variables

In [11]:
# get all continuous data 
# removing columns with facial feature image_col_names 
image_cols = list(map(str, range(1, 2049)))

non_image_df = data.drop(image_cols  , axis =1)
non_image_df

Unnamed: 0,gender,age,country,facial_hair,pol,ext,neu,ope,agr,con,...,left_eye_status.no_glass_eye_open,left_eye_status.normal_glass_eye_close,left_eye_status.dark_glasses,right_eye_status.normal_glass_eye_open,right_eye_status.no_glass_eye_close,right_eye_status.occlusion,right_eye_status.no_glass_eye_open,right_eye_status.normal_glass_eye_close,right_eye_status.dark_glasses,ethnicity.value
0,MALE,27.79,united states,0.055370,liberal,-0.185271,0.883979,-1.514169,-0.195723,0.960570,...,72.9,0.0,6.2,0.0,0.0,0.0,100.0,0.0,0.0,white
1,MALE,38.13,united states,0.003721,conservative,1.177871,0.009141,-1.192341,-0.888253,-0.256265,...,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,white
2,MALE,27.90,united states,0.068917,liberal,-0.680959,2.196236,1.661204,-2.427208,2.083802,...,99.1,0.0,0.2,2.9,0.0,7.2,89.9,0.0,0.0,white
3,MALE,26.57,united states,0.008976,liberal,0.062573,-0.615743,1.060458,0.342911,-0.092461,...,99.6,0.0,0.1,0.0,0.0,0.1,99.9,0.0,0.0,white
4,MALE,25.85,united states,0.000329,liberal,0.930027,0.134118,0.545532,0.650702,1.030772,...,99.9,0.0,0.0,0.1,0.0,0.0,99.8,0.0,0.1,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068,FEMALE,,united kingdom,0.010739,liberal,,,,,,...,96.1,0.0,0.0,15.6,0.0,0.0,84.4,0.0,0.0,asian
1069,FEMALE,,united kingdom,0.270348,conservative,,,,,,...,99.8,0.0,0.0,5.5,0.0,0.0,94.3,0.2,0.0,asian
1070,FEMALE,,united kingdom,0.012276,liberal,,,,,,...,58.2,0.0,29.5,3.0,7.0,64.4,25.4,0.0,0.1,asian
1071,FEMALE,,united kingdom,0.002215,liberal,,,,,,...,61.1,0.7,0.0,4.1,92.6,0.0,2.8,0.4,0.0,asian


In [12]:
# getting the numeric and non-numeric dataframe. 
numeric_df = non_image_df.select_dtypes(exclude="object")
nonnumeric_df = non_image_df.select_dtypes(include="object")

In [13]:
######## eda for continuous
def con_eda(continuous_df):
    percentile=np.arange(0,1.01,0.05)
    ds1 = continuous_df.describe().loc[['count','mean','std'],:].transpose()
    ds2 = pd.DataFrame(continuous_df.skew())
    ds3 = pd.DataFrame(continuous_df.kurt())
    ds4 = pd.DataFrame(continuous_df.apply(pd.Series.nunique))
    ds5 = pd.DataFrame(continuous_df.isnull().sum()/continuous_df.shape[0])
    ds6 = pd.merge(ds1,ds2, left_index=True, right_index=True)
    ds7 = pd.merge(ds6,ds3, left_index=True, right_index=True)
    ds8 = pd.merge(ds7,ds4, left_index=True, right_index=True)
    ds9 = pd.merge(ds8,ds5, left_index=True, right_index=True)
    ds_final = ds9
    ds_final.columns = ('count','mean', 'std','skewness','kurtosis', "unique", "missing")
    D6 = ds_final
    return D6

In [14]:
eda_n = con_eda(numeric_df)

eda_n

Unnamed: 0,count,mean,std,skewness,kurtosis,unique,missing
age,417656.0,39.714318,13.294348,0.603427,-0.552153,4762,0.615127
facial_hair,1085179.0,0.142296,0.283961,2.2046,3.331151,1059197,0.0
ext,98417.0,0.118342,1.014186,-0.455896,-0.309414,801,0.909308
neu,98417.0,0.001841,1.014039,0.195307,-0.404931,809,0.909308
ope,98417.0,-0.130006,1.083808,-0.627879,0.2421,654,0.909308
agr,98417.0,-0.017235,1.063482,-0.466996,0.087195,710,0.909308
con,98417.0,0.051058,1.02207,-0.212731,-0.338257,739,0.909308
emotion.sadness,1085179.0,3.167643,13.320732,5.409233,30.343648,1001,0.0
emotion.neutral,1085179.0,19.29093,34.732048,1.533205,0.620536,1001,0.0
emotion.disgust,1085179.0,2.787359,12.166708,5.803144,35.546828,1001,0.0


In [15]:
cor_mat = numeric_df.corr()
plt.figure(figsize=(16,12))
sns.heatmap(cor_mat, cmap='RdYlGn')
plt.savefig(RESULTS_DIR+ 'correlation_plot.png')

# Categorical Data 


In [16]:
#eda for categorical
def cat_eda(cat_df):
    df1 = pd.DataFrame(cat_df.describe().transpose())
    df2 = pd.DataFrame(cat_df.isnull().sum())
    df_final = pd.merge(df1,df2, left_index=True, right_index=True)
    df_final.columns = ('count','unique','top','frequency','missing')
    return df_final

eda_c = cat_eda(nonnumeric_df)

In [17]:
eda_c.head()

Unnamed: 0,count,unique,top,frequency,missing
gender,1085179,2,MALE,704019,0
country,1085179,3,united states,970172,0
pol,1085179,2,conservative,547065,0
database,1085179,2,dating,977777,0
gender.value,1085179,2,MALE,684487,0


In [18]:
###################### Chi-sq test ################################

label_encoder = LabelEncoder()

cat_df_1 = nonnumeric_df.copy()
cin = list(cat_df_1.columns)
for i in cin:
    cat_df_1[i] = label_encoder.fit_transform(cat_df_1[i].astype(str))
    

X = cat_df_1.drop('pol',axis=1)
y = cat_df_1['pol']
chi_scores = chi2(X,y)
p_values = pd.Series(chi_scores[1],index = X.columns)
p_values.sort_values(ascending = False , inplace = True)
print(p_values)

country            3.388566e-284
ethnicity.value     0.000000e+00
gender.value        0.000000e+00
database            0.000000e+00
gender              0.000000e+00
dtype: float64
