# IMPLICIT SEXUAL DISCRIMINATION
### DESCRIPTION
I use the Harvard test IAT to explore potential predictors of implicit sexual discrimination. The IAT association test is a cognitive test used to measure implicit discrimination through associatons (e.g.: associate good/bad inputs with gay/straight inputs) as time reactions. For a deeper explanation, [click here](https://implicit.harvard.edu/implicit/takeatest.html)
### RESEARCH QUESTION
Which individual characteristics predict implicit sexual discrimination? The IAT tests asks the respondents to fill some information about their believes, politics orientations, explicit discrimination and so on. I use these information to predict the measure of implicit discrimination (time reaction to the association test).
### HOW:
I will use:
- 2021 DATASET AS TRAINING DATA
- 2020 DATASET AS TEST SET
Data found at [this link](https://implicit.harvard.edu/implicit/takeatest.html)

## DATA CLEANING

#### IMPORT THE DATA AND CLEAN COLUMNS

In [33]:
## import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [34]:
## OPEN DATA
df1 = pd.read_spss('Sexuality_IAT.public.2020.sav')
df2 = pd.read_spss('Sexuality_IAT.public.2021.sav')
df1['test'], df2['test'] = 0, 1

In [4]:
## See missing values in columns
def sort_missing(df):
    ## sort by number of missing values
    missing, keys, values = {}, [], []
    for i in df.columns:
        keys.append(i)
        values.append(df[f'{i}'].isna().sum())
    missing = dict(zip(keys, values))
    missing = pd.DataFrame(missing.values(), missing.keys())
    missing = missing.iloc[1:, :]
    missing.rename(columns = {0: "missing"}, inplace=True)
    missing.sort_values(by="missing", ascending=True, inplace=True)
    missing.reset_index(inplace=True)
    return missing

sort_missing(df1)

Unnamed: 0,index,missing
0,session_status,0
1,previous_session_schema,0
2,user_id,0
3,broughtwebsite,0
4,occuSelfDetail,0
...,...,...
137,pvd010,477153
138,pvd013,477163
139,pvd012,477172
140,pvd014,477195


In [35]:
df.contactmet_num.value_counts

AttributeError: 'DataFrame' object has no attribute 'contactmet_num'

In [5]:
## define test and training
df1['test'], df2['test'] = 0, 1

## choose variables of interes (see codebook)
columns =   ['weekday', 'birthyear', 'num_002', 'birthSex', 'genderIdentity',
            'sexuality_5', 'ethnicityomb', 'raceomb_002', 'raceombmulti', 'D_biep.Straight_Good_all',
            'Mn_RT_all_3467', 'PCT_error_3467', 'Side_Good_34', 'Side_Straight_34',
            'Tgayleswomen', 'Tgaymen', 'Tstraightmen', 'Tstraightwomen', 'att_7',
            'contactfamily_num', 'contactfriend_num', 'contactfriendly_num',
            'contactmet_num', 'adoptchild', 'marriagerights_3num',
            'relationslegal_3num', 'serverights', 'transgender', 'countrycit_num',
            'edu', 'edu_14', 'politicalid_7', 'occuSelf', 'occuSelfDetail',
            'religion2014', 'religionid', 'test']

## merge the datasets
df1 = pd.DataFrame(df1, columns = columns)
df2 = pd.DataFrame(df2, columns = columns)
df = pd.concat([df1, df2])

## lower case
for i in df.columns:
    df.rename(columns = {f"{i}": f"{i.lower()}"}, inplace=True)


In [6]:
## RENAME COLUMNS
df.rename(columns = {"d_biep.straight_good_all": "iat",
                     "birthyear": "y_birth",
                     "att_7": "prefer_straight",
                     "politicalid_7": "liberal",
                     "religionid": "religious",
                     "genderidentity": "gn_id",
                     "side_straight_34": "straight_first",
                     'contactfamily_num': "fam. member",
                     'contactfriend_num': "friend",
                     'contactfriendly_num': "friendly",
                     'contactmet_num': "met gay",
                     'adoptchild': "adoption",
                     'marriagerights_3num': "marriage",
                     'relationslegal_3num': "relation",
                     'serverights': "work"},
          inplace=True)

#### PREPARE VARIABLES

In [7]:
#Keep only those who do the test for the first time
df['num_002'].unique()
df = df.loc[df['num_002'] == str(1)]
df.drop(['num_002'], axis = 1, inplace = True)

In [8]:
## Transform string to integer
def KeepNumber(col):
    col = col.str.extract('(\d+)')
    return col

num = ['tgayleswomen', 'tgaymen', 'tstraightmen', 'tstraightwomen']
for i in num:
    df[i] = KeepNumber(df[i])

In [9]:
## Generate row means of variables (might add pca)
def row_mean(subset, sample):
    newcol = subset.sum(axis=1) / sample
    return newcol

## gender feeling: being confortable with gay people
df['gender_feel'] = row_mean(df.loc[:, ['tgayleswomen', 'tgaymen', 'tstraightmen', 'tstraightwomen']], 4)

## gender prejudice: explicit gendere prejudice
#df['gender_preg'] = row_mean(df.loc[:, ['adoptchild', 'marriagerights_3num', 'relationslegal_3num',
#                                    'serverights', 'transgender', 'countrycit_num']], 6)

df['gender_feel'] = df['gender_feel'].astype(float)

In [10]:
## Var: Sex assignet at birth
def CleanBirthSex(col):
    col = col.replace({1: "Male", 2: "Female"},
                      inplace = True)
    return col

CleanBirthSex(df['birthsex'])

In [11]:
## RE-CATEGORIZE VARIABLES VALUES AND CLEAN
## give names to categorical values
#df = df.astype('object')

gn = []
for a, b in df['gn_id'].iteritems():
    if len(b)==5 and b[0] =='[' or len(b)==7 and b[0] =='[' or len(b)==9 and b[0] =='[' or len(b)==11 and b[0] =='[' or len(b) == 13 and b[0] == '[':
        gn.append(b[1])
    else:
        gn.append(b)

df['gn_id'] = gn

In [12]:
## Transform gender identity to dummy: binary/non binary
df['gn_id'].replace({'[1]':'M', '[2]':'F', '[3]': 'Trans_M',
                     '[4]': 'Trans_F', '[5]': 'queer', '[6]': 'other',
                     '1':'M', '2':'F', '3': 'Trans_M',
                     '4': 'Trans_F', '5': 'queer', '6': 'other', "": np.nan },
                    inplace=True)
list(df['gn_id'].unique())

['F', 'M', 'queer', 'other', 'Trans_M', nan, 'Trans_F']

In [13]:
def CleanGender1(x):
    if x in ['M', 'F']:
        return 'binary'
    else:
        return 'non_binary'
df['gn_id'] = list(map(CleanGender1, df['gn_id']))

In [14]:
## Var: sexuality to dummy: Straight/non Straight
df.sexuality_5.unique()
def Sexuality(col):
    col.replace({'Heterosexual or Straight': 'Yes',
                 'Bisexual':'No',
                 'Other': 'No',
                 'Lesbian or Gay': 'No'},
                inplace=True)
    return col

df['sexuality_5'] = Sexuality(df['sexuality_5'])
df.rename(columns={'sexuality_5': 'straight'}, inplace=True)

In [15]:
## Var: Attitude towards gay people at work (Against or not)
df.work.unique()
def CleanWork(col):
    col.replace({'Should not be legal': 'Against',
                 'Should be legal':'Pro/neutral',
                 'No opinion': 'Pro/neutral'},
                inplace=True)
    return col
df['work'] = CleanWork(df['work'])

In [16]:
## Var: Trans people should use bathroom rooms of sex assigner at birth: against or not
df.transgender.unique()

def CleanTransgender(col):
    col.replace({'Transgender people should use the bathroom/locker rooms of the sex they were assigned '
                 'at birth': 'Against',
                 'Transgender people should use the bathrooms/locker rooms of their preferred gender identity':'Pro'},
                inplace=True)
    return col
df['transgender'] = CleanTransgender(df['transgender'])

In [17]:
## Var: Marriage between gay people: Against or not
df.marriage.unique()
def CleanMarriage(col):
    col.replace({'Should not be valid': 'Against',
                 'Should be valid': 'Pro/neutral',
                 'No opinion': 'Pro/neutral'},
                inplace=True)
    return col
df['marriage'] = CleanMarriage(df['marriage'])

In [18]:
## Var: Adoption for gay people: Against or not
def CleanAdoption(col):
    col.replace({'Should not be legal': 'Against',
                 'Should be legal': 'Pro/neutral',
                 'No opinion': 'Pro/neutral'},
                inplace=True)
    return col
df['adoption'] = CleanAdoption(df['adoption'])

In [19]:
## Var: Relation between gay people: Against or not
def CleanRelation(col):
    col.replace({'Should not be legal': 'Against',
                 'Should be legal': 'Pro/neutral',
                 'No opinion': 'Pro/neutral'},
                inplace=True)
    return col
df['relation'] = CleanRelation(df['relation'])

In [20]:
## Var: Ethnicity (create categories)
df.raceomb_002.unique()
df.raceomb_002.value_counts()

White                                        125799
Black or African American                     13138
Other or unknown                              10163
Multiracial                                    9208
East Asian                                     7151
South Asian                                    5791
American Indian/Alaska Native                  1410
Native Hawaiian or other Pacific Islander      1081
Name: raceomb_002, dtype: int64

In [21]:
def CleanRace(col):
    col = col.replace({'American Indian/Alaska Native': 'Other or unknown',
                       'Native Hawaiian or other Pacific Islander':'Other or unknown'})
    return col
df['raceomb_002'] = CleanRace(df['raceomb_002'])
df.rename(columns={'raceomb_002': 'race'}, inplace=True)

In [22]:
## Var: Prefer gay or straight people (numerical)
def CleanPrefStraight(col):
    col = col.replace({'I strongly prefer Straight People to Gay People.': 7,
                       'I moderately prefer Straight People to Gay People.': 6,
                       'I slightly prefer Straight People to Gay People.': 5,
                       'I like Straight People and Gay People equally.': 4,
                       'I slightly prefer Gay People to Straight People.': 3,
                       'I moderately prefer Gay People to Straight People.': 2,
                       'I strongly prefer Gay People to Straight People.': 1}
                      )
    return col
df['prefer_straight'] = CleanPrefStraight(df['prefer_straight'])

In [23]:
df.prefer_straight.value_counts()

4    104541
3     19137
5     17040
2     10763
6      9670
7      9643
1      7645
Name: prefer_straight, dtype: int64

In [24]:
## Var: Political orientation: Liberal vs Conservative (numerical)
def CleanPolitics(col):
    col = col.replace({'strongly liberal': 7,
                       'moderately liberal': 6,
                       'slightly liberal': 5,
                       'neutral': 4,
                       'slightly conservative': 3,
                       'moderately conservative': 2,
                       'strongly conservative': 1}
                      )
    return col
df['liberal'] = CleanPolitics(df['liberal'])
df['liberal'] = df['liberal'].astype('float32')

In [25]:
## Var: Religious (numerical)
def CleanReligion(col):
    col = col.replace({'strongly religious': 4,
                       'moderately religious': 3,
                       'slightly religious': 2,
                       'not at all religious': 1}
                      )
    return col
df['religious'] = CleanReligion(df['religious'])
df['religious'] = df['religious'].astype('float32')

In [26]:
## Choose final columns
final_columns = ['y_birth', 'birthsex', 'gn_id', 'straight', 'race', 'iat', 'straight_first',
                 'prefer_straight', 'fam. member', 'friend', 'friendly',
                 'met gay', 'edu', 'liberal', 'religious', 'gender_feel',
                 "adoption", "marriage", "relation", "work", "transgender", 'test']

df = df[final_columns]

In [27]:
## drop null values
df = df.dropna()

#### Split categorical and numerical variables

In [28]:
## numerical and categorical variables
numerical_columns_selector = ['iat', 'y_birth', 'prefer_straight', 'edu', 'liberal', 'religious']

categorical_columns_selector = ['birthsex', 'gn_id', 'straight', 'race', 'straight_first',
                                'fam. member', 'friend', 'friendly', 'met gay', "adoption",
                                "marriage", "relation", "work", "transgender", 'test']

numerical_columns = df.loc[:, numerical_columns_selector]
categorical_columns = df.loc[:, categorical_columns_selector]

In [29]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## categorical columns: transform into dummies
categorical_columns = pd.get_dummies(categorical_columns, drop_first=True)

## numerical columns: standardize
num_names = list(numerical_columns.columns)
numerical_columns = StandardScaler().fit_transform(numerical_columns)
numerical_columns = pd.DataFrame(numerical_columns)
numerical_columns.columns = num_names

In [30]:
## Merge numerical and categorical and save dataframe
df = numerical_columns.merge(categorical_columns, left_index=True, right_index=True)
df = df.dropna()
df.head(10)
df.to_csv('df_ready.csv')
print("done")