## PREPARING DATASET:
#### 2021 DATASET AS TRAINING DATA
#### 2020 DATASET AS TEST SET

## IMPORT THE DATA AND CLEAN COLUMNS

In [26]:
## import libraries
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


In [27]:
## OPEN DATA
df1 = pd.read_spss('Sexuality_IAT.public.2020.sav')
df2 = pd.read_spss('Sexuality_IAT.public.2021.sav')
df1['test'], df2['test'] = 0, 1

In [14]:
## See missing values in columns
def sort_missing(df):
    ## sort by number of missing values
    missing, keys, values = {}, [], []
    for i in df.columns:
        keys.append(i)
        values.append(df[f'{i}'].isna().sum())
    missing = dict(zip(keys, values))
    missing = pd.DataFrame(missing.values(), missing.keys())
    missing = missing.iloc[1:, :]
    missing.rename(columns = {0: "missing"}, inplace=True)
    missing.sort_values(by="missing", ascending=True, inplace=True)
    missing.reset_index(inplace=True)
    return missing

sort_missing(df1)

Unnamed: 0,index,missing
0,session_status,0
1,previous_session_schema,0
2,user_id,0
3,broughtwebsite,0
4,occuSelfDetail,0
...,...,...
137,pvd010,477153
138,pvd013,477163
139,pvd012,477172
140,pvd014,477195


In [19]:
## define test and training
df1['test'], df2['test'] = 0, 1

## choose variables of interes (see codebook)
columns =   ['weekday', 'birthyear', 'num_002', 'birthSex', 'genderIdentity',
            'sexuality_5', 'ethnicityomb', 'raceomb_002', 'raceombmulti', 'D_biep.Straight_Good_all',
            'Mn_RT_all_3467', 'PCT_error_3467', 'Side_Good_34', 'Side_Straight_34',
            'Tgayleswomen', 'Tgaymen', 'Tstraightmen', 'Tstraightwomen', 'att_7',
            'contactfamily_num', 'contactfriend_num', 'contactfriendly_num',
            'contactmet_num', 'adoptchild', 'marriagerights_3num',
            'relationslegal_3num', 'serverights', 'transgender', 'countrycit_num',
            'edu', 'edu_14', 'politicalid_7', 'occuSelf', 'occuSelfDetail',
            'religion2014', 'religionid']

## merge the datasets
df1 = pd.DataFrame(df1, columns = columns)
df2 = pd.DataFrame(df2, columns = columns)
df = pd.concat([df1, df2])

## clean columns names
df.rename(columns = {"D_biep.Straight_Good_all": "iat",
                         "birthyear": "y_birth",
                         "att_7": "prefer_straight",
                         "politicalid_7": "liberal",
                         "religionid": "religious",},
          inplace=True)

## lower case
for i in df.columns:
    df.rename(columns = {f"{i}": f"{i.lower()}"}, inplace=True)

## PREPARE VARIABLES

In [23]:
## Generate row means of variables (might add pca)
def row_mean(subset, sample):
    newcol = subset.sum(axis=1) / sample
    return newcol

## gender feeling: being confortable with gay people
df['gender_feel'] = row_mean(df.loc[:, ['tgayleswomen', 'tgaymen', 'tstraightmen', 'tstraightwomen']], 4)

## gender prejudice: explicit gendere prejudice
df['gender_preg'] = row_mean(df.loc[:, ['adoptchild', 'marriagerights_3num', 'relationslegal_3num',
                                    'serverights', 'transgender', 'countrycit_num']], 6)


In [24]:
## RECATEGORIZE VARIABLES VALUES AND CLEAN
## give names to categorical values
df = df.astype('object')
df['gn_id'].replace({'[1]':'M', '[2]':'F', '[3]': 'Trans_M',
                         '[4]': 'Trans_F', '[5]': 'queer', '[6]': 'other' },
                        inplace=True)


KeyError: 'gn_id'

In [25]:
for i in df:
    print(i)

weekday
birthyear
num_002
birthsex
genderidentity
sexuality_5
ethnicityomb
raceomb_002
raceombmulti
d_biep.straight_good_all
mn_rt_all_3467
pct_error_3467
side_good_34
side_straight_34
tgayleswomen
tgaymen
tstraightmen
tstraightwomen
att_7
contactfamily_num
contactfriend_num
contactfriendly_num
contactmet_num
adoptchild
marriagerights_3num
relationslegal_3num
serverights
transgender
countrycit_num
edu
edu_14
politicalid_7
occuself
occuselfdetail
religion2014
religionid
gender_feel
gender_preg
