# Data Processing

This file is part of the reproduction package provided with the paper "Understanding Dark Personality Traits and Strategic Choices in an Inspection Game".

This notebook contains the processing steps for the raw experiment data. 

Questions and correspondence should be addressed to Vinícius Ferraz (visferraz@gmail.com).

In [1]:
import pandas as pd
import numpy as np

In [6]:
# Reading raw datasets (otree outputs)
df_full = pd.read_csv('data/repeated_game_raw.csv')
# NOTE(review): 'repeagted' looks like a typo for 'repeated' — confirm against
# the actual file name in data/ before changing the path.
df_aggregated = pd.read_csv('data/repeagted_game_agg.csv')
df_os = pd.read_csv('data/oneshot_game_raw.csv')

In [7]:
# Frequency of strategy 1: cumulative s1 count divided by the round number
df_full['s1_freq'] = df_full['s1_cumulative'].div(df_full['round_number'])

In [8]:
# Computing previous-round variables for the opponent: the binary decision
# (opponent_s1_prev) and the strategy-1 frequency (opponent_s1_freq_prev)

sorted_data = df_full.sort_values(['game_id', 'round_number'])

# Start as missing; rows without a matching previous-round opponent stay NaN
# until the defaults are applied after the loop.
sorted_data['opponent_s1_prev'] = np.nan
sorted_data['opponent_s1_freq_prev'] = np.nan

for index, row in sorted_data.iterrows():
    game_id = row['game_id']
    round_number = row['round_number']
    player_id = row['player_id']

    # The first round has no previous round to look up
    if round_number == 1:
        continue

    # Rows of the same game, one round earlier, belonging to another player
    prev_round_row = sorted_data[(sorted_data['game_id'] == game_id) &
                                 (sorted_data['round_number'] == round_number - 1) &
                                 (sorted_data['player_id'] != player_id)]

    if len(prev_round_row) > 0:
        # For player 2 the opponent's decision is read from 'p1s1', otherwise from 'p2s1'
        opponent_previous_decision = prev_round_row.iloc[0]['p1s1' if player_id == 2 else 'p2s1']
        opponent_previous_s1_freq = prev_round_row.iloc[0]['s1_freq']
        sorted_data.at[index, 'opponent_s1_prev'] = opponent_previous_decision
        sorted_data.at[index, 'opponent_s1_freq_prev'] = opponent_previous_s1_freq

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated and does not
# modify the frame when pandas Copy-on-Write is active.
sorted_data['opponent_s1_prev'] = sorted_data['opponent_s1_prev'].fillna(-1)
sorted_data['opponent_s1_freq_prev'] = sorted_data['opponent_s1_freq_prev'].fillna(0)
df_full_s = sorted_data

In [9]:
# Organizing columns and filling missing values
df_full_s.sort_values(['participant_code', 'round_number'], inplace=True)
# Own s1 frequency lagged by one row within each participant. The first row of
# every participant has no predecessor and is set to 0. The result is assigned
# directly rather than via fillna(inplace=True) on a column selection, which
# does not update the frame when pandas Copy-on-Write is active.
df_full_s['s1_freq_previous_round'] = (
    df_full_s.groupby('participant_code')['s1_freq'].shift().fillna(0)
)

In [10]:
# Indicator column: 1 where nationality equals 'Germany', 0 otherwise
df_full_s['german_nat'] = df_full_s['nationality'].eq('Germany').astype(int)

In [11]:
# Mapping from the binary treatment code to its label (0 -> unframed, 1 -> framed)
treat_dict = dict(enumerate(('unframed', 'framed')))

In [12]:
# Add the binary strategy-1 flag and the treatment label, then expand the
# label into treat_* dummy columns (all levels kept).
df_os = pd.get_dummies(
    df_os.assign(
        s1=df_os['strategy'].eq(1).astype(int),
        treatment_string=df_os["treatment"].map(treat_dict),
    ),
    columns=['treatment_string'],
    prefix=['treat'],
    drop_first=False,
)

In [13]:
# One-hot encoding of categorical variables (creating dummies)
df_full_d = pd.get_dummies(
    df_full_s,
    columns=['gender_str'],
    prefix=['gender'],
    drop_first=False,
)
# Attach the treatment label and immediately expand it into treat_* dummies
df_full_d = pd.get_dummies(
    df_full_d.assign(treatment=df_full_d["framed"].map(treat_dict)),
    columns=['treatment'],
    prefix=['treat'],
    drop_first=False,
)
# get_dummies consumed the label column, so it is recreated here
df_full_d["treatment"] = df_full_d["framed"].map(treat_dict)
# The aggregated data is encoded on the raw 'framed' values, so its dummy
# columns are named from those values with the 'treat' prefix
df_aggregated = pd.get_dummies(
    df_aggregated,
    columns=['framed'],
    prefix=['treat'],
    drop_first=False,
)

In [14]:
# Expand gender_str of the one-shot data into gender_* dummy columns
df_os = pd.get_dummies(
    df_os,
    columns=['gender_str'],
    prefix=['gender'],
    drop_first=False,
)

In [15]:
# Retaliation calculation
df_full_d = df_full_d.sort_values(['game_id', 'round_number'])

# Values taken from the immediately preceding row of the sorted frame.
# NOTE(review): shift() looks at the previous row, not necessarily the previous
# round; if the frame holds more than one row per game and round, the preceding
# row may belong to the same round — confirm against the data layout.
prev_p1s1 = df_full_d['p1s1'].shift()
prev_p2s1 = df_full_d['p2s1'].shift()
same_game_as_prev = df_full_d['game_id'] == df_full_d['game_id'].shift()

# A reaction is possible when, in the preceding row of the same game, the
# player did not choose s1 while the other player did.
p1_can_react = (prev_p1s1 == 0) & (prev_p2s1 == 1) & same_game_as_prev
p2_can_react = (prev_p2s1 == 0) & (prev_p1s1 == 1) & same_game_as_prev

df_full_d['react_possible_p1'] = p1_can_react.astype(int)
df_full_d['react_possible_p2'] = p2_can_react.astype(int)

# The change flag additionally requires the player to choose s1 in the current row
df_full_d['p1s1_change'] = ((df_full_d['p1s1'] == 1) & p1_can_react).astype(int)
df_full_d['p2s1_change'] = ((df_full_d['p2s1'] == 1) & p2_can_react).astype(int)

In [16]:
# Dishonesty scores calculation
def calculate_dishonesty_score(df):
    """Add a 'dishonesty_score' column to ``df`` (in place) and return ``df``.

    Per-question proportions are estimated from the columns q1..q5:
    rows with ``coin == 0`` contribute their plain mean, rows with
    ``coin == 1`` contribute ``max(2 * (mean - 0.25), 0)``. The two estimates
    are combined weighted by group size and divided by ``len(df)``.
    A row's score is the mean of the combined proportions over the questions
    that row answered with 1; rows with no such answer get 0.

    A coin group without rows contributes nothing to the combined estimate
    (previously its NaN mean turned every score into 0), and an empty frame
    simply receives an empty 'dishonesty_score' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'coin', 'q1', 'q2', 'q3', 'q4' and 'q5'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'dishonesty_score' added.
    """
    questions = ['q1', 'q2', 'q3', 'q4', 'q5']

    # Nothing to estimate; avoid dividing by zero and applying over no rows
    if len(df) == 0:
        df['dishonesty_score'] = np.zeros(0)
        return df

    group1 = df[df['coin'] == 0]
    group2 = df[df['coin'] == 1]

    # Size-weighted sum of the per-group estimates; an empty group is skipped
    # so that its NaN mean cannot propagate into the result.
    weighted_sum = pd.Series(0.0, index=questions)
    if len(group1) > 0:
        true_proportions_group1 = group1[questions].mean()
        weighted_sum = weighted_sum + true_proportions_group1 * len(group1)
    if len(group2) > 0:
        observed_proportions_group2 = group2[questions].mean()
        true_proportions_group2 = np.maximum(2 * (observed_proportions_group2 - 0.25), 0)
        weighted_sum = weighted_sum + true_proportions_group2 * len(group2)
    true_proportions = weighted_sum / len(df)

    # Mean proportion over the questions answered with 1; NaN when there are none
    df['dishonesty_score'] = df[questions].apply(lambda x: np.mean(true_proportions[x == 1]), axis=1)
    df['dishonesty_score'] = df['dishonesty_score'].fillna(0)
    return df

# Add the 'dishonesty_score' column to the one-shot dataset
df_os = calculate_dishonesty_score(df_os)

In [17]:
# Encoding of STEM degrees into binary variable 
def classify_degrees(df, column):
    """Add binary 'STEM' and 'non_STEM' columns to ``df`` and return it.

    'STEM' is 1 where the value in ``column`` is one of the listed degree
    codes and 0 otherwise; 'non_STEM' is its complement. ``df`` is modified
    in place.
    """
    # Degree codes that count as STEM
    stem_degrees = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13,
        16, 18, 26, 27, 28, 29, 30, 34, 35, 37, 38,
    ]
    is_stem = df[column].isin(stem_degrees).astype(int)
    df['STEM'] = is_stem
    df['non_STEM'] = 1 - is_stem
    return df

# Add the STEM / non_STEM indicators to both datasets, based on their 'degree' column
df_full_d = classify_degrees(df_full_d, 'degree')
df_os = classify_degrees(df_os, 'degree')

In [18]:
# Filling empty AGE answers with the median of the sample
def age_fill(df, column, missing_code=7):
    """Replace the missing-answer code in ``column`` with the column median.

    The median is computed over the rows whose value differs from
    ``missing_code``; every occurrence of ``missing_code`` is then replaced by
    that median. ``df`` is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean.
    column : str
        Name of the column to clean.
    missing_code : scalar, default 7
        Value that marks an empty answer. The default keeps the original
        behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    answered = df.loc[df[column] != missing_code, column]
    median = answered.median()

    df[column] = df[column].replace(missing_code, median)

    return df
# Apply the age clean-up to both datasets, each using its own median
df_full_d = age_fill(df_full_d, 'age')
df_os = age_fill(df_os, 'age')

In [19]:
# Ensuring proper format of numerical values: cast the framing and female
# dummy columns to integers in both datasets
df_full_d = df_full_d.astype({'treat_framed': int, 'gender_Female': int})

df_os = df_os.astype({'treat_framed': int, 'gender_Female': int})

In [20]:
# Renaming columns of interest
# Labels used for the one-shot dataset
os_dict = {
    'DarkFactor': 'D-factor',
    'age': 'Age',
    'treat_framed': 'Framing',
    'gender_Female': 'Female',
    'dishonesty_score': 'Dishonesty Score',
}
# The repeated-game dataset uses the same labels plus the lagged variables
repeated_dict = {
    **os_dict,
    'opponent_s1_prev': 'Opponent S1 (t-1)',
    's1_freq_previous_round': 'Own S1 Freq. (t-1)',
}

df_full_d = df_full_d.rename(columns=repeated_dict)
df_os = df_os.rename(columns=os_dict)

In [22]:
# Export processed datasets
# to_csv is called without index=False, so each frame's index is written as
# the first column of the output file.
df_full_d.to_csv('data/df_rep_p.csv')
df_os.to_csv('data/df_os_p.csv')