# Summary statistics 

This notebook contains code for the analysis of summary statistics for data used in the project: Public attitudes and ethical guidelines in digital field experiments (digex).

## Set working directory

In [1]:
import os
import pathlib

# Move to the project root so that `digex_src` and the data paths resolve.
# An unconditional `chdir(cwd.parent)` climbs one directory further every time
# this cell is re-run; the guard below makes the cell idempotent.
# NOTE(review): assumes the `digex_src` package directory sits at the project
# root, one level above this notebook's folder -- confirm. If the directory is
# not found, the parent of the current directory is used, exactly as before.
path = pathlib.Path.cwd()
if not (path / "digex_src").is_dir():
    path = path.parent
os.chdir(path)

## Imports

In [2]:
import pathlib   # Standard library

import yaml   # 3rd party packages
import joypy
import statistics
import pandas as pd
import numpy as np
from skimpy import skim
from matplotlib import pyplot as plt

from digex_src import config    # Local imports
from digex_src import preprocess
from digex_src import get_summary_statistics
from digex_src.load_data import get_data_filepath

## Load processed data

In [3]:
# Resolve the processed-data file location from the project configuration
processed_data_path = get_data_filepath(
    main=False,
    data_path=config.PROCESSED_DATA_DIR,
    file=config.PROCESSED_DATA_FILEPATH,
)

# The first CSV column is used as the row index
digex_df = pd.read_csv(processed_data_path, index_col=0)

# Preview the first rows as the cell output
digex_df.head()

Unnamed: 0,finished,duration_sec,sm_use,age,gender_id,ethnic_id,edu,politic_views,aware_sm_res,aware_sm_advan,...,rank_anony,rank_harms,rank_balance,rank_pub_interst,rank_add_fac_1,rank_add_fac_1_pos,rank_add_fac_2,rank_add_fac_2_pos,rank_add_fac_3,rank_add_fac_3_pos
1,True,912.0,Facebook,29.0,Male,Asian - Eastern,Highschool,Slightly liberal,Extremely aware,['… are large and can contain millions of data...,...,6.0,4.0,3.0,1.0,,,,,,
2,True,720.0,Twitter,33.0,Male,Mixed race,Highschool,Neutral/ Neither conservative or liberal,Moderately aware,['… are large and can contain millions of data...,...,6.0,1.0,7.0,4.0,,,,,,
3,True,1874.0,Facebook,33.0,Female,Pacific Islander,Bachelor's degree,Very liberal,Extremely aware,['… are large and can contain millions of data...,...,3.0,2.0,4.0,1.0,,,,,,
4,True,1264.0,Facebook,73.0,Female,White / Caucasian,Highschool,Slightly conservative,Moderately aware,['… are large and can contain millions of data...,...,3.0,4.0,5.0,1.0,,8.0,,,,
5,True,556.0,Twitter,27.0,Female,Native-American,Highschool,Very liberal,Extremely aware,['… often capture social relationships not fou...,...,2.0,4.0,6.0,7.0,,,,,,


## Summary statistics

Variables examined: 0-14 (see variable-table.html)

### Overview

In [4]:
# Overview of the processed data produced by skimpy's `skim`
skim(digex_df)

### Survey experience 

#### Number of complete survey participants

In [5]:
# Number of participants who completed the survey; `completed_p` is reused
# further down to derive the screened-out count.
completed_p = get_summary_statistics.completed_participants(digex_df)
print(completed_p)

499


#### Response rate

In [6]:
# Response rate in its non-percentage form (the recorded output reads
# "<completed> per <total>").
response_r = get_summary_statistics.response_rate(digex_df, as_percentage=False)
print(response_r)

499 per 500


In [7]:
# Same response rate requested as a percentage; the unit sign is appended
# only when printing.
response_perc = get_summary_statistics.response_rate(digex_df, as_percentage=True)
print(response_perc,'%')

99.8 %


#### Number of screened out participants

In [8]:
# Screened-out count = configured participant total minus completed participants.
# Relies on `completed_p` from the "Number of complete survey participants" cell.
print(config.PARTICIPANT_COUNT - completed_p)

1


#### Completion time

In [9]:
# Descriptive statistics of the survey completion time, requested with
# time_unit='min' (the recorded output is a describe()-style summary of
# `duration_sec`).
times_min = get_summary_statistics.completion_time(digex_df, time_unit='min')
print(times_min)

count                          499
mean     0 days 00:16:50.851703406
std      0 days 00:09:43.182528457
min                0 days 00:02:30
25%                0 days 00:10:08
50%                0 days 00:14:48
75%                0 days 00:20:39
max                0 days 01:23:47
Name: duration_sec, dtype: object


### Demographic information

Something like this (in a tidy format) will presumably be Table 1 in the paper:

In [10]:
# Columns that feed the demographic overview table
demographic_cols = ['age', 'gender_id', 'ethnic_id', 'edu', 'politic_views']

# Build the table from just those columns and render it with rich display
demographic_df = get_summary_statistics.demographic_information(
    digex_df[demographic_cols]
)
display(demographic_df)

Unnamed: 0,age,age_vals,gender_id,gender_id_vals,gender_id_perc,ethnic_id,ethnic_id_vals,ethnic_id_perc,edu,edu_vals,edu_perc,politic_views,politic_views_vals,politic_views_perc
0,Average,41.663327,Male,282.0,56.513,White / Caucasian,397,79.559,Bachelor's degree,222.0,44.489,Very liberal,150.0,30.06
1,Standard deviation,13.635932,Female,207.0,41.483,African-American,32,6.413,Highschool,153.0,30.661,Slightly liberal,126.0,25.251
2,Min,18.0,Non-binary / third gender,8.0,1.603,Mixed race,20,4.008,Master's degree or above,87.0,17.435,Slightly conservative,96.0,19.238
3,Max,78.0,Prefer not to say,2.0,0.401,Hispanic,19,3.808,Associate's degree,22.0,4.409,Neutral/ Neither conservative or liberal,89.0,17.836
4,,,,,,Asian - Eastern,16,3.206,Some college,7.0,1.403,Very conservative,35.0,7.014
5,,,,,,Asian - Indian,7,1.403,Prefer not to say,4.0,0.802,Prefer not to say,3.0,0.601
6,,,,,,Native-American,3,0.601,Vocational training,4.0,0.802,,,
7,,,,,,Pacific Islander,1,0.2,,,,,,
8,,,,,,Prefer not to say,1,0.2,,,,,,
9,,,,,,Asian - Southeast,1,0.2,,,,,,


#### Age

In [11]:
# Descriptive statistics for participant age, printed one labelled line each
age_values = digex_df['age']
age_stats = {
    "Mean age:": age_values.mean(),
    "SD age:": age_values.std(),
    "Minimum age:": age_values.min(),
    "Maximum age:": age_values.max(),
}
for label, stat in age_stats.items():
    print(label, stat)

Mean age: 41.66332665330661
SD age: 13.63593166177689
Minimum age: 18.0
Maximum age: 78.0


#### Gender

In [18]:
# Percentage share of each gender identity; missing answers are kept as their
# own category rather than dropped.
(
    digex_df['gender_id']
    .value_counts(dropna=False, normalize=True)
    .mul(100)
)

Male                         56.513026
Female                       41.482966
Non-binary / third gender     1.603206
Prefer not to say             0.400802
Name: gender_id, dtype: Float64

#### Ethnicity

In [22]:
# Percentage share of each ethnic identity; missing answers are kept as their
# own category rather than dropped.
(
    digex_df['ethnic_id']
    .value_counts(dropna=False, normalize=True)
    .mul(100)
)

White / Caucasian    79.559118
African-American      6.412826
Mixed race            4.008016
Hispanic              3.807615
Asian - Eastern       3.206413
Asian - Indian        1.402806
Native-American       0.601202
Pacific Islander      0.200401
Prefer not to say     0.200401
Asian - Southeast     0.200401
Carribean             0.200401
Other                 0.200401
Name: ethnic_id, dtype: Float64

#### Social media use

*Note: due to an error in the Qualtrics survey, participants could pick only one option instead of several. Since this question is just a screener, we can simply note that all participants reported being a regular user of at least one of Facebook, Twitter, or Reddit.*

In [14]:
# Number of participants per reported social media platform (missing values counted too)
digex_df['sm_use'].value_counts(dropna=False)

Facebook    258
Reddit      133
Twitter     108
Name: sm_use, dtype: Int64

#### Political views

In [15]:
# Number of participants per political-view answer (missing values counted too)
digex_df['politic_views'].value_counts(dropna=False)

Very liberal                                150
Slightly liberal                            126
Slightly conservative                        96
Neutral/ Neither conservative or liberal     89
Very conservative                            35
Prefer not to say                             3
Name: politic_views, dtype: Int64

In [16]:
# Proportion of all respondents (every row counts in the denominator) who
# chose one of the two liberal options. `isin(...).mean()` replaces two
# separate `value_counts()` lookups, which raise a KeyError whenever one of
# the options has no respondents; rows with a missing answer count as
# "not liberal", as before.
digex_df['politic_views'].isin(['Very liberal', 'Slightly liberal']).mean()

0.5531062124248497

In [17]:
# Proportion of all respondents (every row counts in the denominator) who
# chose one of the two conservative options. `isin(...).mean()` replaces two
# separate `value_counts()` lookups, which raise a KeyError whenever one of
# the options has no respondents; rows with a missing answer count as
# "not conservative", as before.
digex_df['politic_views'].isin(['Very conservative', 'Slightly conservative']).mean()

0.2625250501002004