In [1]:
import pandas as pd
import numpy as np

# Calculating sampling target sizes

**Step 1:** With the population counts aggregated in the previous step, we need to calculate the sample size needed to achieve adequate statistical power, using experience of harassment as the critical variable. We will use [Cochran's formula](https://www.tarleton.edu/academicassessment/documents/samplesize.pdf) to calculate appropriate sample sizes:

\begin{equation}
n_0=\frac{Z^2pq}{e^2}
\end{equation}
<div style="padding-left: 40%; font-size:12px">
where:<br>
    p = 0.215 (incidence of critical variable)<br>
    q = 1 - p<br>
    e = 0.05 (margin of error)<br>
    Z = 1.96 (z-score for a 95% confidence level)
</div><br>

**Step 2:** n<sub>0</sub> then needs to be adjusted for each project to account for the size of the project's population. This adjustment, referred to as the Finite Population Correction for Proportions, can be achieved using the following equation:

\begin{equation}
n = \frac{n_0}{1 + \frac{(n_0 - 1)}{N}}
\end{equation}
<div style="padding-left: 40%; font-size:12px">
where: N = total population size of the project group (summed across all of its edit-count strata)
</div><br>

**Step 3:** The sample size calculated for each project needs to be divided across the various editing activity levels, in proportion to each level's share of the project's population. For example, if the desired sample size for arwiki is 117 and 10% of its editors belong to the 10-29 stratum, the sample size for the 10-29 stratum on arwiki would be 12.

**Step 4:** Based on the previous year's response rates, the sample size needs to be adjusted to account for nonresponse (the adjusted size is capped at the stratum-level population). For example, if we need responses from 49 of the most-active (1200+) editors on en.wikipedia and the 2020 response rate for that stratum was 29%, we'd expect to need to contact roughly 170 such contributors to achieve our target.

### Step 1: Calculating n<sub>0</sub>

In [2]:
# Inputs to Cochran's formula: n0 = Z^2 * p * (1 - p) / e^2
p = 0.215  # incidence of the critical variable (share of editors who experienced harassment)
e = 0.05  # margin of error
Z = 1.96  # z-score for a 95% confidence level
Z_square = Z**2  # Z^2, the term that appears in the formula (1.96 itself is Z, not Z^2)

# NOTE: the variable is spelled `nO` (capital letter O, not the digit zero);
# the finite-population-correction cell further down refers to it by this name.
nO = (Z_square * p * (1-p))/e**2  # Cochran's formula
print(nO)

259.3464159999999


### Step 2: Adjusting the sample to project's population

In [3]:
# Population counts per stratum: one row per project group, one column per edit-count bin
strataN = pd.read_csv('definitions/strata-populations.tsv', sep='\t')

# every column after the first one (project_group) is a bin count;
# the row-wise sum is therefore the project group's total population
strataN['total'] = strataN.iloc[:, 1:].sum(axis=1)
strataN.head()

Unnamed: 0,project_group,10-29,30-149,150-599,600-1199,1200+,total
0,arwiki,28,60,56,34,98,276
1,asia_wps,12,70,82,49,132,345
2,cee_wps,119,528,614,299,946,2506
3,commons,129,489,536,311,1503,2968
4,dewiki,81,522,689,340,834,2466


In [4]:
# Finite population correction, n = n0 / (1 + (n0 - 1) / N), applied to each
# project group's total population N.
#
# The frame is built column-wise: stacking the two Series as rows and transposing
# would mix strings and numbers in every column and leave `total` as object dtype
# instead of float64. The arithmetic is vectorised over the whole `total` column
# rather than evaluated row by row through a lambda.
project_target_sizes = strataN[['project_group']].assign(
    total=nO / (1 + (nO - 1) / strataN['total'])
)
project_target_sizes.head()

Unnamed: 0,project_group,total
0,arwiki,133.957314
1,asia_wps,148.297083
2,cee_wps,235.108782
3,commons,238.579515
4,dewiki,234.752915


### Step 3: Dividing the sample size across various edit bins

In [5]:
# Percentage of each project group's population that falls into each edit-count bin.
# The bin columns are everything between the first column (project_group)
# and the last one (total).
bins_labels = strataN.columns[1:-1]

bin_counts = strataN[bins_labels]
bin_shares = bin_counts.div(bin_counts.sum(axis=1), axis=0).multiply(100)

strataN_perc = strataN.copy()
strataN_perc[bins_labels] = bin_shares
strataN_perc = strataN_perc.drop(columns='total')
strataN_perc.head()

Unnamed: 0,project_group,10-29,30-149,150-599,600-1199,1200+
0,arwiki,10.144928,21.73913,20.289855,12.318841,35.507246
1,asia_wps,3.478261,20.289855,23.768116,14.202899,38.26087
2,cee_wps,4.748603,21.069433,24.501197,11.931365,37.749401
3,commons,4.346361,16.475741,18.059299,10.478437,50.640162
4,dewiki,3.284672,21.167883,27.939984,13.78751,33.819951


In [6]:
# Target number of responses per stratum (project group x edit bin):
# each project group's target size is split across the bins according to
# the bins' share of that group's population.
bin_fractions = strataN_perc[bins_labels].multiply(0.01)  # percentages back to fractions

strata_target_sizes = strataN_perc.copy()
strata_target_sizes[bins_labels] = (
    bin_fractions
    .mul(project_target_sizes.total.values, axis=0)  # row i is scaled by project i's target
    .astype(float)
)
strata_target_sizes.head()

Unnamed: 0,project_group,10-29,30-149,150-599,600-1199,1200+
0,arwiki,13.589872,29.121155,27.179745,16.501988,47.564554
1,asia_wps,5.158159,30.089263,35.247423,21.062484,56.739753
2,cee_wps,11.164383,49.536088,57.604466,28.051686,88.752158
3,commons,10.369527,39.307744,43.085788,24.999403,120.817052
4,dewiki,7.710862,49.692223,65.589926,32.366582,79.393322


### Step 4: Adjust sampling target sizes to account for non-response

In [7]:
# response rates for each strata; provided GDI based on the previous cycle of the survey
strata_resp_rates = pd.read_csv('secrets/strata-response-rates.tsv', sep='\t').drop('total', axis=1)
strata_resp_rates.head()

Unnamed: 0,project_group,10-29,30-149,150-599,600-1199,1200+
0,arwiki,0.083333,0.038462,0.034483,0.102564,0.042373
1,asia_wps,0.0,0.0,0.080645,0.085714,0.094595
2,cee_wps,0.018519,0.026616,0.046745,0.047337,0.064732
3,commons,0.030488,0.07155,0.075812,0.097222,0.092792
4,dewiki,0.111111,0.097938,0.103841,0.095679,0.12375


In [8]:
# Fraction used when a stratum's previous response rate is 0, where dividing the
# target by the rate is impossible.
# NOTE(review): the origin of 0.057 is not documented in this notebook, and it is
# applied to the project group's *total* population rather than to the stratum's
# own population. Behaviour is kept as-is; confirm both are intended.
ZERO_RESPONSE_FRACTION = 0.057

# The frames below are combined row by row through their index, so every row must
# describe the same project group in each of them. The populations and the response
# rates come from two different files, so check this instead of assuming it.
assert strataN['project_group'].equals(strata_resp_rates['project_group']), \
    "strata populations and response rates do not list the same project groups in the same order"

resp_rates = strata_resp_rates[bins_labels]
populations = strataN[bins_labels]
has_responses = resp_rates.ne(0)

# Number of editors to contact = target responses / response rate.
# Zero rates are masked to NaN before dividing (no division by zero) and those cells
# are then filled with the per-project fallback, broadcast along the rows.
contacts_needed = (strata_target_sizes[bins_labels]
                   .div(resp_rates.where(has_responses))
                   .where(has_responses, strataN['total'] * ZERO_RESPONSE_FRACTION, axis=0))

# We cannot contact more editors than a stratum contains: cap at the stratum population.
contacts_capped = contacts_needed.mask(contacts_needed > populations, populations)

# Same layout as the response-rate table (project_group + one column per bin),
# with the rates replaced by whole numbers of editors to contact (rounded up).
strata_sample_pull = strata_resp_rates.copy()
strata_sample_pull[bins_labels] = contacts_capped.apply(np.ceil).astype(int)
strata_sample_pull

Unnamed: 0,project_group,10-29,30-149,150-599,600-1199,1200+
0,arwiki,28,60,56,34,98
1,asia_wps,12,20,82,49,132
2,cee_wps,119,528,614,299,946
3,commons,129,489,536,258,1303
4,dewiki,70,508,632,339,642
5,enwiki,590,1380,854,364,635
6,eswiki,68,328,370,151,358
7,frwiki,103,569,589,223,641
8,itwiki,44,241,252,104,328
9,jawiki,74,534,437,141,227


In [9]:
# Write the per-stratum contact targets out as a tab-separated file (row index omitted)
strata_sample_pull.to_csv(
    'definitions/strata-sample-targets.tsv',
    sep='\t',
    index=False,
)