In [1]:
import pandas as pd

## Monthly count of unique raters

In [2]:
# prepare for load and keeping aggregated data
chunk_size = 1000000
# this will hold the cumulative grouped result
monthly_count = pd.DataFrame(columns=['year', 'month', 'unique_id_count'])


In [9]:
# load previously saved monthly count
with open('monthly_raters.csv', 'r') as f:
    monthly_count = pd.read_csv(f, index_col=0)

In [9]:
# load rating file in chunks
# there are 20 rating files, some ~2.5 GB. Since I can only keep them one at a time I
# change the file name one by one
rating_file = 'ratings/ratings-00001.tsv'
with pd.read_csv(rating_file, sep='\t', chunksize=chunk_size) as reader:
    for chunk in reader:
        
        # add needed columns to this chunk
        chunk['createdAt'] = pd.to_datetime(chunk['createdAtMillis'], unit='ms')
        chunk['year'] = chunk['createdAt'].dt.year
        chunk['month'] = chunk['createdAt'].dt.month
        grouped = chunk.groupby(['year', 'month'])['raterParticipantId'].nunique().reset_index(name='unique_id_count')
        # merge with the running monthly_count
        monthly_count = pd.merge(
            monthly_count, grouped,
            on=['year', 'month'], how='outer', suffixes=('', '_new')
        )

        # sum counts (fillna for missing)
        monthly_count['unique_id_count'] = (
            monthly_count['unique_id_count'].fillna(0)
            + monthly_count['unique_id_count_new'].fillna(0)
        )

        # drop the temporary column
        monthly_count = monthly_count.drop(columns=['unique_id_count_new'])


In [10]:
monthly_count

Unnamed: 0,year,month,unique_id_count
0,2021,1,52.0
1,2021,2,125.0
2,2021,3,128.0
3,2021,4,110.0
4,2021,5,115.0
5,2021,6,356.0
6,2021,7,342.0
7,2021,8,457.0
8,2021,9,711.0
9,2021,10,741.0


In [11]:
# save monthly count
monthly_count.to_csv('monthly_raters.csv')

## Total number of users

In [13]:
with open('data2025raw/userEnrollment-00000.tsv', 'r') as f:
    user_enrollment = pd.read_csv(f, sep='\t')

In [16]:
user_enrollment.tail()

Unnamed: 0,participantId,enrollmentState,successfulRatingNeededToEarnIn,timestampOfLastStateChange,timestampOfLastEarnOut,modelingPopulation,modelingGroup,numberOfTimesEarnedOut
1243150,5E16B1CEF8799859938FFB128862E25E4830477D95B4C4...,newUser,5,1755135794639,1,CORE,25.0,0
1243151,294025B1CDDABBA43CCDEBE5EC956FF224E759E2BEB63B...,newUser,5,1755395100418,1,CORE,25.0,0
1243152,2AA9E0A121D13EC5753A7D1590F37FBCE57E0C3BF78D06...,newUser,5,1755654448283,1,CORE,25.0,0
1243153,6A210F6F33A67ECDD0D1B66320A1EA8AFCC6F5547C9CA7...,newUser,5,1755740555369,1,CORE,9.0,0
1243154,21B97B1025ED60D898235C0A1FF7D06D385CABCC5E42C8...,newUser,5,1755740549425,1,CORE,19.0,0


In [None]:
# all users are in this dataset
# but it's as of 24th of august
f"Total number of users: {len(user_enrollment)}"

'Total number of users: 1243155'

In [18]:
# add a datetime column
user_enrollment['state_change_time'] = \
    pd.to_datetime(user_enrollment['timestampOfLastStateChange'], unit='ms')

In [21]:
# check which part of users who have changed status just signed-up after dec-2024
user_2025 = user_enrollment[user_enrollment['state_change_time'] >= pd.Timestamp(2025, 1, 1)]
user_2025['numberOfTimesEarnedOut'].value_counts()

numberOfTimesEarnedOut
0     282427
1      23709
2       5010
3       1060
4        272
5        104
6         22
7         15
8         11
12         2
9          1
13         1
Name: count, dtype: int64

In [22]:
# check how many users are we talking about who have changed status in 2025
len(user_2025)

312634

Compared to the total user base which is ~1M, 3000 is an acceptable error. As a result, I just count total number of users who changed status before end of 2024 for the paper. Plus, we are reporting this number as XX thousands user!

In [25]:
f"total number of users as of December 2024: \
{len(user_enrollment[user_enrollment['state_change_time'] < pd.Timestamp(2025,1,1)])}"

'total number of users as of December 2024: 930521'