In [95]:
import wmfdata as wmf
import pandas as pd
import numpy as np
from wmfdata import spark,hive
from datetime import datetime

In [96]:
# Load data
wikis=pd.read_csv('data/wikis')
file = "https://analytics.wikimedia.org/published/datasets/knowledge_gaps/content_gaps/csv/gender.csv"
gendata = pd.read_csv(file)
# only wikipedia projects
gendata=gendata[gendata.wiki_db.isin(list(wikis['database_code']))]


In [97]:
# Since each snapshot contains patial data of the month it is published it in: we filter the data from previous month if the data was pulled too early. *Needs to be revisited

# Convert 'time_bucket' to datetime for comparison
#gendata['time_bucket'] = pd.to_datetime(gendata['time_bucket'])


# If today's day is before the 23rd of the month, filter out the previous month's data
#if datetime.now().day < 23:
#    prev_month = datetime.now().month - 1 if datetime.now().month > 1 else 12
#    gendata = gendata[gendata['time_bucket'].dt.month != prev_month]

In [98]:
# load original gen_data

old_gen = pd.read_csv('data/updated_gender.csv') 
old_gen['time'] = pd.to_datetime(old_gen['time'])


In [99]:
# Define a function to map categories to the new categories
def map_gender_category(category):
    if category in ['male', 'cisgender male']:
        return 'male'
    elif category in ['female', 'cisgender female']:
        return 'female'
    else:
        return 'gender_diverse'

# Create a new column gender3category
gendata['gender3category'] = gendata['category'].apply(map_gender_category)


In [100]:
#overtime quality articles
gen3quality=gendata.groupby(['gender3category','time_bucket'])['standard_quality_count_value'].sum().reset_index()

# Pivot the dataframe
gen3quality = gen3quality.pivot(index='time_bucket', columns='gender3category', values='standard_quality_count_value').reset_index()
gen3quality = gen3quality[gen3quality['time_bucket'] != '2023-09']


In [101]:
#aggregate
gen3quality['time'] = pd.to_datetime(gen3quality['time_bucket'])

# Identify the latest 'time' in old_geo_data
latest_time = old_gen['time'].max()

# Filter rows from total quality articles where 'time' is greater than latest_time from old_data to identify rows to be added
filtered_rows = gen3quality[gen3quality['time'] > latest_time]



# Filter columns of last_row based on columns in geo_data
old_gen = pd.concat([old_gen, filtered_rows], axis=0, ignore_index=True, sort=False)




In [102]:
# check if new data was appended to the original csv file.

'''
The code below computes the monthly metric and the quarterly metric in the following way:

1) Quarterly metric: Contained in the column called 'proportion_of_non_male'. It is computed as the yoy-difference from 2022 and 2023 (difference calculated over 12 periods) 
and then smoothed out by taking the 3m rolling average.

2) monthly metric: Contained in the column called 'monthly_metric'. It is computed as the yoy-difference for August 2023 and August 2022 (diff by 12 periods).
'''

if filtered_rows.empty:
    print("No data has been added to the original spreadsheet. Will not overwrite updated_gender_data.csv")
    
else:
    
    rolling_yoy_avg = old_gen.copy()
    rolling_yoy_avg = rolling_yoy_avg.drop(columns=['time_bucket']) # use old_gen as output since no new rows were added.
    # Compute yoy difference
    
    # List of underrepresented gender categories
    underrepresented_genders = [
        "female_YoY_difference", 
        "gender_diverse_YoY_difference"
    ]
    
    # Define the columns to adjust to get quarterly average
    gender_columns_to_adjust = [
        'female_YoY_difference',
        'gender_diverse_YoY_difference',
        'men_YoY_difference'
    ]
    

    # Calculate the 12-month YoY difference for each gender category
    rolling_yoy_avg['female_YoY_difference'] = rolling_yoy_avg['female'].diff(12).round(0)
    rolling_yoy_avg['gender_diverse_YoY_difference'] = rolling_yoy_avg['gender_diverse'].diff(12).round(0)
    rolling_yoy_avg['men_YoY_difference'] = rolling_yoy_avg['male'].diff(12).round(0)
    
    monthly = rolling_yoy_avg.copy()

    yoy_gender_columns = [col for col in monthly.columns if 'YoY_difference' in col and '3m' not in col]

    monthly['total'] = monthly[yoy_gender_columns].sum(axis=1)

    monthly['monthly_metric'] = (monthly[underrepresented_genders].sum(axis=1) / monthly['total']) * 100

    rolling_yoy_avg['female_YoY_difference'] = rolling_yoy_avg['female'].diff(12).rolling(window=3).mean().round(0)
    rolling_yoy_avg['gender_diverse_YoY_difference'] = rolling_yoy_avg['gender_diverse'].diff(12).rolling(window=3).mean().round(0)
    rolling_yoy_avg['men_YoY_difference'] = rolling_yoy_avg['male'].diff(12).rolling(window=3).mean().round(0)
    
    # Compute the sum of underrepresented gender categories
    rolling_yoy_avg['underrepresented_gender_sum'] = rolling_yoy_avg[underrepresented_genders].sum(axis=1)

    rolling_yoy_avg['total'] = rolling_yoy_avg[gender_columns_to_adjust].sum(axis=1)

    # Calculate the proportion of underrepresented gender categories to total
    rolling_yoy_avg['proportion_of_non_male'] = (rolling_yoy_avg['underrepresented_gender_sum'] / rolling_yoy_avg['total']) *100

    # drop underrepresented gender sum
    rolling_yoy_avg.drop('underrepresented_gender_sum', axis=1, inplace=True)
    
    rolling_yoy_avg['monthly_metric'] = monthly['monthly_metric']
    
    rolling_yoy_avg = rolling_yoy_avg.drop(columns=['metric_gender_diverse+women'])

    # output
    print("updating csv file")
    rolling_yoy_avg.to_csv('data/updated_gender_data.csv', index=False)


updating csv file


In [103]:
rolling_yoy_avg.tail(3) # View data

Unnamed: 0,time,women YoY difference (3m),gender_diverse_YoY_difference (3m),men YoY difference (3m),female,male,gender_diverse,female_YoY_difference,men_YoY_difference,gender_diverse_YoY_difference,total,proportion_of_non_male,monthly_metric
269,2023-06-01,42467.0,609.0,116082.0,348896,1232367,3929,42467.0,116082.0,609.0,159158.0,27.064929,27.038638
270,2023-07-01,41676.0,599.0,114746.0,352266,1242333,3982,41676.0,114746.0,599.0,157021.0,26.92315,26.654376
271,2023-08-01,,,,355475,1252807,4052,40519.0,113336.0,598.0,154453.0,26.621043,26.155009


# Naive Forecast

In [104]:
'''
The forecast is calculated by getting the difference between the change in the average metric from Q2 2022 and Q3 2022 and then adding that change onto the calculated quarterly metric
for Q2 2023 (the average of July and August).


'''

# Quarterly forecast

# Define the months for q1 and q2
q1_dates = ['2022-07-01', '2022-08-01', '2022-09-01']
q2_dates = ['2022-10-01', '2022-11-01', '2022-12-01']

# Filter the data for q1 and q2 separately
q1_data = rolling_yoy_avg[rolling_yoy_avg['time'].isin(q1_dates)].copy()
q2_data = rolling_yoy_avg[rolling_yoy_avg['time'].isin(q2_dates)].copy()

# Calculate combined YoY difference for female and gender diverse for q1 and q2
q1_data['combined_YoY'] = q1_data['female_YoY_difference'] + q1_data['gender_diverse_YoY_difference']
q2_data['combined_YoY'] = q2_data['female_YoY_difference'] + q2_data['gender_diverse_YoY_difference']

# Compute the average of the combined values and the total for each quarter
avg_combined_q1 = q1_data['combined_YoY'].mean()
avg_combined_q2 = q2_data['combined_YoY'].mean()

avg_total_q1 = q1_data['total'].mean()
avg_total_q2 = q2_data['total'].mean()

# Calculate the proportion for each quarter
prop_q1 = avg_combined_q1 / avg_total_q1
prop_q2 = avg_combined_q2 / avg_total_q2

# Compute the delta between q2 and q1
delta1 = prop_q2 - prop_q1

# Get the last available value of quarterly_metric
last_quarterly_metric = rolling_yoy_avg[rolling_yoy_avg['time'] == '2023-08-01']['proportion_of_non_male'].values[0]

# Forecast for q2 2023
forecast_q2 = last_quarterly_metric + delta1


# Monthly forecast

# Extract the data for August and September 2022
aug_2022_data = rolling_yoy_avg[rolling_yoy_avg['time'] == '2022-08-01'].copy()
sep_2022_data = rolling_yoy_avg[rolling_yoy_avg['time'] == '2022-09-01'].copy()

# Calculate combined YoY difference for female and gender diverse for both months
aug_2022_data['combined_YoY'] = aug_2022_data['female_YoY_difference'] + aug_2022_data['gender_diverse_YoY_difference']
sep_2022_data['combined_YoY'] = sep_2022_data['female_YoY_difference'] + sep_2022_data['gender_diverse_YoY_difference']

# Compute the proportion for both months
prop_aug_2022 = aug_2022_data['combined_YoY'].values[0] / aug_2022_data['total'].values[0]
prop_sep_2022 = sep_2022_data['combined_YoY'].values[0] / sep_2022_data['total'].values[0]

# Compute the delta between September and August 2022
delta2 = prop_sep_2022 - prop_aug_2022

# Get the last available value of quarterly_metric for August 2023
last_monthly_metric = rolling_yoy_avg[rolling_yoy_avg['time'] == '2023-08-01']['monthly_metric'].values[0]

# Forecast for September 2023
forecast_sep_2023 = last_monthly_metric + delta2

print(f"Delta between q1 and q2 2022: {delta1*100:.2f} percentage points")
print(f"Last available quarterly metric (August 2023): {last_quarterly_metric:.2f}%")
print(f"Forecast for Q2: {forecast_q2:.2f}%")



print(f" \nDelta between August and September 2022: {delta2*100:.2f} percentage points")
print(f"Last available monthly metric (August 2023): {last_monthly_metric:.2f}%")
print(f"Forecast for September 2023: {forecast_sep_2023:.2f}%")


Delta between q1 and q2 2022: 0.34 percentage points
Last available quarterly metric (August 2023): 26.62%
Forecast for Q2: 26.62%
 
Delta between August and September 2022: 0.27 percentage points
Last available monthly metric (August 2023): 26.16%
Forecast for September 2023: 26.16%


# Generate monthly chart

In [None]:
%run gender_gap_plot.ipynb