In [103]:
import wmfdata as wmf
import pandas as pd
import numpy as np
from wmfdata import spark,hive
from datetime import datetime

In [105]:
# Load data
wikis=pd.read_csv('data/wikis')
file = "https://analytics.wikimedia.org/published/datasets/knowledge_gaps/content_gaps/csv/gender.csv"
gendata = pd.read_csv(file)
# only wikipedia projects
gendata=gendata[gendata.wiki_db.isin(list(wikis['database_code']))]
gendata

Unnamed: 0,wiki_db,category,time_bucket,article_created_value,pageviews_sum_value,pageviews_mean_value,standard_quality_value,standard_quality_count_value,quality_score_value,revision_count_value,...,standard_quality_count_all_categories,quality_score_all_categories,revision_count_all_categories,article_created_all_wikis,pageviews_sum_all_wikis,pageviews_mean_all_wikis,standard_quality_all_wikis,standard_quality_count_all_wikis,quality_score_all_wikis,revision_count_all_wikis
0,azbwiki,agender,2001-01,0,0,0.000,0.0,0,0.000000,0,...,0,0.000000,0,0,0,0.000000,0.000000,0,0.000000,0
1,gdwiki,female,2001-01,0,0,0.000,0.0,0,0.000000,0,...,0,0.000000,0,6,0,0.000000,0.000000,0,0.014345,4
2,azbwiki,agender,2001-02,0,0,0.000,0.0,0,0.000000,0,...,0,0.000000,0,0,0,0.000000,0.000000,0,0.000000,0
3,gdwiki,female,2001-02,0,0,0.000,0.0,0,0.000000,0,...,0,0.000000,0,4,0,0.000000,0.000000,0,0.085508,34
4,azbwiki,agender,2001-03,0,0,0.000,0.0,0,0.000000,0,...,0,0.000000,0,0,0,0.000000,0.000000,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579063,gcrwiki,male,2023-05,1,3209,25.672,0.0,0,0.200151,41,...,0,0.199124,42,28249,3297988323,385.044924,0.123873,1223615,0.315455,2764796
579064,gcrwiki,male,2023-06,0,4801,38.408,0.0,0,0.200151,0,...,0,0.199124,0,27984,3139047243,340.698627,0.124398,1232358,0.315742,2203796
579065,gcrwiki,male,2023-07,0,5349,42.792,0.0,0,0.199923,1,...,0,0.198905,1,29551,3272912280,364.126481,0.125022,1242324,0.316091,2574417
579066,gcrwiki,male,2023-08,0,3876,31.008,0.0,0,0.200222,11,...,0,0.199193,11,29767,3321054560,363.832844,0.125686,1252798,0.316381,2303187


In [106]:
# Since each snapshot contains patial data of the month it is published it in: we filter the data from previous month if the data was pulled too early. 

# Convert 'time_bucket' to datetime for comparison
gendata['time_bucket'] = pd.to_datetime(gendata['time_bucket'])


# If today's day is before the 23rd of the month, filter out the previous month's data
if datetime.now().day < 23:
    prev_month = datetime.now().month - 1 if datetime.now().month > 1 else 12
    gendata = gendata[gendata['time_bucket'].dt.month != prev_month]

In [107]:
# aggregate

old_gen = pd.read_csv('data/updated_gender.csv') # load original gen_data
old_gen['time'] = pd.to_datetime(old_gen['time'])
old_gen # view data

Unnamed: 0,time,women YoY difference (3mo),non-binary YoY difference (3mo),men YoY difference (3mo),female,male,gender_diverse
0,2001-01-01,,,,0,0,0
1,2001-02-01,,,,0,0,0
2,2001-03-01,,,,0,0,0
3,2001-04-01,,,,0,0,0
4,2001-05-01,,,,0,0,0
...,...,...,...,...,...,...,...
265,2023-02-01,,,,335376,1196765,3658
266,2023-03-01,,,,339384,1205254,3722
267,2023-04-01,,,,342734,1213848,3778
268,2023-05-01,,,,345829,1222847,3826


In [108]:
# Define a function to map categories to the new categories
def map_gender_category(category):
    if category in ['male', 'cisgender male']:
        return 'male'
    elif category in ['female', 'cisgender female']:
        return 'female'
    else:
        return 'gender_diverse'

# Create a new column gender3category
gendata['gender3category'] = gendata['category'].apply(map_gender_category)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gendata['gender3category'] = gendata['category'].apply(map_gender_category)


In [109]:
#overtime quality articles
gen3quality=gendata.groupby(['gender3category','time_bucket'])['standard_quality_count_value'].sum().reset_index()

# Pivot the dataframe
gen3quality = gen3quality.pivot(index='time_bucket', columns='gender3category', values='standard_quality_count_value').reset_index()
gen3quality

gender3category,time_bucket,female,gender_diverse,male
0,2001-01-01,0,0,0
1,2001-02-01,0,0,0
2,2001-03-01,0,0,0
3,2001-04-01,0,0,0
4,2001-05-01,0,0,0
...,...,...,...,...
245,2023-04-01,342854,3819,1214538
246,2023-05-01,345976,3868,1223624
247,2023-06-01,348896,3929,1232367
248,2023-07-01,352266,3982,1242333


In [110]:
#aggregate
gen3quality['time'] = pd.to_datetime(gen3quality['time_bucket'])

# Identify the latest 'time' in old_geo_data
latest_time = old_gen['time'].max()

# Filter rows from total quality articles where 'time' is greater than latest_time from old_data to identify rows to be added
filtered_rows = gen3quality[gen3quality['time'] > latest_time]



# Filter columns of last_row based on columns in the original gen data
common_columns = [col for col in filtered_rows.columns if col in old_gen.columns]

filtered_last_rows = filtered_rows[common_columns]


# Append to original gen data
old_gen_data = old_gen.append(filtered_last_rows, ignore_index=True)

old_gen_data.to_csv('old_gen_data.csv')
old_gen_data

  old_gen_data = old_gen.append(filtered_last_rows, ignore_index=True)


Unnamed: 0,time,women YoY difference (3mo),non-binary YoY difference (3mo),men YoY difference (3mo),female,male,gender_diverse
0,2001-01-01,,,,0,0,0
1,2001-02-01,,,,0,0,0
2,2001-03-01,,,,0,0,0
3,2001-04-01,,,,0,0,0
4,2001-05-01,,,,0,0,0
...,...,...,...,...,...,...,...
267,2023-04-01,,,,342734,1213848,3778
268,2023-05-01,,,,345829,1222847,3826
269,2023-06-01,,,,347455,1228506,3858
270,2023-07-01,,,,352266,1242333,3982


In [111]:
#aggregate

if filtered_last_rows.empty:
    print("No data has been changed or added to the original spreadsheet. Will not overwrite updated_gender.csv")
    #rolling_avg_yoy.to_csv('updated_gender.csv')
    print(rolling_avg_yoy)
    
else:
    # Compute the Year-over-Year difference
    rolling_avg_yoy = old_gen_data.copy()

    # Compute the 12-month YoY difference for "female" column
    rolling_avg_yoy['female_YoY_difference'] = rolling_avg_yoy['female'].diff(12)

    # Compute the 3-month rolling average of the YoY difference for "female"
    rolling_avg_yoy['women YoY difference (3mo)'] = rolling_avg_yoy['female_YoY_difference'].rolling(window=3).mean()


    # Compute the 12-month YoY difference for men column
    rolling_avg_yoy['men_YoY_difference'] = rolling_avg_yoy['male'].diff(12)

    # Compute the 3-month rolling average of the YoY difference for  men
    rolling_avg_yoy['men YoY difference (3mo)'] = rolling_avg_yoy['men_YoY_difference'].rolling(window=3).mean()

    # Compute the 12-month YoY difference for gender diverse column
    rolling_avg_yoy['gender_diverse_YoY_difference'] = rolling_avg_yoy['gender_diverse'].diff(12)

    # Compute the 3-month rolling average of the YoY difference for gender_diverse
    rolling_avg_yoy['non-binary YoY difference (3mo)'] = rolling_avg_yoy['gender_diverse_YoY_difference'].rolling(window=3).mean()

    # Get total
    columns_to_sum = ['women YoY difference (3mo)', 'men YoY difference (3mo)', 'non-binary YoY difference (3mo)']
    rolling_avg_yoy['total'] = rolling_avg_yoy[columns_to_sum].sum(axis=1)

    # create proportion column to capture women + gender diverse
    rolling_avg_yoy['proportion_of_non_male'] = rolling_avg_yoy[['women YoY difference (3mo)','non-binary YoY difference (3mo)']].sum(axis=1)/rolling_avg_yoy['total']

    # Reorder columns to make 'time' the first column
    rolling_avg_yoy = rolling_avg_yoy[['time'] + [col for col in rolling_avg_yoy if col != 'time']]
    rolling_avg_yoy = rolling_avg_yoy.rename(columns = {'non-binary YoY difference (3mo)':'gender-diverse YoY difference (3mo)'})

   
    # Round the columns
    columns_to_round = [col for col in rolling_avg_yoy.columns if col not in ['proportion_of_non_male', 'time']]
    for col in columns_to_round:
        rolling_avg_yoy[col] = rolling_avg_yoy[col].round(0)

    #output
    
    rolling_avg_yoy.to_csv('data/updated_gender.csv', index=False)
    print("updating csv file")

creating new dataframe
          time  women YoY difference (3mo)  \
0   2001-01-01                         NaN   
1   2001-02-01                         NaN   
2   2001-03-01                         NaN   
3   2001-04-01                         NaN   
4   2001-05-01                         NaN   
..         ...                         ...   
267 2023-04-01                     42817.0   
268 2023-05-01                     42665.0   
269 2023-06-01                     41905.0   
270 2023-07-01                     41162.0   
271 2023-08-01                     40061.0   

     gender-diverse YoY difference (3mo)  men YoY difference (3mo)  female  \
0                                    NaN                       NaN       0   
1                                    NaN                       NaN       0   
2                                    NaN                       NaN       0   
3                                    NaN                       NaN       0   
4                                 