In [1]:
import wmfdata as wmf
import pandas as pd
import numpy as np
from wmfdata import spark,hive
from datetime import datetime




You are using Wmfdata v2.0.0, but v2.0.1 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md.


In [3]:
file = "https://analytics.wikimedia.org/published/datasets/knowledge_gaps/content_gaps/csv/geography_wmf_region.csv"
geodata = pd.read_csv(file)

# filter for wikipedia projects
wikis=pd.read_csv('data/wikis')
geodata=geodata[geodata.wiki_db.isin(list(wikis['database_code']))]

In [4]:
# Since each snapshot contains patial data of the month it is published it in: we filter the data from previous month if the data was pulled too early. 

# Convert 'time_bucket' to datetime for comparison
geodata['time_bucket'] = pd.to_datetime(geodata['time_bucket'])


# If today's day is before the 23rd of the month, filter out the previous month's data
if datetime.now().day < 23:
    prev_month = datetime.now().month - 1 if datetime.now().month > 1 else 12
    geodata = geodata[geodata['time_bucket'].dt.month != prev_month]

In [5]:
old_geo_data = pd.read_csv('data/updated_geo_data.csv') # load current geo_data table

old_geo_data['time'] = pd.to_datetime(old_geo_data['time'])

old_geo_data # view data

Unnamed: 0,time,Latin America & Caribbean,Central & Eastern Europe & Central Asia,"East, Southeast Asia, & Pacific",North America,Northern & Western Europe,Middle East & North Africa,Sub-Saharan Africa,South Asia
0,2019-01-01,43211,157277,98605,133819,390489,23763,16709,24872
1,2019-02-01,43507,158527,99658,134408,394303,23932,16854,25078
2,2019-03-01,43749,159813,100624,135045,395822,24118,17098,25245
3,2019-04-01,43925,161123,101544,135608,397749,24336,17522,25631
4,2019-05-01,44251,162442,102600,136348,399776,24541,17689,25815
5,2019-06-01,44575,163607,103453,137104,401924,24796,17866,26347
6,2019-07-01,44884,164846,104400,137724,403831,24941,18035,26638
7,2019-08-01,45186,166190,105317,138491,405853,25105,18173,26908
8,2019-09-01,45470,167283,106107,139104,407689,25295,18294,27073
9,2019-10-01,45727,168942,107117,139983,409961,25524,18468,27440


In [6]:
# Group and pivot the geodata
if 'underrepresented' in old_geo_data.columns:
    old_geo_data.drop('underrepresented', axis=1, inplace=True)
    
quality_totals = geodata.groupby(['time_bucket', 'category'])['standard_quality_count_value'].sum().reset_index()
quality_totals_pivot = quality_totals.pivot(index='time_bucket', columns='category', values='standard_quality_count_value').reset_index()

quality_totals_pivot['time'] = pd.to_datetime(quality_totals_pivot['time_bucket'])

# Identify the latest 'time' in old_geo_data
latest_time = old_geo_data['time'].max()

# Filter rows from total quality articles where 'time' is greater than latest_time from old_data to identify rows to be added
filtered_rows = quality_totals_pivot[quality_totals_pivot['time'] > latest_time]
filtered_rows['total'] = filtered_rows.sum(axis=1)


# Filter columns of last_row based on columns in geo_data
common_columns = [col for col in filtered_rows.columns if col in old_geo_data.columns]

filtered_last_rows = filtered_rows[common_columns]

# Append to geo_data
filtered_last_rows = filtered_last_rows.loc[:, old_geo_data.columns]

old_geo_data = old_geo_data.append(filtered_last_rows, ignore_index=True)

old_geo_data


  filtered_rows['total'] = filtered_rows.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_rows['total'] = filtered_rows.sum(axis=1)
  old_geo_data = old_geo_data.append(filtered_last_rows, ignore_index=True)


Unnamed: 0,time,Latin America & Caribbean,Central & Eastern Europe & Central Asia,"East, Southeast Asia, & Pacific",North America,Northern & Western Europe,Middle East & North Africa,Sub-Saharan Africa,South Asia
0,2019-01-01,43211,157277,98605,133819,390489,23763,16709,24872
1,2019-02-01,43507,158527,99658,134408,394303,23932,16854,25078
2,2019-03-01,43749,159813,100624,135045,395822,24118,17098,25245
3,2019-04-01,43925,161123,101544,135608,397749,24336,17522,25631
4,2019-05-01,44251,162442,102600,136348,399776,24541,17689,25815
5,2019-06-01,44575,163607,103453,137104,401924,24796,17866,26347
6,2019-07-01,44884,164846,104400,137724,403831,24941,18035,26638
7,2019-08-01,45186,166190,105317,138491,405853,25105,18173,26908
8,2019-09-01,45470,167283,106107,139104,407689,25295,18294,27073
9,2019-10-01,45727,168942,107117,139983,409961,25524,18468,27440


In [7]:
if filtered_last_rows.empty:
    print("No data has been changed or added to the original spreadsheet. Will not overwrite updated_gender.csv")
    print(old_geo_data)

else:
    # Compute the Year-over-Year difference
    yoy_difference = old_geo_data.drop(columns='time').diff(periods=12)

    # Compute the 3-month rolling average for the entire dataset
    rolling_avg_yoy = yoy_difference.rolling(window=3).mean()

    # Round every value to the nearest whole number (excluding the 'time' column)
    rolling_avg_yoy = rolling_avg_yoy.round(0)

    # Add the 'time' column back
    rolling_avg_yoy['time'] = old_geo_data['time']

    # Reorder columns to make 'time' the first column
    rolling_avg_yoy = rolling_avg_yoy[['time'] + [col for col in rolling_avg_yoy if col != 'time']]
    
    # Create total column
    columns_to_sum = ["Latin America & Caribbean", "Central & Eastern Europe & Central Asia", 
                      "East, Southeast Asia, & Pacific", "North America", "Northern & Western Europe", 
                      "Middle East & North Africa", "Sub-Saharan Africa", "South Asia"]

    # Compute the sum for the specified columns for the last row
    rolling_avg_yoy['total'] = rolling_avg_yoy[columns_to_sum].sum(axis=1)

    # Create percentages

    # List of underrepresented regions
    underrepresented_regions = [
        "East, Southeast Asia, & Pacific", 
        "Latin America & Caribbean", 
        "Middle East & North Africa", 
        "South Asia", 
        "Sub-Saharan Africa"
    ]

    # Compute the sum of underrepresented regions
    rolling_avg_yoy['underrepresented_sum'] = rolling_avg_yoy[underrepresented_regions].sum(axis=1)

    # Calculate the proportion of underrepresented regions to total
    rolling_avg_yoy['underrepresented'] = rolling_avg_yoy['underrepresented_sum'] / rolling_avg_yoy['total']

    # drop underrepresented sum
    rolling_avg_yoy.drop('underrepresented_sum', axis=1, inplace=True)

    # Save the updated geo_data to a CSV
    print("updating csv file")
    rolling_avg_yoy.to_csv('data/updated_geo_data.csv', index=False)


updating csv file
