# Publication Analyses 
This notebook contains the analyses and statistics used in the publication. It is organized by section of the paper, and each heading indicates which section uses the output.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('scripts/')
import statistical_analyses as st

%load_ext autoreload
%autoreload 2

# Aggregate Regional Statistics
Combines country-level statistics spreadsheets by region.

In [None]:
# Build the Brazil spreadsheet (its admins are a mix of states and admin2 districts).
# NOTE(review): 'Rio Grande do Sul' and 'Santa Catarina' are not in this list —
# presumably left out on purpose; confirm against the study extent.
st.create_regional_csv(
    [
        'Acre', 'Alagoas', 'Amapa', 'Amazonas', 'Bahia',
        'Ceara', 'Distrito Federal', 'Espirito Santo', 'Goias', 'Maranhao',
        'Mato Grosso', 'Mato Grosso do Sul', 'Minas Gerais', 'Para', 'Paraiba',
        'Parana', 'Pernambuco', 'Piaui', 'Rio de Janeiro', 'Rio Grande do Norte',
        'Rondonia', 'Roraima', 'Sao Paulo', 'Sergipe', 'Tocantins',
    ],
    'Brazil',
)

In [None]:
# Build the Indonesia spreadsheet from its island-group spreadsheets.
st.create_regional_csv(
    ['Papua', 'Celebes', 'Sumatra', 'Kalimantan', 'Java'],
    'Indonesia',
)

In [None]:
# Latin America ('Brazil' is the spreadsheet name created by the Brazil cell above)
st.create_regional_csv(
    [
        'Argentina', 'Belize', 'Bolivia', 'Brazil', 'Costa Rica',
        'Colombia', 'Chile', 'Ecuador', 'El Salvador', 'French Guiana',
        'Guatemala', 'Honduras', 'Mexico', 'Nicaragua', 'Panama',
        'Paraguay', 'Peru', 'Suriname', 'Venezuela',
    ],
    'lat_am',
)

In [None]:
# Caribbean
# NOTE(review): 'Guyana' is grouped here while 'Suriname' and 'French Guiana'
# are in lat_am — confirm this split is intended.
st.create_regional_csv(
    [
        'Caribbean', 'Cuba', 'Dominican Republic',
        'Haiti', 'Guyana', 'Jamaica',
    ],
    'caribbean',
)

In [None]:
# West Africa
st.create_regional_csv(
    [
        'Benin', 'Burkina Faso', 'Cameroon', 'Gambia',
        'Ghana', 'Guinea', 'Guinea-Bissau', 'Ivory Coast',
        'Liberia', 'Mali', 'Mauritania', 'Niger',
        'Nigeria', 'Senegal', 'Sierra Leone', 'Togo',
    ],
    'west_af',
)

In [None]:
# East Africa
st.create_regional_csv(
    [
        'Burundi', 'Ethiopia', 'Eritrea', 'Kenya', 'Madagascar',
        'Malawi', 'Mozambique', 'Rwanda', 'Somalia', 'Tanzania',
        'Uganda', 'Zambia', 'Zimbabwe',
    ],
    'east_af',
)

In [None]:
# Central / South Africa
st.create_regional_csv(
    [
        'Botswana', 'Chad', 'Central African Republic',
        'Democratic Republic of the Congo', 'Equatorial Guinea', 'Swaziland',
        'Gabon', 'Republic of Congo', 'Sudan', 'South Sudan',
        'Angola', 'Lesotho', 'Namibia', 'South Africa',
    ],
    'central_south_af',
)

In [None]:
# Asia ('Indonesia' is the spreadsheet name created by the Indonesia cell above)
# NOTE(review): this group also holds 'Australia', Pacific islands,
# 'Saudi Arabia' and 'Yemen' — confirm the "asia" label is what the paper reports.
st.create_regional_csv(
    [
        'Australia', 'Bangladesh', 'Brunei', 'Cambodia', 'China',
        'East Timor', 'Fiji', 'Hong Kong', 'India', 'India islands',
        'Indonesia', 'Laos', 'Malaysia', 'Myanmar', 'New Caledonia',
        'Solomon Islands', 'Singapore', 'Sri Lanka', 'Taiwan', 'Thailand',
        'Philippines', 'Papua New Guinea', 'Vanuatu', 'Vietnam', 'Saudi Arabia',
        'Yemen',
    ],
    'asia',
)

# Supplementary Index

## Table 1

In [None]:
def _lcc_summary(country_df, esa_ids, country, check_label):

    '''
    Summarise tree cover for one country and one group of ESA land cover classes.

    Parameters
    ----------
    country_df : DataFrame holding the rows of a single country. Columns read:
        esa_id, country, admin, tree_cover_class, tof_ha, esa_sampled_ha, tof_mean.
    esa_ids : list of ESA class ids that make up the land cover group.
    country : country name, used only in the warning message.
    check_label : short label ('ag' / 'urban') used only in the warning message.

    Returns
    -------
    (perc_over10, avg), both rounded to 2 decimals. Both are NaN when the
    country has no rows for the requested ESA classes.
    '''

    lcc_df = country_df[country_df.esa_id.isin(esa_ids)]

    # A country without this land cover would otherwise fail on the lookups
    # below; report it as missing instead.
    if lcc_df.empty:
        return float('nan'), float('nan')

    # ha per tree cover class
    ha_per_class = (lcc_df[['country', 'tree_cover_class', 'tof_ha']]
                    .groupby(by=['country', 'tree_cover_class'])
                    .sum()
                    .reset_index())

    # Select the >10% classes by label rather than by position: a positional
    # slice silently drops the '10-19' class when a country has no '0-9' rows.
    ha_over10 = ha_per_class.loc[ha_per_class.tree_cover_class != '0-9', 'tof_ha'].sum()

    # total ha (sum of tof ha should equal the sum of esa sampled ha)
    sampled = ha_per_class.tof_ha.sum()

    # sanity check against the sampled area, counted once per admin / esa class
    sampled_check = (lcc_df[['country', 'admin', 'esa_id', 'esa_sampled_ha']]
                     .drop_duplicates()
                     .esa_sampled_ha
                     .sum())

    # compare with a tolerance: the two totals are float sums accumulated in a
    # different order, so exact equality produces spurious warnings
    if not np.isclose(sampled, sampled_check):
        print(f'Warning: {country} does not pass {check_label} check')

    # % of land with >10% cover
    perc_over10 = (ha_over10 / sampled) * 100 if sampled else float('nan')

    # avg tree cover: mean of the distinct (admin, tof_mean) values.
    # NOTE(review): esa_id is not part of the de-duplication key, so two classes
    # in the same admin with an identical tof_mean count once — confirm intended.
    avg = (lcc_df[['country', 'admin', 'tof_mean']]
           .drop_duplicates(ignore_index=True)
           .tof_mean
           .mean())

    return round(float(perc_over10), 2), round(float(avg), 2)


def si_table1(dst_file, regions):

    '''
    For insertion in the supplementary index. Creates a table illustrating the following
    metrics for cropland and urban areas in each country:

    1) Average tree cover
    2) % land containing >10% tree cover.

    Parameters
    ----------
    dst_file : path the table is written to as csv.
    regions : list of region names; each is read from statistics/{region}.csv.

    Returns
    -------
    DataFrame with columns region, country, lcc, perc_over10, avg and two rows
    ('Cropland', 'Urban') per country. Also saved to dst_file.
    '''

    lcc_groups = (('Cropland', [10.0, 11.0, 12.0, 20.0, 30.0, 40.0], 'ag'),
                  ('Urban', [190.0], 'urban'))

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used inside a loop.
    records = []

    for region in regions:

        region_df = pd.read_csv(f'statistics/{region}.csv')

        for country in sorted(set(region_df.country.values)):

            country_df = region_df[region_df.country == country]

            for lcc, esa_ids, check_label in lcc_groups:
                perc_over10, avg = _lcc_summary(country_df, esa_ids, country, check_label)
                records.append({'region': region,
                                'country': country,
                                'lcc': lcc,
                                'perc_over10': perc_over10,
                                'avg': avg})

    table = pd.DataFrame(records, columns=['region', 'country', 'lcc', 'perc_over10', 'avg'])
    table.to_csv(dst_file, index=False)

    return table

In [None]:
# Build the SI table for every region; `si` is reused by later cells.
si = si_table1(
    'statistics/si_table1.csv',
    ['lat_am', 'caribbean', 'west_af', 'east_af', 'central_south_af', 'asia'],
)

# Main Paper

## Results

In [None]:
def total_ha(dst_file, regions):

    '''
    For insertion of statistics in the "Results" section. Creates a table illustrating
    the following metrics for cropland and urban areas in each country:

    1) Total ha above 10% tree cover
    2) Total ha across all tree cover classes

    Also prints the cropland and urban >10% totals summed over all countries.

    Parameters
    ----------
    dst_file : path the table is written to as csv.
    regions : list of region names; each is read from statistics/{region}.csv.

    Returns
    -------
    DataFrame with columns region, country, lcc, ha_over10, ha_total and two
    rows ('Cropland', 'Urban') per country. Also saved to dst_file.
    '''

    lcc_groups = (('Cropland', [10.0, 11.0, 12.0, 20.0, 30.0, 40.0]),
                  ('Urban', [190.0]))

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used inside a loop.
    records = []

    for region in regions:

        region_df = pd.read_csv(f'statistics/{region}.csv')

        for country in sorted(set(region_df.country.values)):

            country_df = region_df[region_df.country == country]

            for lcc, esa_ids in lcc_groups:
                lcc_df = country_df[country_df.esa_id.isin(esa_ids)]

                # everything outside the '0-9' class counts as >10% cover
                over10 = lcc_df.loc[lcc_df.tree_cover_class != '0-9', 'tof_ha'].sum()

                records.append({'region': region,
                                'country': country,
                                'lcc': lcc,
                                'ha_over10': over10,
                                'ha_total': lcc_df.tof_ha.sum()})

    table = pd.DataFrame(records, columns=['region', 'country', 'lcc', 'ha_over10', 'ha_total'])
    table.to_csv(dst_file, index=False)

    # Look totals up by label, not by position, so the printout cannot swap
    # classes and an empty table does not raise.
    over10_by_lcc = table.groupby('lcc')['ha_over10'].sum()
    print(f'Total ha of cropland >10%: {over10_by_lcc.get("Cropland", 0)}')
    print(f'Total ha of urban land >10%: {over10_by_lcc.get("Urban", 0)}')

    return table


In [None]:
# Cropland / urban hectare totals for every region; `results` is reused below.
results = total_ha(
    'statistics/results_totalha.csv',
    ['lat_am', 'caribbean', 'west_af', 'east_af', 'central_south_af', 'asia'],
)

In [None]:
# Run the same totals on a single statistics spreadsheet
# (reads statistics/Costa Rica_statistics_full_tmlonly.csv, writes statistics/test.csv).
total_ha(
    'statistics/test.csv',
    ['Costa Rica_statistics_full_tmlonly'],
)

In [None]:
# Share of all sampled cropland that has >10% tree cover.
# Both totals are derived from `results` instead of being pasted in by hand, so
# the figure cannot go stale when the statistics files change.
# NOTE(review): the values previously hard-coded here were 525597003 ha (>10%)
# and 1001823223 ha (total) — confirm the computed totals reproduce them.
crop = results[results.lcc == 'Cropland']
total_cropland = crop.ha_total.sum()
cropland_over10 = crop.ha_over10.sum()
round((cropland_over10 / total_cropland) * 100, 2)

In [None]:
def total_ha_pantropics(dst_file, regions=None):

    '''
    Creates a table with the total ha above 10% tree cover for each country,
    summed over every ESA land cover class present in the region spreadsheets
    (no cropland / urban filter is applied). Also prints the grand total.

    Parameters
    ----------
    dst_file : path the table is written to as csv.
    regions : optional list of region names, each read from
        statistics/{region}.csv. Defaults to the six regions used in the paper.

    Returns
    -------
    DataFrame with columns region, country, ha_over10. Also saved to dst_file.
    '''

    if regions is None:
        regions = ['lat_am', 'caribbean', 'west_af', 'east_af', 'central_south_af', 'asia']

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used inside a loop.
    records = []

    for region in regions:

        region_df = pd.read_csv(f'statistics/{region}.csv')

        for country in sorted(set(region_df.country.values)):

            country_df = region_df[region_df.country == country]

            # everything outside the '0-9' class counts as >10% cover
            over10 = country_df.loc[country_df.tree_cover_class != '0-9', 'tof_ha'].sum()

            records.append({'region': region,
                            'country': country,
                            'ha_over10': over10})

    table = pd.DataFrame(records, columns=['region', 'country', 'ha_over10'])
    table.to_csv(dst_file, index=False)
    print(f'Total ha of land >10% in the tropics: {table.ha_over10.sum()}')

    return table

In [None]:
# Per-country ha above 10% tree cover across all land cover classes.
total = total_ha_pantropics(dst_file='statistics/results_totalha_pantropics.csv')

## Table 3
The extent of trees on, and tree cover of, urban and cropland within the tropics. 

In [None]:
def main_table3(dst_file, src_file='statistics/si_table1.csv'):

    '''
    Creates a table illustrating the following metrics for cropland and urban areas in each region:
    1) Average tree cover per land cover class.
    2) % cropland and urban land containing >10% tree cover.

    Each regional value is the unweighted mean of the country-level values in
    the SI table, rounded to 2 decimals.

    Parameters
    ----------
    dst_file : path the table is written to as csv.
    src_file : path of the SI table csv to aggregate (columns region, country,
        lcc, perc_over10, avg). Defaults to the file written for SI Table 1.

    Returns
    -------
    DataFrame with columns region, lcc, perc_over10, avg. Also saved to dst_file.
    '''
    df = pd.read_csv(src_file)

    # Average only the metric columns: the string `country` column makes a bare
    # groupby().mean() raise TypeError on pandas >= 2.0.
    metric_cols = ['perc_over10', 'avg']
    region_avg = df.groupby(['region', 'lcc'])[metric_cols].mean().reset_index()
    region_avg = region_avg.round(2)

    region_avg.to_csv(dst_file, index=False)

    return region_avg

In [None]:
# Regional averages of the SI table, saved for Table 3.
df = main_table3(dst_file='statistics/main_table3.csv')

In [None]:
# Display whatever `df` currently holds (the main_table3 result when cells run in order).
df

In [None]:
# Mean of perc_over10 over the 'Cropland' rows of `df`
# (unweighted: every row counts equally).
cropland = df.loc[df['lcc'] == 'Cropland']
mean_cropland = cropland['perc_over10'].mean()
mean_cropland

In [None]:
# Mean of perc_over10 over the 'Urban' rows of `df`
# (unweighted: every row counts equally).
urban = df.loc[df['lcc'] == 'Urban']
mean_urban = urban['perc_over10'].mean()
mean_urban

In [None]:
def main_table1(region):

    '''
    Average tree cover for a region and for each of its countries.

    Reads statistics/{region}.csv, keeps one row per country / admin /
    land cover class, averages tof_mean per country, then averages those
    country means (unweighted) and rounds the result to 2 decimals.

    Returns (country_avg, regional_avg): the per-country dataframe first,
    the regional statistic second.
    '''

    keep_cols = ['country', 'admin', 'esa_id', 'esa_class', 'tof_mean']

    # one line per lcc, per admin, per country
    per_class = pd.read_csv(f'statistics/{region}.csv')[keep_cols].drop_duplicates(ignore_index=True)

    # mean tree cover of each country
    country_avg = per_class.groupby('country')['tof_mean'].mean().reset_index()

    # mean of the country means
    regional_avg = round(country_avg['tof_mean'].mean(), 2)

    return country_avg, regional_avg


In [None]:
# Latin America: per-country averages and the regional average.
la_country_avg, la_regional_avg = main_table1(region='lat_am')

In [None]:
# Regional average tree cover for lat_am.
la_regional_avg

In [None]:
# Per-country average tree cover for lat_am.
la_country_avg

In [None]:
# Country-level averages for the GFW comparison countries.
# The West and East Africa averages are computed here because the cells that
# define `wa_country_avg` / `ea_country_avg` sit further down the notebook;
# without this the cell raises NameError on Restart & Run All.
wa_country_avg, wa_regional_avg = main_table1('west_af')
ea_country_avg, ea_regional_avg = main_table1('east_af')

df1 = la_country_avg[la_country_avg.country.isin(['Costa Rica', 'Nicaragua', 'El Salvador'])]
df2 = wa_country_avg[wa_country_avg.country == 'Senegal']
df3 = ea_country_avg[ea_country_avg.country == 'Rwanda']
comb2 = pd.concat([df1, df2, df3], ignore_index=True)
comb2.to_csv('statistics/GFWcomp_countryavg.csv', index=False)

In [None]:
# West Africa: per-country averages and the regional average.
wa_country_avg, wa_regional_avg = main_table1(region='west_af')

In [None]:
# Regional average tree cover for west_af.
wa_regional_avg

In [None]:
# Per-country average tree cover for west_af.
wa_country_avg

In [None]:
# East Africa: per-country averages and the regional average.
ea_country_avg, ea_regional_avg = main_table1(region='east_af')

In [None]:
# Regional average tree cover for east_af.
ea_regional_avg

In [None]:
# Per-country average tree cover for east_af.
ea_country_avg

## Discussion

### Agricultural tree cover

In [None]:
# Figure out total tree cover >10%, >20% and >30% for Central America

def zomer_comparison(region):

    '''
    Prints the percentage of agricultural land in a region that has more than
    10%, 20% and 30% tree cover.

    Reads statistics/{region}.csv, keeps the cropland ESA classes, sums tof_ha
    per tree cover class and expresses the ha above each threshold as a share
    of the total cropland ha.

    Parameters
    ----------
    region : region name; the data is read from statistics/{region}.csv.

    Returns
    -------
    None. The three percentages are printed, rounded to 2 decimals.
    '''

    df = pd.read_csv(f'statistics/{region}.csv')

    ag_ids = [10.0, 11.0, 12.0, 20.0, 30.0, 40.0]

    # Keep only the columns that are aggregated: summing the whole frame also
    # "sums" the text columns, which is wasted work and can fail on mixed types.
    ag = df.loc[df.esa_id.isin(ag_ids), ['country', 'tree_cover_class', 'tof_ha']]

    # total ha per threshold per country
    ha = ag.groupby(by=['country', 'tree_cover_class'])['tof_ha'].sum().reset_index()

    total = float(ha.tof_ha.sum())

    # no cropland rows (or zero area): percentages are undefined
    if total == 0:
        print(f'No ag land found in {region}')
        return None

    def perc_excluding(classes):
        # share of cropland ha that falls outside the given tree cover classes
        kept = ha.loc[~ha.tree_cover_class.isin(classes), 'tof_ha']
        return float(kept.sum()) / total * 100

    over10_perc = perc_excluding(['0-9'])
    over20_perc = perc_excluding(['0-9', '10-19'])
    over30_perc = perc_excluding(['0-9', '10-19', '20-29'])

    print(f'Total percentage of ag land in {region} >10%: {round((over10_perc),2)}%')
    print(f'Total percentage of ag land in {region} >20%: {round((over20_perc),2)}%')
    print(f'Total percentage of ag land in {region} >30%:  {round((over30_perc),2)}%')

    return None

In [None]:
# NOTE(review): no cell visible in this notebook creates statistics/central_am.csv
# with st.create_regional_csv — confirm the file is produced elsewhere.
zomer_comparison(region='central_am')

In [None]:
# Rank regions by the percentage of cropland with >10% tree cover (highest first).
df = pd.read_csv('statistics/main_table3.csv')
top_crop = df.loc[df['lcc'] == 'Cropland']
top_crop.sort_values('perc_over10', ascending=False)

### Urban tree cover

In [None]:
# Top 20 urban rows of the SI table, then the ones that are in asia.
# NOTE(review): the ranking uses perc_over10 (% of land >10% cover), not the
# `avg` tree cover column — confirm which one the text refers to.
urban = si.loc[si['lcc'] == 'Urban']
top20 = urban.sort_values('perc_over10', ascending=False).iloc[:20]
top20.loc[top20['region'] == 'asia']

In [None]:
# Top-20 urban rows that are in Latin America or the Caribbean.
top20[top20.region.isin(['lat_am', 'caribbean'])]

## Produce total ha tree cover per 10% threshold

In [None]:
# Load the Latin America statistics; note this rebinds `df`.
df = pd.read_csv('statistics/lat_am.csv')

In [None]:
# Keep only the Latin American countries of interest.
interest = ['Costa Rica', 'El Salvador', 'Nicaragua']
la = df.loc[df['country'].isin(interest)]

In [None]:
# Keep the columns needed for the per-class totals; the aggregation itself
# happens once all countries have been concatenated.
la = la.loc[:, ['country', 'tree_cover_class', 'tof_ha']]

In [None]:
# Senegal rows from the West Africa statistics, same three columns.
df = pd.read_csv('statistics/west_af.csv')
sen = df.loc[df['country'] == 'Senegal', ['country', 'tree_cover_class', 'tof_ha']]

In [None]:
# Rwanda rows from the East Africa statistics, same three columns.
df = pd.read_csv('statistics/east_af.csv')
wanda = df.loc[df['country'] == 'Rwanda', ['country', 'tree_cover_class', 'tof_ha']]

In [None]:
# Stack the comparison countries and total tof_ha per country and tree cover class.
stacked = pd.concat([la, sen, wanda], ignore_index=True)
comb = stacked.groupby(['country', 'tree_cover_class']).sum()

In [None]:
# Save the totals; the index is written too, so the group keys end up in the csv.
comb.to_csv('statistics/GFWcomp_totalha.csv')