# Data Load

In [None]:
import datetime as dt
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from tqdm import tqdm_notebook
import pickle
import time
from tqdm import tqdm
from sklearn import metrics
import gc
import statsmodels.formula.api as smf
import statsmodels.api as sm
from collections import Counter#<---value count for list
from sklearn.model_selection import StratifiedKFold

In [None]:
# Select the target species.
# Each supported species is listed exactly once; change TARGET_SPECIES to switch.
# (Previously the three settings were assigned one after another, so only the
# last assignment took effect and switching species meant reordering the cell.)
SPECIES_SETTINGS = {
    "nutwoo": {"bird_name": "Nuttall's Woodpecker", "bcr_id": '32'},
    "recwoo": {"bird_name": "Red-cockaded Woodpecker", "bcr_id": '27'},
    "lewwoo": {"bird_name": "Lewis’s Woodpecker", "bcr_id": '9 and 10'},
}

# Same species the sequential assignments ended up selecting.
TARGET_SPECIES = "lewwoo"

if TARGET_SPECIES not in SPECIES_SETTINGS:
    raise KeyError(
        f"Unknown species id {TARGET_SPECIES!r}; expected one of {sorted(SPECIES_SETTINGS)}"
    )

# Names consumed by the rest of the notebook.
file_id = TARGET_SPECIES
bird_name = SPECIES_SETTINGS[file_id]["bird_name"]
bcr_id = SPECIES_SETTINGS[file_id]["bcr_id"]

In [None]:
# Directory that holds the input CSV files.
PATH = '/content/drive/My Drive/Colab Notebooks/dissertation/'

# The file name embeds the id of the selected species (file_id).
ebird_csv = f'{PATH}ebird_ss_{file_id}_add30yMonth.csv'
ebird_ss = pd.read_csv(ebird_csv)

## Define useful functions

In [None]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned. A column named
    ``'time'`` and every column whose dtype is not one of the listed
    int/float dtypes are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        Print the resulting memory footprint and the relative reduction.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits
        are stored as float16. Half precision keeps only about three
        significant decimal digits, so values are rounded; pass False to use
        float32 as the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col == 'time':
            continue
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extremes sit exactly on a
            # dtype's limits (e.g. 127 for int8) fits that dtype. The old
            # strict comparison pushed such columns one size up and could even
            # widen a column that was already at its dtype's limit.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when no narrower type fits; this includes
            # columns whose min/max are NaN or infinite, because every
            # comparison against them is False.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Drop/fix NaN values

In [None]:
# Check: a column whose isna() mask has two distinct values contains some,
# but not only, missing entries. Sorting puts the True flags at the end.
nan_mask_summary = ebird_ss.isna().describe()
print((nan_mask_summary.loc['unique'] == 2).sort_values())

In [None]:
# Per-species missing-value handling, keyed by file_id:
#   report    - columns whose NaN counts are printed first
#   zero_fill - columns whose NaN entries are replaced by 0
#   drop_rows - whether rows that still contain NaN are dropped afterwards
# 'observation_count' is removed for every known species.
nan_rules = {
    # Nuttall's Woodpecker
    "nutwoo": {
        "report": ['prec30_cv', 'prec180_cv', 'observation_count'],
        "zero_fill": ['prec30_cv', 'prec180_cv'],
        "drop_rows": False,
    },
    # Red-cockaded Woodpecker
    "recwoo": {
        "report": ['prec30_cv', 'elevation_median', 'elevation_sd', 'observation_count'],
        "zero_fill": [],
        "drop_rows": True,
    },
    # Lewis’s Woodpecker
    "lewwoo": {
        "report": ['prec30_cv', 'observation_count'],
        "zero_fill": ['prec30_cv'],
        "drop_rows": False,
    },
}

nan_rule = nan_rules.get(file_id)
if nan_rule is None:
    print('Missing file_id')
else:
    for nan_col in nan_rule["report"]:
        print(ebird_ss[nan_col].isna().value_counts())

    for nan_col in nan_rule["zero_fill"]:
        ebird_ss.loc[ebird_ss[nan_col].isna(), nan_col] = 0

    ebird_ss.drop(columns=['observation_count'], inplace=True)

    if nan_rule["drop_rows"]:
        # Remove every row that still has a missing value, then renumber.
        ebird_ss.dropna(inplace=True)
        ebird_ss.reset_index(drop=True, inplace=True)

In [None]:
# Downcast the numeric columns to narrower dtypes. reduce_mem_usage modifies the
# frame in place, returns that same frame, and (verbose defaults to True) prints
# the resulting memory footprint.
ebird_ss=reduce_mem_usage(ebird_ss)

## Set variables for climatic data comparison

In [None]:
def _climatic_window_columns(days):
    """Return the four prec*/tmp* column names that carry the given window suffix."""
    return [
        f'prec{days}_mean',
        f'tmp{days}_mean',
        f'tmp{days}_std',
        f'prec{days}_cv',
    ]


variables_climatic_long = [
    'bio1',   # Annual Mean Temperature
    'bio4',   # Temperature Seasonality (standard deviation ×100)
    'bio12',  # Annual Precipitation
    'bio15',  # Precipitation Seasonality (Coefficient of Variation)
]

variables_effort = [
    'time_observations_started',
    'duration_minutes',
    'effort_distance_km',
    'number_observers',
]

# One list per window; the suffix in each list name is the number in the column names.
variables_climatic_365 = _climatic_window_columns(365)
variables_climatic_730 = _climatic_window_columns(730)
variables_climatic_1095 = _climatic_window_columns(1095)
variables_climatic_1460 = _climatic_window_columns(1460)
variables_climatic_1825 = _climatic_window_columns(1825)

# Random Forest

## Train the model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Settings shared by every random forest fitted in this cell.
params = dict(
    n_estimators=100,
    random_state=0,
    verbose=0,
    n_jobs=-1,
    class_weight='balanced_subsample',
    max_samples=0.1,
)

# One fixed, shuffled 5-fold partition stratified on species_observed; the
# same folds are reused for every variable set so the results are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=72)
splits = list(cv.split(ebird_ss, ebird_ss[['species_observed']]))

# Column sets to compare. The target column always comes first and the
# predictors follow it.
variables_list = [
    ['species_observed'] + variables_effort,
    ['species_observed'] + variables_climatic_long + variables_effort,
]
variables_list += [
    ['species_observed'] + variables_effort + window_columns
    for window_columns in (
        variables_climatic_365,
        variables_climatic_730,
        variables_climatic_1095,
        variables_climatic_1460,
        variables_climatic_1825,
    )
]

# Display names, in the same order as variables_list.
variables_labels = [
    'effort',
    'effort + 30 years climatic average',
    'effort + 1 year climatic average',
    'effort + 2 year climatic average',
    'effort + 3 year climatic average',
    'effort + 4 year climatic average',
    'effort + 5 year climatic average',
]

# df_RF_ls_ls[fold][variable set] -> DataFrame with columns prediction / Actual.
df_RF_ls_ls = []
for i, (train_idx, test_idx) in enumerate(splits):
    print(f'=====Start {i+1}-fold=====')
    df_RF_ls = []
    for j, (variables, label) in enumerate(zip(variables_list, variables_labels)):
        predictors = variables[1:]

        # Rows of this fold, restricted to the columns of this variable set.
        train_x = ebird_ss.iloc[train_idx, :][variables]
        test_x = ebird_ss.iloc[test_idx, :][variables]

        train_X, train_y = train_x[predictors], train_x['species_observed']
        test_X, test_y = test_x[predictors], test_x['species_observed']

        defa2 = RandomForestClassifier(**params)
        defa2.fit(train_X.values, train_y.values)

        # Keep only the second probability column.
        # NOTE(review): assumes exactly two classes, ordered absent then present.
        pred = defa2.predict_proba(test_X.values)
        tmp1 = pd.DataFrame(pred, columns=["absence", "prediction"])[['prediction']]
        tmp2 = pd.DataFrame({'Actual': test_y.values}).astype(int)
        df_RF = pd.concat([tmp1, tmp2], axis=1)
        df_RF_ls.append(df_RF)

        fpr, tpr, thresholds = metrics.roc_curve(
            df_RF.Actual.values, df_RF.prediction.values, pos_label=None
        )
        print(f'AUC of {label} variables:{metrics.auc(fpr, tpr):.4f}')

    df_RF_ls_ls.append(df_RF_ls)

## Preparation for the comparison

In [None]:
# Arrange the nested result lists as a table: one row per fold, one column per
# variable set; each cell holds the prediction/Actual DataFrame of that run.
df_converted=pd.DataFrame(df_RF_ls_ls)

In [None]:
# Pool the per-fold results into one table `base`:
# one prediction column per variable set plus the shared 'Actual' column,
# with the folds stacked on top of each other in fold order.
#
# The table dimensions come from df_converted instead of hard-coded 5 / 7, and
# the 'Actual' column is read from an explicitly chosen column rather than from
# a loop variable left over from a previous loop.
n_folds, n_models = df_converted.shape

pooled_columns = [
    pd.concat(
        [df_converted.iloc[fold, model]['prediction'] for fold in range(n_folds)],
        ignore_index=True,
    )
    for model in range(n_models)
]

# The observed labels are read from the last variable set.
# NOTE(review): every variable set in a fold is presumably scored on the same
# test rows, so any column would give the same labels - confirm against the
# cell that builds the results.
pooled_columns.append(
    pd.concat(
        [df_converted.iloc[fold, n_models - 1]['Actual'] for fold in range(n_folds)],
        ignore_index=True,
    )
)

base = pd.concat(pooled_columns, axis=1)
base.columns = ['effort', 'effort+30y', 'effort+1y', 'effort+2y',
                'effort+3y', 'effort+4y', 'effort+5y', 'Actual']

## Compare the AUC

In [None]:
# ROC AUC of each prediction column of `base` (every column except the last,
# 'Actual'), computed against the 'Actual' labels.
AUC_ls = []
actual_labels = base['Actual'].values
for model_name in base.columns[:-1]:
    fpr, tpr, thresholds = metrics.roc_curve(
        actual_labels, base[model_name].values, pos_label=None
    )
    AUC = metrics.auc(fpr, tpr)
    AUC_ls.append(AUC)
    print(f'AUC of {model_name}: {AUC:.4f}')

In [None]:
# Bar chart of the AUC per variable set, annotated with the rounded value.
fig, ax = plt.subplots(1, figsize=(5, 4))

bar_colors = [f'C{k}' for k in range(7)]
ax.bar(base.columns[:-1], AUC_ls, color=bar_colors)

ax.set_xlabel('variables', size=12)
ax.set_ylabel('AUC', size=12)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylim([0.5, 1.0])

# Write each value just above its bar, shifted left of the bar centre.
for index, value in enumerate(AUC_ls):
    ax.text(index - 0.45, value + 0.005, str(np.round(value, 3)), size=12)

fig.savefig(f'clm_comparison_AUC_{file_id}.png', bbox_inches='tight')
plt.show()

## Compare the calibration plots

In [None]:
def _calibration_bins(pred, obs, bin_width, include_lowest):
    """Cut predictions into equal-width bins on [0, 1].

    Parameters
    ----------
    pred : array-like
        Predicted values to be binned.
    obs : array-like
        Observed values, aligned with ``pred``.
    bin_width : float
        Width of each bin.
    include_lowest : bool
        Forwarded to ``pandas.cut``: whether the first bin includes its left edge.

    Returns
    -------
    centers : numpy.ndarray
        Mid-point of every bin, in bin order.
    frame : pandas.DataFrame
        Columns 'prediction' (the bin of each prediction, categorical) and
        'observed' (``obs`` unchanged).
    """
    n_bins = int(round(1 / bin_width))
    # linspace fixes the number of edges; a float-step arange can be off by one.
    edges = np.linspace(0, 1, n_bins + 1)
    centers = (edges[:-1] + edges[1:]) / 2
    # Build the frame from a dict so each array becomes a column. Recent pandas
    # treats a *list* of arrays as rows, which no longer matches two column names.
    frame = pd.DataFrame({
        'prediction': pd.cut(pred, bins=edges, include_lowest=include_lowest),
        'observed': obs,
    })
    return centers, frame


fig, axes = plt.subplots(1, 3, figsize=(15, 5))

rf_obs = base.Actual.values.astype(float)
coarse_width = 0.05  # bin width of the first calibration panel
fine_width = 0.02    # bin width of the second calibration panel and the histogram panel

for label in base.columns[:-1]:
    rf_pred = base[label].values

    # Panel 0: calibration with coarse bins.
    # observed=False keeps empty bins (as NaN) so the means stay aligned with
    # the bin centres whatever the pandas default is.
    centers, df_calib = _calibration_bins(rf_pred, rf_obs, coarse_width, include_lowest=True)
    axes[0].scatter(centers,
                    df_calib.groupby('prediction', observed=False)['observed'].mean().values,
                    alpha=0.5, label=label)

    # Panels 1 and 2: fine bins.
    # NOTE(review): include_lowest=False (unlike panel 0) leaves predictions that
    # are exactly 0 without a bin, so they are excluded from both the calibration
    # points and the frequency curve. Kept as in the original - confirm intent.
    centers, df_calib = _calibration_bins(rf_pred, rf_obs, fine_width, include_lowest=False)
    axes[1].scatter(centers,
                    df_calib.groupby('prediction', observed=False)['observed'].mean().values,
                    alpha=0.5, label=label)
    axes[2].plot(centers,
                 df_calib['prediction'].value_counts(normalize=True).sort_index().values,
                 alpha=0.5, marker="*", label=label)

# Axis decoration is the same for every label, so it is applied once.
for calib_ax, width in ((axes[0], coarse_width), (axes[1], fine_width)):
    calib_ax.plot([0, 1], [0, 1], 'k--')  # perfect-calibration diagonal
    calib_ax.set_xlabel('Predicted encounter rate')
    calib_ax.set_ylabel('Observed encounter rate')
    calib_ax.set_title(f'Calibration plot (group size = {width})')

axes[2].set_xlabel('Predicted encounter rate')
axes[2].set_ylabel('Frequency (percentage)')
axes[2].set_title(f'Distribution of the prediction (group size = {fine_width})')

for panel in axes:
    panel.legend(loc='best')

fig.savefig(f'clm_comparison_calib_{file_id}.png', bbox_inches='tight')
plt.show()