# Data Load

In [None]:
import datetime as dt 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from tqdm import tqdm_notebook
import pickle
import time
from tqdm import tqdm
from sklearn import metrics
import gc
import statsmodels.formula.api as smf
import statsmodels.api as sm
from collections import Counter#<---value count for list
from sklearn.model_selection import StratifiedKFold

In [None]:
#Select the target species
# Every species this notebook supports, keyed by the short id used in file
# names. Each entry is (display name, BCR label used in figure titles).
SPECIES = {
    "nutwoo": ("Nuttall's Woodpecker", '32'),
    "recwoo": ("Red-cockaded Woodpecker", '27'),
    "lewwoo": ("Lewis’s Woodpecker", '9 and 10'),
}

# Change this single line to switch species. Previously all three
# configurations were assigned one after another, so only the last one
# ("lewwoo") ever took effect.
file_id = "lewwoo"
bird_name, bcr_id = SPECIES[file_id]

In [None]:
# Root folder holding the input CSVs; figures are written to its `figures/` subfolder.
PATH = '/content/drive/My Drive/Colab Notebooks/dissertation/'

# Load the checklist table for the species chosen via `file_id`.
ebird_ss = pd.read_csv(f'{PATH}ebird_ss_{file_id}_add30yMonth.csv')

## Define useful functions

In [None]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Each int16/int32/int64 and float16/float32/float64 column, except one
    named ``'time'``, is converted to the narrowest integer or float dtype
    whose representable range contains the column's minimum and maximum.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    allow_float16 : bool, default True
        Allow float columns to be stored as float16. The range check says
        nothing about precision: float16 keeps only about three significant
        decimal digits. Pass ``False`` to use float32 as the narrowest float
        type (float16 columns are then widened to float32).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col == 'time':
            continue
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a dtype's limit still
            # fits. With strict comparisons an int16 column whose maximum is
            # 32767 was widened to int32.
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,  # e.g. empty column (min/max are NaN): leave it as is
            )
        else:
            # Falls back to float64 when nothing narrower fits, including
            # all-NaN or infinite columns, where the comparisons are False.
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Drop/fix NaN values

In [None]:
#Check
# One boolean per column: True when the column holds at least one missing
# value. `isna().any()` also flags a column that is entirely NaN, which the
# previous "exactly two unique isna() values" test reported as clean.
print(ebird_ss.isna().any().sort_values())

In [None]:
# Missing-value policy per species:
#   inspect      - columns whose NaN / non-NaN counts are printed first
#   zero_fill    - columns in which NaN is replaced by 0
#   drop_na_rows - drop every row that still holds a NaN, then reindex
# `observation_count` is removed for every species.
NAN_POLICY = {
    # For Nuttall's Woodpecker
    "nutwoo": {
        "inspect": ['prec30_cv', 'prec180_cv', 'observation_count'],
        "zero_fill": ['prec30_cv', 'prec180_cv'],
        "drop_na_rows": False,
    },
    # For "Red-cockaded Woodpecker"
    "recwoo": {
        "inspect": ['prec30_cv', 'elevation_median', 'elevation_sd', 'observation_count'],
        "zero_fill": [],
        "drop_na_rows": True,
    },
    # For "Lewis’s Woodpecker"
    "lewwoo": {
        "inspect": ['prec30_cv', 'observation_count'],
        "zero_fill": ['prec30_cv'],
        "drop_na_rows": False,
    },
}

nan_policy = NAN_POLICY.get(file_id)
if nan_policy is None:
    print('Missing file_id')
else:
    for column in nan_policy["inspect"]:
        # Skip columns that are already gone so the cell can be re-run.
        if column in ebird_ss.columns:
            print(ebird_ss[column].isna().value_counts())

    for column in nan_policy["zero_fill"]:
        ebird_ss[column] = ebird_ss[column].fillna(0)

    # errors='ignore' keeps a second run from failing once the column is gone.
    ebird_ss = ebird_ss.drop(columns=['observation_count'], errors='ignore')

    if nan_policy["drop_na_rows"]:
        ebird_ss = ebird_ss.dropna().reset_index(drop=True)

In [None]:
# Downcast the numeric columns to narrower dtypes to cut memory use.
# NOTE(review): with its default settings the helper may store float columns as
# float16, which keeps only about three significant digits - confirm that is
# acceptable for the analysis that follows.
ebird_ss=reduce_mem_usage(ebird_ss)

## Set variables for use

In [None]:
# Window lengths that appear in the names of the prec/tmp/tmax/tmin summary columns.
window_days = (30, 180, 365, 730, 1095, 1460, 1825)

# Columns used downstream, assembled from their naming patterns. The order is
# the same as in the hand-written list this replaces.
variables_base = [
    # response and observation-effort columns
    'species_observed',
    'time_observations_started',
    'duration_minutes',
    'effort_distance_km',
    'number_observers',
    # bio1 ... bio19
    *(f'bio{k}' for k in range(1, 20)),
    # prec<window>_mean
    *(f'prec{d}_mean' for d in window_days),
    # <var><window>_mean / <var><window>_std, grouped by variable, then window
    *(f'{var}{d}_{stat}'
      for var in ('tmp', 'tmax', 'tmin')
      for d in window_days
      for stat in ('mean', 'std')),
    # prec<window>_cv
    *(f'prec{d}_cv' for d in window_days),
    # <var>_30y_monthly
    *(f'{var}_30y_monthly'
      for var in ('tmin', 'tmax', 'tavg', 'prec', 'srad', 'wind', 'vapr')),
    # pland_<two-digit code>_<class>; 'mosiac' is spelled as in the column name
    *(f'pland_{code:02d}_{cover}'
      for code, cover in enumerate((
          'water',
          'evergreen_needleleaf',
          'evergreen_broadleaf',
          'deciduous_needleleaf',
          'deciduous_broadleaf',
          'mixed_forest',
          'closed_shrubland',
          'open_shrubland',
          'woody_savanna',
          'savanna',
          'grassland',
          'wetland',
          'cropland',
          'urban',
          'mosiac',
          'barren',
      ))),
    'elevation_median',
    'elevation_sd',
]

# pland_03_deciduous_needleleaf is removed for nutwoo.
if file_id == "nutwoo":
    variables_base.remove('pland_03_deciduous_needleleaf')

# Number of checklists in BCR 32 over the last 10 years

In [None]:
# Yearly checklist totals for BCR 32, entered by hand.
year = list(range(2010, 2020))
checklist = [35897, 48820, 69221, 96099, 111156, 130727, 143785, 172502, 217373, 239270]

fig, ax = plt.subplots(1, figsize=(7, 5))
ax.bar(year, checklist)

# Write each total just above its bar.
for bar_year, n_checklists in zip(year, checklist):
    ax.text(bar_year - 0.5, n_checklists + 3500, str(n_checklists))

ax.set_xlabel('Year', size=15)
ax.set_ylabel('Number of checklists in BCR 32', size=15)
fig.savefig('temporal_bcr32.png', bbox_inches='tight')

# Exploratory data analysis

In [None]:
# Names of all columns starting with 'pland_', in the order they appear in the table.
filter_col = [name for name in ebird_ss.columns if name.startswith('pland_')]

In [None]:
# Subset of the bio* columns shown in the "Climatic" boxplots below.
# NOTE(review): presumably the WorldClim bioclimatic variables - confirm the
# meaning of each code against the data source.
climatic_col=['bio1','bio4', 'bio12', 'bio15']

In [None]:
# Bin the start time into 24 one-unit bins [0, 1), [1, 2), ..., [23, 24).
# Left-closed bins (right=False) keep values equal to 0 and put a value that
# sits exactly on an edge into the bin it starts. The previous right-closed
# bins with include_lowest=False turned 0 into NaN.
# NOTE(review): assumes time_observations_started is decimal hours in [0, 24) -
# a value of exactly 24 would now fall outside the bins; confirm against the data.
time_bin = pd.cut(
    ebird_ss.time_observations_started,
    bins=np.arange(0, 25, 1),
    right=False,
)

# EDA table: binned start time, detection flag, effort and elevation columns,
# then the land-cover (pland_*) and selected climatic columns.
df_eda_ss = pd.concat(
    [
        time_bin,
        ebird_ss.species_observed,
        ebird_ss.duration_minutes,
        ebird_ss.effort_distance_km,
        ebird_ss.number_observers,
        ebird_ss.elevation_median,
        ebird_ss.elevation_sd,
        ebird_ss[filter_col],
        ebird_ss[climatic_col],
    ],
    axis=1,
)

In [None]:
# Keep only rows whose observer count is above 0 (rows where it is missing fail the test too).
df_eda_ss = df_eda_ss.loc[df_eda_ss['number_observers'].gt(0)]

## Time observation started

In [None]:
fig, ax1 = plt.subplots(figsize=(23, 2.5))
ax2 = ax1.twinx()

# Checklists per start-time bin, and how many of them recorded the species.
time_total = df_eda_ss.time_observations_started.value_counts().sort_index()
time_detected = (
    df_eda_ss[df_eda_ss["species_observed"] == True]
    .time_observations_started
    .value_counts()
    .sort_index()
)

# Bars (left axis): checklist counts. Line (right axis): detected / total.
time_total.plot(kind='bar', color='orange', ax=ax1, label='Number of checklists')
(time_detected / time_total).plot(
    kind='line', marker="o", color='blue', ax=ax2, label='Encounter rate'
)

# Draw the line axis above the bar axis; the bar axis background is transparent.
ax1.set_zorder(1)
ax2.set_zorder(2)
ax1.patch.set_alpha(0)

ax1.legend(bbox_to_anchor=(1, 1), loc='upper right', borderaxespad=0.5, fontsize=15)
ax2.legend(bbox_to_anchor=(1, 0.8), loc='upper right', borderaxespad=0.5, fontsize=15)

ax1.grid(True)
ax1.set_title(
    f'Dataset of {bird_name} in BCR {bcr_id} after subsampling (n={len(df_eda_ss)})',
    size='15',
)

ax1.set_xlabel('Time (24-hour format)', size='15')
ax1.set_ylabel('Number of checklists', size='15')
ax2.set_ylabel('Encounter rate', size='15')
for axis in (ax1, ax2):
    axis.tick_params(labelsize=15)

fig.savefig(PATH + f'figures/time_{file_id}.png', bbox_inches='tight')
plt.show()

## Duration of the observation

In [None]:
fig, ax1 = plt.subplots(figsize=(23, 2.5))
ax2 = ax1.twinx()

# 15-minute bins from 0 to 300 minutes.
bins = np.arange(0, 315, 15)

# Per-bin checklist counts: all checklists, and those that recorded the species.
duration_total = np.histogram(df_eda_ss.duration_minutes, bins=bins)[0]
duration_detected = np.histogram(
    df_eda_ss[df_eda_ss["species_observed"] == True].duration_minutes, bins=bins
)[0]

df_eda_ss.duration_minutes.plot(
    kind='hist', color='orange', ax=ax1, bins=bins, label='Number of checklists'
)
# Encounter rate is drawn at each bin centre (right edge minus half the bin width).
ax2.plot(
    bins[1:] - 7.5,
    duration_detected / duration_total,
    marker="o", color='blue', label='Encounter rate',
)

# Draw the line axis above the histogram axis; the histogram background is transparent.
ax1.set_zorder(1)
ax2.set_zorder(2)
ax1.patch.set_alpha(0)

ax1.legend(bbox_to_anchor=(0, 1), loc='upper left', borderaxespad=0.5, fontsize=15)
ax2.legend(bbox_to_anchor=(0, 0.8), loc='upper left', borderaxespad=0.5, fontsize=15)

ax1.set_title(
    f'Dataset of {bird_name} in BCR {bcr_id} after subsampling (n={len(df_eda_ss)})',
    size='15',
)
ax1.grid(True)

ax1.set_xlabel('Duration of observation (minutes)', size='15')
ax1.set_ylabel('Number of checklists', size='15')
ax2.set_ylabel('Encounter rate', size='15')
for axis in (ax1, ax2):
    axis.tick_params(labelsize=15)

fig.savefig(PATH + f'figures/duration_{file_id}.png', bbox_inches='tight')
plt.show()

## Distance of the observation

In [None]:
fig, ax1 = plt.subplots(figsize=(23, 2.5))
ax2 = ax1.twinx()

# 0.5 km bins from 0 to 5 km.
bins = np.arange(0, 5.5, 0.5)

# Per-bin checklist counts: all checklists, and those that recorded the species.
distance_total = np.histogram(df_eda_ss.effort_distance_km, bins=bins)[0]
distance_detected = np.histogram(
    df_eda_ss[df_eda_ss["species_observed"] == True].effort_distance_km, bins=bins
)[0]

df_eda_ss.effort_distance_km.plot(
    kind='hist', color='orange', ax=ax1, bins=bins, label='Number of checklists'
)
# Encounter rate is drawn at each bin centre (right edge minus half the bin width).
ax2.plot(
    bins[1:] - 0.25,
    distance_detected / distance_total,
    marker="o", color='blue', label='Encounter rate',
)

# Draw the line axis above the histogram axis; the histogram background is transparent.
ax1.set_zorder(1)
ax2.set_zorder(2)
ax1.patch.set_alpha(0)

ax1.legend(bbox_to_anchor=(1, 0.45), loc='upper right', borderaxespad=0.5, fontsize=15)
ax2.legend(bbox_to_anchor=(1, 0.3), loc='upper right', borderaxespad=0.5, fontsize=15)

ax1.set_title(
    f'Dataset of {bird_name} in BCR {bcr_id} after subsampling (n={len(df_eda_ss)})',
    size=15,
)
ax1.grid(True)

ax1.set_xlabel('Distance (km)', size=15)
ax1.set_ylabel('Number of checklists', size=15)
ax2.set_ylabel('Encounter rate', size=15)
for axis in (ax1, ax2):
    axis.tick_params(labelsize=15)

fig.savefig(PATH + f'figures/distance_{file_id}.png', bbox_inches='tight')
plt.show()

## Number of observers

In [None]:
fig, ax1 = plt.subplots(figsize=(23, 2.5))
ax2 = ax1.twinx()

# Checklists per observer count, and how many of them recorded the species.
# Reindexing on the totals gives a group with no detections a rate of 0
# instead of NaN, and keeps both series the same length.
observer_total = df_eda_ss.number_observers.value_counts().sort_index()
observer_detected = (
    df_eda_ss[df_eda_ss["species_observed"] == True]
    .number_observers
    .value_counts()
    .reindex(observer_total.index, fill_value=0)
)
encounter_rate = observer_detected / observer_total

observer_total.plot(kind='bar', color='orange', ax=ax1, label='Number of checklists')

# A pandas bar plot puts its bars at positions 0..k-1. Take k and the tick
# labels from the data rather than assuming exactly the ten values 1..10,
# which failed (length mismatch) or mislabelled bars for any other set.
bar_positions = np.arange(len(observer_total))
ax2.plot(bar_positions, encounter_rate.to_numpy(),
         marker="o", color='blue', label='Encounter rate')

# Draw the line axis above the bar axis; the bar axis background is transparent.
ax1.set_zorder(1)
ax2.set_zorder(2)
ax1.patch.set_alpha(0)

ax1.legend(bbox_to_anchor=(1, 0.45), loc='upper right', borderaxespad=0.5, fontsize=15)
ax2.legend(bbox_to_anchor=(1, 0.3), loc='upper right', borderaxespad=0.5, fontsize=15)

ax1.set_title(f'Dataset of {bird_name} in BCR {bcr_id} after subsampling (n={len(df_eda_ss)})',size=15)
ax1.grid(True)

ax1.set_xlabel('Number of observers',size=15)
ax1.set_xticklabels([int(n_obs) for n_obs in observer_total.index], rotation=0)
ax1.set_ylabel('Number of checklists',size=15)
ax2.set_ylabel('Encounter rate',size=15)
ax1.tick_params(labelsize=15)
ax2.tick_params(labelsize=15)

fig.savefig(PATH+f'figures/observers_{file_id}.png',bbox_inches='tight')
plt.show()

## Elevation

In [None]:
# Elevation histogram bins (metres) per species.
ELEVATION_BINS = {
    "nutwoo": np.arange(-10, 3200, 100),
    "recwoo": np.arange(-10, 330, 10),
    "lewwoo": np.arange(0, 3900, 100),
}

# Fail loudly for an unknown species. The former if/elif chain had no else,
# so `bins` silently kept whatever an earlier cell had left in it.
if file_id not in ELEVATION_BINS:
    raise ValueError(f'No elevation bins defined for file_id={file_id!r}')
bins = ELEVATION_BINS[file_id]

fig, ax1=plt.subplots(figsize=(23,2.5))
ax2=ax1.twinx()

# Per-bin checklist counts: all checklists, and those that recorded the species.
elevation_total = np.histogram(df_eda_ss.elevation_median, bins=bins)[0]
elevation_detected = np.histogram(
    df_eda_ss[df_eda_ss["species_observed"]==True].elevation_median, bins=bins
)[0]

df_eda_ss.elevation_median.plot(kind='hist',color='orange',ax=ax1,bins=bins,label='Number of checklists')
# Encounter rate is drawn at each bin centre (right edge minus half the bin width).
ax2.plot(bins[1:]-(bins[1]-bins[0])/2,
         elevation_detected/elevation_total,
         marker="o",color='blue',label='Encounter rate')

# Draw the line axis above the histogram axis; the histogram background is transparent.
ax1.set_zorder(1)
ax2.set_zorder(2)
ax1.patch.set_alpha(0)

ax1.legend(bbox_to_anchor=(1, 0.6), loc='upper right', borderaxespad=0.5, fontsize=15)
ax2.legend(bbox_to_anchor=(1, 0.4), loc='upper right', borderaxespad=0.5, fontsize=15)

ax1.set_title(f'Dataset of {bird_name} in BCR {bcr_id} after subsampling (n={len(df_eda_ss)})',size=15)
ax1.grid(True)

ax1.set_xlabel('Elevation (m)',size=15)
ax1.set_ylabel('Number of checklists',size=15)
ax2.set_ylabel('Encounter rate',size=15)
ax1.tick_params(labelsize=15)
ax2.tick_params(labelsize=15)
fig.savefig(PATH+f'figures/elevation_{file_id}.png',bbox_inches='tight')
plt.show()

## Land Use

In [None]:
labels = filter_col

# Mean of each pland_* column per detection outcome. The columns are selected
# BEFORE aggregating: averaging the whole frame also hits the categorical
# time-bin column, which raises TypeError on pandas >= 2.0. The result is
# computed once and shared by both bar series.
landuse_mean = df_eda_ss.groupby('species_observed')[filter_col].mean()
Observed = landuse_mean.loc[True, :]
NotObserved = landuse_mean.loc[False, :]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(20,6))
# Two bars per category, side by side: encountered vs. not encountered.
rects1 = ax.bar(x - width/2, Observed, width, label='True (encountered)',
                color='orange')
rects2 = ax.bar(x + width/2, NotObserved, width, label='False (not encountered)',
                color='blue')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Proportion of the area',size=15)
ax.set_xlabel('Land use category',size=15)
ax.set_xticks(x)
# Drop the leading 'pland_' (6 characters) from each tick label.
labels2 = [name[6:] for name in labels]
ax.set_xticklabels(labels2,rotation=45,size=15)
ax.legend(loc='best', fontsize=15)

ax.set_title(f'Dataset of {bird_name} in BCR {bcr_id} after subsampling (n={len(df_eda_ss)})',size=15)
fig.tight_layout()
ax.tick_params(labelsize=15)
fig.savefig(PATH+f'figures/landuse_{file_id}.png',bbox_inches='tight')
plt.show()

## Climatic

In [None]:
# One boxplot panel per entry of `climatic_col`, split by detection outcome.
# The panel count follows the list instead of being fixed at 4; squeeze=False
# keeps `axes` an array even for a single panel. seaborn is already imported
# in the notebook's import cell, so the duplicate import here was dropped.
fig, axes = plt.subplots(1, len(climatic_col), figsize=(21, 2.5), squeeze=False)
axes = axes.ravel()

for itm, ax in zip(climatic_col, axes):
    sns.boxplot(y=itm, x='species_observed', data=df_eda_ss,
                palette=['lightblue', 'orange'], ax=ax)
    ax.set_xlabel('Encountered', fontsize=15)
    ax.set_ylabel(itm, fontsize=15)

fig.suptitle(f'Dataset of {bird_name} in BCR {bcr_id} after subsampling (n={len(df_eda_ss)})',x=0.5,y=1.05,fontsize=15)
fig.tight_layout()
for ax in axes:
    ax.tick_params(labelsize=15)
fig.savefig(PATH+f'figures/climatic_{file_id}.png',bbox_inches='tight')
plt.show()