In [1]:
#Import Necessary Packages
from pyinaturalist import node_api
import json
import pandas as pd
from pandas import DataFrame as df
import datetime
from pandas import json_normalize
from pandas.core.common import SettingWithCopyWarning
import warnings
#Suppress Unncessary Warnings
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
#Paginated Data (Use either this OR next cell)
#results = []
#for i in range(1,38):
#    page_resp = node_api.get_observations(params={"project_id": 70438, "page": i, "per_page": 200, "order":"desc", "order_by":"created_at"})
#    results = results + page_resp['results']
#df = json_normalize(data=results, record_path='results')

In [3]:
#Consolidated Data (Use either this OR previous cell)
#70438 is Lockdown-Backyard-Bioblitz-Kerala Project (approx. 400MB)
#This would take several minutes to hours, depending on the project(s)
PROJECT_ID = 70438
page_resps = node_api.get_all_observations(params={"project_id": PROJECT_ID})

In [4]:
#Save data to local file for future reference
with open("inat-project-{}-pristine-data.json".format(PROJECT_ID), 'w+') as f:
    json.dump(page_resps, f)

In [122]:
#Normalize first level data into flat table 
df = json_normalize(data=page_resps)

In [123]:
#Create Empty Derived Features
df.insert(loc = len(df.columns), column='annotations_count', value = 0)
df.insert(loc = len(df.columns), column='account_age', value = '')
df.insert(loc = len(df.columns), column='sounds_count', value = 0)
df.insert(loc = len(df.columns), column='photos_count', value = 0)
df.insert(loc = len(df.columns), column='taxon_default_photo', value = False)
df.insert(loc = len(df.columns), column='taxon_common_name', value = False)
df.insert(loc = len(df.columns), column='user_icon', value = False)
df.insert(loc = len(df.columns), column='user_name', value = False)
df.insert(loc = len(df.columns), column='taxon_wiki', value = False)
df.insert(loc = len(df.columns), column='projects_count', value = 0)
MAX_PHOTOS = 10
MAX_IDS = 10
#Create Empty Flattened Features
for i in range(1, MAX_IDS+1):
    df.insert(loc = len(df.columns), column='id_{0}_category'.format(i), value = None)
    df.insert(loc = len(df.columns), column='id_{0}_created_after'.format(i), value = 0)
    df.insert(loc = len(df.columns), column='id_{0}_own_observation'.format(i), value = False)
    df.insert(loc = len(df.columns), column='id_{0}_default_photo'.format(i), value = False)
    df.insert(loc = len(df.columns), column='id_{0}_taxon_obs_count'.format(i), value = 0)
    df.insert(loc = len(df.columns), column='id_{0}_common_name'.format(i), value = False)
    df.insert(loc = len(df.columns), column='id_{0}_taxon_rank'.format(i), value = 0)
    df.insert(loc = len(df.columns), column='id_{0}_taxon_wiki_url'.format(i), value = False)
    df.insert(loc = len(df.columns), column='id_{0}_acc_age'.format(i), value = 0)
    df.insert(loc = len(df.columns), column='id_{0}_user_obs_count'.format(i), value = 0)
    df.insert(loc = len(df.columns), column='id_{0}_user_sp_count'.format(i), value = 0)
    df.insert(loc = len(df.columns), column='id_{0}_vision'.format(i), value = False)
    df.insert(loc = len(df.columns), column='id_{0}_user_id_count'.format(i), value = 0)
    df.insert(loc = len(df.columns), column='id_{0}_current'.format(i), value = False)
    df.insert(loc = len(df.columns), column='id_{0}_disagreement'.format(i), value = False)
    df.insert(loc = len(df.columns), column='photo_{0}_width'.format(i), value = False)
    df.insert(loc = len(df.columns), column='photo_{0}_height'.format(i), value = False)

In [124]:
#Derived and Type Casted Features
for index, obs in df.iterrows():
    df['account_age'][index] = (pd.to_datetime(df['created_at'][index], infer_datetime_format=True).date()-pd.to_datetime(df['user.created_at'][index], infer_datetime_format=True).date()).days
    df['annotations_count'][index] = len(obs['annotations'])
    df['sounds_count'][index] = len(obs['sounds'])
    df['photos_count'][index] = len(obs['photos'])
    df['projects_count'][index] = len(obs['project_ids'])
    if obs['taxon.default_photo.url'] is not None:
        df['taxon_default_photo'][index] = True
    if obs['taxon.preferred_common_name'] is not None:
        df['taxon_common_name'][index] = True  
    if obs['user.icon_url'] is not None:
        df['user_icon'][index] = True
    if obs['user.name_autocomplete'] is not None:
        df['user_name'][index] = True
    if obs['taxon.wikipedia_url'] is not None:
        df['taxon_wiki'][index] = True

In [125]:
#Flatten the multi-level dictionary into a matrix for easier analysis
#json_normalize() doesn't seem to do a good job in this case. 
#This loop is approximately O(m*n), where m is total number of observations and n is total number of identifications
#Should find a more elegant solution to form a flat-table

#Flatten Identifications
i = 0
for ids in df['identifications']:
    j = 1
    for ident in ids:
        df['id_{}_category'.format(j)][i] = ident['category']
        df['id_{}_own_observation'.format(j)][i] = ident.get('own_observation', False)
        df['id_{}_user_obs_count'.format(j)][i] = ident['user'].get('observations_count', 0)
        df['id_{}_user_sp_count'.format(j)][i] = ident['user'].get('species_count', 0)
        df['id_{}_user_id_count'.format(j)][i] = ident['user'].get('identifications_count', 0)
        df['id_{}_vision'.format(j)][i] = ident.get('vision', False)
        df['id_{}_current'.format(j)][i] = ident.get('current', False)
        df['id_{}_disagreement'.format(j)][i] = ident.get('disagreement', False)
        df['id_{}_taxon_obs_count'.format(j)][i] = ident['taxon'].get('observations_count', 0)
        df['id_{}_taxon_rank'.format(j)][i] = ident['taxon'].get('rank_level', 0)
        if ident['taxon']['wikipedia_url'] is not None:
            df['id_{}_taxon_wiki_url'.format(j)][i] = True 
        if ident['taxon']['default_photo'] is not None:
            df['id_{}_default_photo'.format(j)][i] = True
        df['id_{}_created_after'.format(j)][i] = (pd.to_datetime(ident['created_at'], infer_datetime_format=True).date() - pd.to_datetime(df['created_at'][i], infer_datetime_format=True).date()).days
        df['id_{}_acc_age'.format(j)][i] = (pd.to_datetime(ident['created_at'], infer_datetime_format=True).date() - pd.to_datetime(ident['user']['created_at'], infer_datetime_format=True).date()).days
        j += 1
        if j > MAX_IDS:
            break
    i += 1
#Flatten Photos
i = 0
for photos in df['photos']:
    j = 1
    for photo in photos:
        df['photo_{}_width'.format(j)][i] = photo['original_dimensions']['width']
        df['photo_{}_height'.format(j)][i] = photo['original_dimensions']['height']
        j += 1
        if j > MAX_PHOTOS:
            break
    i += 1    

In [9]:
#Declare Quality Grade (Research and Needs_ID), IUCN Conservation Status (Near Threatened, Threatened, Endangered) and ID Category (Improving, Supporting, Leading) as Categorical
df['code'] = df.quality_grade.astype('category').cat.codes
for i in range(1,11):
    df['id_{0}_category'.format(i)]=df['id_{0}_category'.format(i)].astype('category').cat.codes
df['taxon.conservation_status.code'] = df['taxon.conservation_status.status'].astype('category').cat.codes

In [126]:
#Save data to local file for future reference
df.to_excel("inat-project-{}-intermediate-data.xlsx".format(PROJECT_ID))

In [10]:
#Drop Features that are not used for this Study
df = df.drop(columns=['public_positional_accuracy', 'positional_accuracy', 'taxon_geoprivacy', 'uuid', 'site_id', 'created_time_zone', 'license_code', 'observed_time_zone', 'reviewed_by', 'flags', 'description', 'time_zone_offset', 'place_ids', 'taxon.conservation_status.iucn', 'taxon.conservation_status.authority'])
df = df.drop(columns=['tags', 'observed_on_string', 'outlinks', 'ofvs', 'map_scale', 'location', 'votes', 'spam', 'project_ids_without_curator_id', 'place_guess', 'project_observations', 'observed_on_details.date', 'observed_on_details.week', 'observed_on_details.month', 'project_ids', 'created_at'])
df = df.drop(columns=['uri', 'community_taxon_id', 'observed_on_details.hour', 'observed_on_details.year', 'observed_on_details.day', 'created_at_details.date', 'created_at_details.week', 'created_at_details.month', 'created_at_details.year', 'taxon.ancestry', 'taxon.min_species_ancestry'])
df = df.drop(columns=['taxon.iconic_taxon_id', 'updated_at', 'user.universal_search_rank', 'geojson.type', 'geojson.coordinates', 'preferences.prefers_community_taxon', 'taxon.default_photo.original_dimensions.height', 'taxon.default_photo.original_dimensions.width', 'observed_on', 'photos'])
df = df.drop(columns=['geoprivacy', 'faves', 'taxon.is_active', 'taxon.min_species_taxon_id', 'taxon.parent_id', 'taxon.name', 'taxon.rank', 'taxon.id', 'taxon.ancestor_ids', 'taxon.taxon_schemes_count', 'taxon.created_at', 'taxon.taxon_changes_count', 'taxon.complete_species_count', 'taxon.universal_search_rank'])
df = df.drop(columns=['user.login_exact', 'time_observed_at', 'species_guess', 'quality_metrics', 'project_ids_with_curator_id', 'taxon.default_photo.attribution', 'taxon.default_photo.flags', 'taxon.default_photo.id', 'taxon.default_photo.license_code', 'user.site_id', 'taxon.conservation_status.geoprivacy'])
df = df.drop(columns=['comments', 'non_owner_ids', 'observation_photos', 'annotations', 'sounds', 'taxon.photos_locked', 'taxon.atlas_id', 'taxon.iconic_taxon_name', 'user.id', 'user.login_autocomplete', 'user.orcid', 'taxon.default_photo.url', 'taxon.preferred_common_name', 'taxon.conservation_status.status_name'])
df = df.drop(columns=['user.created_at', 'user.suspended', 'taxon.default_photo.square_url', 'taxon.default_photo.medium_url', 'taxon.flag_counts.resolved', 'taxon.flag_counts.unresolved', 'user.icon_url', 'user.name_autocomplete', 'user.roles', 'user.icon', 'user.name', 'taxon.current_synonymous_taxon_ids', 'taxon.wikipedia_url'])
df = df.drop(columns=['taxon.conservation_status.user_id', 'taxon.conservation_status.place_id', 'taxon.default_photo.original_dimensions', 'taxon.default_photo', 'taxon', 'observation_sounds', 'ident_taxon_ids', 'taxon.complete_rank', 'identifications', 'taxon.conservation_status.status', 'taxon.conservation_status.source_id'])
df = df.drop(columns=['faves_count', 'cached_votes_total', 'faves_count', 'cached_votes_total', 'id', 'quality_grade', 'sounds_count', 'taxon.endemic', 'taxon.threatened', 'identifications_most_agree', 'num_identification_agreements', 'taxon_default_photo'])

In [12]:
#Save data to local file for future reference
df.to_excel("inat-project-{}-processed-data.xlsx".format(PROJECT_ID))

In [13]:
#Inspect Features
pd.set_option('display.max_columns', None)
print(df.describe())

                 id  cached_votes_total  comments_count  oauth_application_id  \
count  7.366000e+03         7366.000000     7366.000000           5463.000000   
mean   4.654353e+07            0.030546        0.072631              4.671243   
std    3.259310e+06            0.191513        0.386138             29.248182   
min    4.059016e+07            0.000000        0.000000              2.000000   
25%    4.458544e+07            0.000000        0.000000              2.000000   
50%    4.659301e+07            0.000000        0.000000              2.000000   
75%    4.796044e+07            0.000000        0.000000              2.000000   
max    6.020295e+07            4.000000       10.000000            333.000000   

       faves_count  num_identification_agreements  identifications_count  \
count  7366.000000                     7366.00000            7366.000000   
mean      0.021721                        1.14757               1.166305   
std       0.158285                        

In [14]:
#Check for Skewed Features, they ought to be removed for better accuracy
pd.set_option('display.max_rows', None)
df.skew().sort_values()

id_1_default_photo                  -20.159121
photo_1_height                      -12.401309
photo_1_width                       -12.401309
user_name                            -4.823138
mappable                             -4.326745
user_icon                            -3.987657
id_1_taxon_wiki_url                  -3.690888
taxon_wiki                           -2.727856
id_1_current                         -1.978657
id_1_own_observation                 -1.883865
id_2_default_photo                   -0.828993
id_2_current                         -0.723734
identifications_most_agree           -0.712659
identifications_some_agree           -0.712659
created_at_details.hour              -0.604916
id_2_taxon_wiki_url                  -0.527445
created_at_details.day               -0.156731
id_2_common_name                      0.000000
id_10_own_observation                 0.000000
taxon_default_photo                   0.000000
id_10_common_name                     0.000000
taxon_common_

In [None]:
# Effect of this operation on correlation is not determined yet. Use with caution. 
'''
for i in range(1,11):
    df['id_{0}_category'.format(i)]=df['id_{0}_category'.format(i)].astype('category').cat.codes
    df['id_{0}_own_observation'.format(i)]=df['id_{0}_own_observation'.format(i)].astype(int)
    df['id_{0}_default_photo'.format(i)]=df['id_{0}_default_photo'.format(i)].astype(int)
    df['id_{0}_common_name'.format(i)]=df['id_{0}_common_name'.format(i)].astype(int)
    df['id_{0}_taxon_wiki_url'.format(i)]=df['id_{0}_taxon_wiki_url'.format(i)].astype(int)
    df['id_{0}_vision'.format(i)]=df['id_{0}_vision'.format(i)].astype(int)
    df['id_{0}_current'.format(i)]=df['id_{0}_current'.format(i)].astype(int)
    df['id_{0}_disagreement'.format(i)]=df['id_{0}_disagreement'.format(i)].astype(int)
df['taxon_wiki']=df['taxon_wiki'].astype(int)
df['user_name']=df['user_name'].astype(int)
df['user_icon']=df['user_icon'].astype(int)
df['taxon_common_name']=df['taxon_common_name'].astype(int)
df['taxon_default_photo']=df['taxon_default_photo'].astype(int)
df['identifications_most_agree']=df['identifications_most_agree'].astype(int)
df['identifications_most_disagree']=df['identifications_most_disagree'].astype(int)
df['owners_identification_from_vision']=(df['owners_identification_from_vision'].astype(bool)).astype(int)
df['obscured']=df['obscured'].fillna(0).astype(int)
df['mappable']=df['mappable'].fillna(0).astype(int)
df['identifications_some_agree']=df['identifications_some_agree'].fillna(0).astype(int)
df['taxon.endemic']=df['taxon.endemic'].fillna(0).astype(int)
df['taxon.threatened']=df['taxon.threatened'].fillna(0).astype(int)
df['taxon.introduced']=df['taxon.introduced'].fillna(0).astype(int)
df['taxon.extinct']=df['taxon.extinct'].fillna(0).astype(int)
df['user.spam']=df['user.spam'].fillna(0).astype(int)
'''

In [111]:
#Credits: This cell is inspired from https://www.kaggle.com/sharmasanthosh/exploratory-study-on-feature-selection.
import numpy

# Correlation tells relation between two attributes.
# Correlation requires continous data. Hence, ignore Wilderness_Area and Soil_Type as they are binary

#sets the number of features considered
size = len(df.columns) 

#create a dataframe with only 'size' features
data=df.iloc[:,:size] 

#get the names of all the columns
cols=data.columns 

#Calculates correlation co-efficient for all features to label
#Change method to pearson/kendall/spearman, as needed.
data_corr = df.corr(method='spearman')['code'][:]

rows = len(data_corr)

# Set the threshold to select only highly correlated attributes
threshold = 0.40

# List of pairs along with correlation above threshold
corr_list = []

#Search for the highly correlated pairs
for i in range(0,rows): #for 'size' features
    if (data_corr.iloc[i] >= threshold and data_corr.iloc[i] < 1) or (data_corr.iloc[i] < 0 and data_corr.iloc[i] <= -threshold):
        corr_list.append([data_corr.iloc[i],i]) #store correlation and columns index

#Sort to show higher ones first            
s_corr_list = sorted(corr_list,key=lambda x: -abs(x[0]))

#Print correlations and column names
for v,i in s_corr_list:
    print ("%s = %.2f" % (cols[i],v))

created_at_details.day = -0.74
id_1_vision = 0.67
mappable = 0.60
owners_identification_from_vision = 0.59
id_1_disagreement = 0.57
photos_count = -0.56
id_2_taxon_wiki_url = 0.56
id_1_category = -0.54
id_2_created_after = 0.53
id_2_own_observation = 0.52
id_2_default_photo = 0.48
id_2_taxon_obs_count = 0.47
taxon.rank_level = -0.45
id_2_vision = 0.44
id_3_own_observation = 0.42
id_2_taxon_rank = 0.42
id_3_taxon_wiki_url = 0.41
id_2_disagreement = 0.41
id_3_taxon_rank = 0.41
id_3_default_photo = 0.41
id_3_taxon_obs_count = 0.40


In [22]:
df2 = pd.read_json('inat-project-70438-pristine-data.json')

In [98]:
#df2.insert(loc = len(df2.columns), column='id_{0}_category'.format(1), value = None)
i = 0
for ids in df2['identifications']:
    for ident in ids:
        df2['id_{}_category'.format(1)][i] = ident['category']
        break
    i += 1
df2['id_1_category'].value_counts()

improving     4218
leading       2740
maverick       360
supporting      36
Name: id_1_category, dtype: int64

In [117]:
df['id_1_category'].value_counts()

 0    4218
 1    2740
 2     360
 3      36
-1      12
Name: id_1_category, dtype: int64

In [120]:
df['id_3_created_after'].value_counts()

 0      5487
 1       771
 2       208
 3       100
 4        70
 6        62
 5        44
 8        27
 7        27
 10       25
 11       22
 16       20
 9        20
 15       17
-1        15
 20       12
 42       11
 13       11
 14       10
 26       10
 28        9
 19        9
 24        8
 23        8
 77        8
 27        8
 22        7
 38        7
 98        7
 21        7
 97        7
 89        7
 25        7
 94        7
 17        7
 39        7
 64        7
 74        6
 90        6
 101       6
 18        6
 61        6
 48        6
 45        6
 12        6
 81        5
 52        5
 83        5
 88        5
 35        5
 72        5
 36        5
 41        4
 33        4
 128       4
 112       4
 100       4
 118       4
 59        4
 68        4
 30        4
 34        4
 80        4
 102       4
 47        3
 49        3
 69        3
 86        3
 57        3
 95        3
 99        3
 56        3
 91        3
 106       3
 107       3
 115       3
 73        3

In [121]:
(df['id_2_taxon_wiki_url'].astype(str) + df2['quality_grade']).value_counts()

Trueresearch     2875
Falseneeds_id    2513
Trueneeds_id     1747
Falseresearch     231
dtype: int64