In [47]:
# Initial Data Preprocessing and Importing #####################################################################################
#My first step in preprocessing was picking out what columns were important to include before
#even downloading the dataset, and pulling only those features.
#I did however leave some columns, such as links, for self reference, but will remove them here.

# Imports
import numpy as np
import scipy as sp
import pandas as pd
from IPython.display import display, HTML

# Load the observations export and report its (rows, columns) shape.
df = pd.read_csv('observations.csv')
print(df.shape)
# Column labels of the frame as loaded, before any columns are removed.
cols = df.columns

# Render the first five rows of the untouched frame as an HTML table.
raw_preview = df.head(5)
display(HTML(raw_preview.to_html()))

# Remove unnecessary columns in one pass.
# NOTE: the `columns=` keyword is used on purpose. The older positional form
# (`df.drop('id', 1)`) was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
unneeded_columns = [
    'id',
    # URLs are only for post reference, and not needed in the data.
    'url',
    'image_url',
    # After reading and searching through the data, these columns were also found unnecessary:
    # captive_cultivated: no rows were TRUE. Every value is FALSE, so this data is not useful.
    'captive_cultivated',
    # Geospatial data isn't too important, as the only positional data I would use anyway would
    # be latitude and longitude, and not map data, so I don't need positional_accuracy.
    'positional_accuracy',
    # observed_on_string is just a different format of observed_on with extra data, so I'm removing it.
    'observed_on_string',
]
# Like the original one-by-one drops, this raises KeyError if any listed column is absent.
df = df.drop(columns=unneeded_columns)
# I'm replacing the time column with ONLY times (e.g. '2015-03-14 20:35:00 +0100' -> '20:35:00').
# The patterns are raw strings so the regex escapes are not treated as (invalid) Python string
# escapes, and each pattern also consumes the adjacent whitespace so no stray spaces are left
# around the time. The UTC offset is removed whether it is positive or negative.
time_only = (
    df['time_observed_at']
    .replace(to_replace=r'[0-9]{4}-[0-9]{2}-[0-9]{2}\s*', value='', regex=True)
    .replace(to_replace=r'\s*[+-][0-9]{4}', value='', regex=True)
)
# Missing times are forward-filled: each gap takes the most recent time value above it.
# (A missing value in the very first row has nothing above it and therefore stays missing.)
# The result is assigned back to the column instead of calling fillna(..., inplace=True) on
# df[...]: the in-place form acts on a temporary selection and does not update df when pandas
# Copy-on-Write is active, and fillna(method=...) is deprecated in favour of ffill().
df['time_observed_at'] = time_only.ffill()
# Time values aren't that important directly on my classification, so I'm also using ffill for
# missing values in time_zone.
df['time_zone'] = df['time_zone'].ffill()
# Replacing missing values in place_guess with the string 'None'.
df['place_guess'] = df['place_guess'].fillna(value='None')
# Replacing missing values in species_guess with the string 'None', since no exact species was identified.
df['species_guess'] = df['species_guess'].fillna(value='None')


# Show the first five rows again, now that the unneeded columns are gone.
cleaned_preview_html = df.head(5).to_html()
display(HTML(cleaned_preview_html))

# Refresh the column listing so it matches the reduced frame.
cols = df.columns

# One line per column: its name, its dtype, and whether any value is still missing.
print('ColumnName, DataType, MissingValues')
for column_name in cols:
    column = df[column_name]
    print(column_name, column.dtype, column.isnull().any(), sep=' , ')
    

(71717, 16)


Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,url,image_url,captive_cultivated,place_guess,latitude,longitude,positional_accuracy,species_guess,scientific_name,common_name,iconic_taxon_name
0,1292466,3/14/2015 14:35,3/14/2015,2015-03-14 20:35:00 +0100,Central Time (US & Canada),http://www.inaturalist.org/observations/1292466,https://inaturalist-open-data.s3.amazonaws.com/photos/1621232/medium.JPG,False,Salt Creek Woods,41.827903,-87.884413,49.0,Northern Leopard Frog,Lithobates pipiens,Northern Leopard Frog,Amphibia
1,1297303,3/16/2015 12:26,3/16/2015,2015-03-16 18:26:00 +0100,Central Time (US & Canada),http://www.inaturalist.org/observations/1297303,https://inaturalist-open-data.s3.amazonaws.com/photos/1627861/medium.JPG,False,Warrenville Grove FP,41.82186,-88.172686,31.0,Common Snapping Turtle,Chelydra serpentina,Common Snapping Turtle,Reptilia
2,1297483,3/11/2015,3/11/2015,,Central Time (US & Canada),http://www.inaturalist.org/observations/1297483,http://static.inaturalist.org/photos/1627972/medium.JPG,False,"Marshall Road, Illinois, U.S.A.",37.320585,-88.914714,105.0,Common Snapping Turtle,Chelydra serpentina,Common Snapping Turtle,Reptilia
3,1299187,3/13/2015,3/13/2015,,Central Time (US & Canada),http://www.inaturalist.org/observations/1299187,http://static.inaturalist.org/photos/1630596/medium.JPG,False,"Promised Land Road, Pulaski county, Illinois, U.S.A.",37.119606,-89.304643,1161.0,Green Tree Frog,Hyla cinerea,Green Treefrog,Amphibia
4,1299190,3/13/2015,3/13/2015,,Central Time (US & Canada),http://www.inaturalist.org/observations/1299190,http://static.inaturalist.org/photos/1630599/medium.JPG,False,"Promised Land Road, Pulaski county, Illinois, U.S.A.",37.119606,-89.304643,1161.0,cope's gray treefrog,Hyla chrysoscelis,Cope's Gray Treefrog,Amphibia


Unnamed: 0,observed_on,time_observed_at,time_zone,place_guess,latitude,longitude,species_guess,scientific_name,common_name,iconic_taxon_name
0,3/14/2015,20:35:00,Central Time (US & Canada),Salt Creek Woods,41.827903,-87.884413,Northern Leopard Frog,Lithobates pipiens,Northern Leopard Frog,Amphibia
1,3/16/2015,18:26:00,Central Time (US & Canada),Warrenville Grove FP,41.82186,-88.172686,Common Snapping Turtle,Chelydra serpentina,Common Snapping Turtle,Reptilia
2,3/11/2015,18:26:00,Central Time (US & Canada),"Marshall Road, Illinois, U.S.A.",37.320585,-88.914714,Common Snapping Turtle,Chelydra serpentina,Common Snapping Turtle,Reptilia
3,3/13/2015,18:26:00,Central Time (US & Canada),"Promised Land Road, Pulaski county, Illinois, U.S.A.",37.119606,-89.304643,Green Tree Frog,Hyla cinerea,Green Treefrog,Amphibia
4,3/13/2015,18:26:00,Central Time (US & Canada),"Promised Land Road, Pulaski county, Illinois, U.S.A.",37.119606,-89.304643,cope's gray treefrog,Hyla chrysoscelis,Cope's Gray Treefrog,Amphibia


ColumnName, DataType, MissingValues
observed_on , object , False
time_observed_at , object , False
time_zone , object , False
place_guess , object , False
latitude , float64 , False
longitude , float64 , False
species_guess , object , False
scientific_name , object , False
common_name , object , False
iconic_taxon_name , object , False
