In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect

### Store CSV into DataFrame

In [10]:
# Location of the raw tracking export, relative to the notebook's working directory.
csv_file = "Resources/Vultures_Acopian_Center_USA_2003-2016.csv"

# low_memory=False makes pandas parse the file in a single pass instead of
# internal chunks, which avoids mixed-type inference within a column.
vulture_data_df = pd.read_csv(filepath_or_buffer=csv_file, low_memory=False)

# Preview the first five rows.
vulture_data_df.head(n=5)

Unnamed: 0,event-id,visible,timestamp,location-long,location-lat,algorithm-marked-outlier,gps:hdop,gps:satellite-count,gps-time-to-fix,gps:vdop,...,height-raw,location-error-text,manually-marked-outlier,raptor-workshop:migration-state,vertical-error-numerical,sensor-type,individual-taxon-canonical-name,tag-local-identifier,individual-local-identifier,study-name
0,2165431108,True,2004-09-06 17:00:00.000,-75.28533,40.778,,,,,,...,,,,,,gps,Cathartes aura,52067,Irma,Vultures Acopian Center USA 2003-2016
1,2165431109,True,2004-09-06 18:00:00.000,-75.28533,40.77817,,,,,,...,,,,,,gps,Cathartes aura,52067,Irma,Vultures Acopian Center USA 2003-2016
2,2165431110,True,2004-09-06 19:00:00.000,-75.28933,40.77433,,,,,,...,,,,,,gps,Cathartes aura,52067,Irma,Vultures Acopian Center USA 2003-2016
3,2165431111,True,2004-09-06 20:00:00.000,-75.289,40.77433,,,,,,...,,,,,,gps,Cathartes aura,52067,Irma,Vultures Acopian Center USA 2003-2016
4,2165431112,True,2004-09-07 00:00:00.000,-75.289,40.77417,,,,,,...,,,,,,gps,Cathartes aura,52067,Irma,Vultures Acopian Center USA 2003-2016


In [11]:
# List every column label available in the raw DataFrame.
vulture_data_df.columns

Index(['event-id', 'visible', 'timestamp', 'location-long', 'location-lat',
       'algorithm-marked-outlier', 'gps:hdop', 'gps:satellite-count',
       'gps-time-to-fix', 'gps:vdop', 'ground-speed', 'heading',
       'height-above-ellipsoid', 'height-raw', 'location-error-text',
       'manually-marked-outlier', 'raptor-workshop:migration-state',
       'vertical-error-numerical', 'sensor-type',
       'individual-taxon-canonical-name', 'tag-local-identifier',
       'individual-local-identifier', 'study-name'],
      dtype='object')

### Create new DataFrame with selected columns

In [14]:
# Keep only the timestamp, coordinates, species and bird/tag identifiers.
# .copy() detaches the result so later edits never touch vulture_data_df.
new_vulture_data_df = vulture_data_df.loc[
    :,
    [
        "timestamp",
        "location-long",
        "location-lat",
        "individual-taxon-canonical-name",
        "tag-local-identifier",
        "individual-local-identifier",
    ],
].copy()
new_vulture_data_df.head(n=5)

Unnamed: 0,timestamp,location-long,location-lat,individual-taxon-canonical-name,tag-local-identifier,individual-local-identifier
0,2004-09-06 17:00:00.000,-75.28533,40.778,Cathartes aura,52067,Irma
1,2004-09-06 18:00:00.000,-75.28533,40.77817,Cathartes aura,52067,Irma
2,2004-09-06 19:00:00.000,-75.28933,40.77433,Cathartes aura,52067,Irma
3,2004-09-06 20:00:00.000,-75.289,40.77433,Cathartes aura,52067,Irma
4,2004-09-07 00:00:00.000,-75.289,40.77417,Cathartes aura,52067,Irma


### Clean DataFrame

In [19]:
# Non-null entries per column; a count lower than the others means missing values.
new_vulture_data_df.count(axis="index")

timestamp                          686382
location-long                      684217
location-lat                       684217
individual-taxon-canonical-name    686382
tag-local-identifier               686382
individual-local-identifier        686382
dtype: int64

In [22]:
# Discard every row that has a missing value in any column. Per the counts
# shown above, only location-long and location-lat contain gaps.
new_vulture_data_df = new_vulture_data_df.dropna(axis="index", how="any")
new_vulture_data_df.count(axis="index")

timestamp                          684217
location-long                      684217
location-lat                       684217
individual-taxon-canonical-name    684217
tag-local-identifier               684217
individual-local-identifier        684217
dtype: int64

In [25]:
# Keep only the turkey vulture (Cathartes aura) rows.
new_vulture_data_df = new_vulture_data_df[
    new_vulture_data_df["individual-taxon-canonical-name"].eq("Cathartes aura")
]
new_vulture_data_df.count(axis="index")

timestamp                          548655
location-long                      548655
location-lat                       548655
individual-taxon-canonical-name    548655
tag-local-identifier               548655
individual-local-identifier        548655
dtype: int64

In [23]:
# Show the dtype pandas inferred for each remaining column.
new_vulture_data_df.dtypes

timestamp                           object
location-long                      float64
location-lat                       float64
individual-taxon-canonical-name     object
tag-local-identifier                 int64
individual-local-identifier         object
dtype: object

In [None]:
# TODO: check to prevent duplicate loading (query the database first)

### Store Info CSV to DataFrame

### Connect to local database

### Connect to local database

In [None]:
# File name of the SQLite database.
# NOTE(review): presumably resolved relative to the working directory — confirm.
database_path = "etl"
# Build a SQLAlchemy engine from a sqlite:/// URL that points at database_path.
engine = create_engine(f"sqlite:///{database_path}")

### Check for tables

In [None]:
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names on all versions.
inspect(engine).get_table_names()

### Use pandas to load csv converted DataFrame into database

In [None]:
# Load the cleaned turkey-vulture DataFrame. The previous code referenced
# new_customer_data_df, which is never defined in this notebook (NameError).
# if_exists='append' adds rows on every run, so re-running duplicates data.
# NOTE(review): the table name 'customer_name' is unchanged so that the
# confirmation query on this table keeps working; rename both together.
new_vulture_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

### Use pandas to load json converted DataFrame into database

In [None]:
# NOTE(review): new_customer_location_df is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel — confirm which
# DataFrame (if any) should be loaded into 'customer_location'.
new_customer_location_df.to_sql(name='customer_location', con=engine, if_exists='append', index=False)

### Confirm data has been added by querying the customer_name table
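* NOTE: the database is a SQLite file, so it can also be inspected with a SQLite client (e.g. the sqlite3 command-line shell); pgAdmin only works with PostgreSQL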

In [None]:
# Read the table back through the same engine and preview the first five rows.
pd.read_sql_query(sql='select * from customer_name', con=engine).head(n=5)

### Confirm data has been added by querying the customer_location table

In [None]:
# Read the table back through the same engine and preview the first five rows.
pd.read_sql_query(sql='select * from customer_location', con=engine).head(n=5)