In [1]:
import pandas as pd
from sqlalchemy import create_engine

## 1.1 Prepare Migration Dataset using pandas

### 1.1.1 Read the data

In [2]:
excel_file = "Resources/Turkey vultures in North and South America - migration.xlsx"
vulture_data_df = pd.read_excel(excel_file, index_col=None)
vulture_data_df.head(2)

Unnamed: 0,event-id,visible,timestamp,location-long,location-lat,manually-marked-outlier,sensor-type,individual-taxon-canonical-name,tag-local-identifier,individual-local-identifier,...,animal-life-stage,animal-mass,attachment-type,deployment-comments,deployment-id,duty-cycle,study-site,tag-manufacturer-name,tag-mass,tag-model
0,283203879,True,2003-11-14 16:00:00.000,-75.39717,40.48933,False,gps,Cathartes aura,42500,Butterball,...,adult,2372.0,harness,trapped in Pennsylvania using padded-leg hold ...,42500-Butterball,1 fix per hour,East Coast of North America,Microwave Telemetry,70,PTT100
1,283203880,True,2003-11-14 17:00:00.000,-75.39717,40.48933,False,gps,Cathartes aura,42500,Butterball,...,adult,2372.0,harness,trapped in Pennsylvania using padded-leg hold ...,42500-Butterball,1 fix per hour,East Coast of North America,Microwave Telemetry,70,PTT100


In [3]:
vulture_data_df.keys()
# vulture_data_df.columns

Index(['event-id', 'visible', 'timestamp', 'location-long', 'location-lat',
       'manually-marked-outlier', 'sensor-type',
       'individual-taxon-canonical-name', 'tag-local-identifier',
       'individual-local-identifier', 'study-name', 'utm-easting',
       'utm-northing', 'utm-zone', 'study-timezone', 'study-local-timestamp',
       'tag-id', 'animal-id', 'animal-taxon', 'deploy-on-date',
       'deploy-off-date', 'animal-comments', 'animal-life-stage',
       'animal-mass', 'attachment-type', 'deployment-comments',
       'deployment-id', 'duty-cycle', 'study-site', 'tag-manufacturer-name',
       'tag-mass', 'tag-model'],
      dtype='object')

### 1.1.2 Select columns, and change column names

In [4]:
# Select columns 
new_vulture_data_df = vulture_data_df[['event-id', 'timestamp', 'location-long', 'location-lat','individual-taxon-canonical-name', 'tag-local-identifier',
       'individual-local-identifier']].copy()
new_vulture_data_df.head()

Unnamed: 0,event-id,timestamp,location-long,location-lat,individual-taxon-canonical-name,tag-local-identifier,individual-local-identifier
0,283203879,2003-11-14 16:00:00.000,-75.39717,40.48933,Cathartes aura,42500,Butterball
1,283203880,2003-11-14 17:00:00.000,-75.39717,40.48933,Cathartes aura,42500,Butterball
2,283203881,2003-11-14 18:00:00.000,-75.33317,40.32467,Cathartes aura,42500,Butterball
3,283203882,2003-11-14 19:00:00.000,-75.35617,40.33983,Cathartes aura,42500,Butterball
4,283203883,2003-11-14 20:00:00.000,-75.4265,40.3155,Cathartes aura,42500,Butterball


In [5]:
#????? change column name ('-' is not compatible in a lot of system)
#also change the column name of the following so consisten with the other info datasets 'individual_taxon_canonical_name', 'tag_local_identifier','individual_local_identifier'to 'animal_taxon','tag_id', 'animal_id'

new_vulture_data_df.columns = ['event_id', 'timestamp', 'location_long', 'location_lat',
                               'animal_taxon', 'tag_id','animal_id']
new_vulture_data_df.head()

Unnamed: 0,event_id,timestamp,location_long,location_lat,animal_taxon,tag_id,animal_id
0,283203879,2003-11-14 16:00:00.000,-75.39717,40.48933,Cathartes aura,42500,Butterball
1,283203880,2003-11-14 17:00:00.000,-75.39717,40.48933,Cathartes aura,42500,Butterball
2,283203881,2003-11-14 18:00:00.000,-75.33317,40.32467,Cathartes aura,42500,Butterball
3,283203882,2003-11-14 19:00:00.000,-75.35617,40.33983,Cathartes aura,42500,Butterball
4,283203883,2003-11-14 20:00:00.000,-75.4265,40.3155,Cathartes aura,42500,Butterball


### 1.1.3 Clean DataFrame: drop NA, filter, drop duplicates

In [6]:
new_vulture_data_df.count()

event_id         220077
timestamp        220077
location_long    220077
location_lat     220077
animal_taxon     220077
tag_id           220077
animal_id        220077
dtype: int64

In [7]:
# drop rows without long and lat
new_vulture_data_df = new_vulture_data_df.dropna(how="any")
new_vulture_data_df.count()

event_id         220077
timestamp        220077
location_long    220077
location_lat     220077
animal_taxon     220077
tag_id           220077
animal_id        220077
dtype: int64

In [8]:
# filter data to only keep turkey vulture (Cathartes aura) data
new_vulture_data_df = new_vulture_data_df.loc[new_vulture_data_df
                                              ['animal_taxon'] == "Cathartes aura", :]
new_vulture_data_df.count()

event_id         220077
timestamp        220077
location_long    220077
location_lat     220077
animal_taxon     220077
tag_id           220077
animal_id        220077
dtype: int64

In [9]:
# new_vulture_data_df.set_index("event_id")

In [10]:
# drop rows with duplicate event_id
new_vulture_data_df = new_vulture_data_df.drop_duplicates(["event_id"], keep='first')
new_vulture_data_df.count()

event_id         215719
timestamp        215719
location_long    215719
location_lat     215719
animal_taxon     215719
tag_id           215719
animal_id        215719
dtype: int64

In [11]:
new_vulture_data_df.dtypes

event_id           int64
timestamp         object
location_long    float64
location_lat     float64
animal_taxon      object
tag_id             int64
animal_id         object
dtype: object

In [12]:
# check to see if tag_id or animal_id is unique
grouped_vulture_df = new_vulture_data_df.groupby(['tag_id','animal_id'])

In [13]:
grouped_vulture_df.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,event_id,timestamp,location_long,location_lat,animal_taxon
tag_id,animal_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42500,Butterball,1275,1275,1275,1275,1275
42500,Schaumboch,3083,3083,3083,3083,3083
52067,Irma,18314,18314,18314,18314,18314
52069,Disney,28578,28578,28578,28578,28578
53797,La Pampa,4032,4032,4032,4032,4032
53798,Whitey,3565,3565,3565,3565,3565
53800,Argentina,4058,4058,4058,4058,4058
57954,Prado,20967,20967,20967,20967,20967
57955,Sarkis,8451,8451,8451,8451,8451
57956,Rosalie,28101,28101,28101,28101,28101


## 1.2 Prepare Vulture Info Dataset using pandas

In [14]:
# Read data
csv_file = "Resources/Turkey vultures in North and South America-reference-data.csv"
vulture_info_df = pd.read_csv(csv_file, low_memory=False)
vulture_info_df.head()

Unnamed: 0,tag-id,animal-id,animal-taxon,deploy-on-date,deploy-off-date,animal-comments,animal-life-stage,animal-mass,attachment-type,deployment-comments,deployment-id,duty-cycle,study-site,tag-manufacturer-name,tag-mass,tag-model
0,42500,Butterball,Cathartes aura,2003-11-14 16:00:00.000,2004-03-14 20:00:01.000,migratory,adult,2372.0,harness,trapped in Pennsylvania using padded-leg hold ...,42500-Butterball,1 fix per hour,East Coast of North America,Microwave Telemetry,70.0,PTT100
1,52067,Irma,Cathartes aura,2004-09-06 17:00:00.000,2013-03-18 22:00:01.000,non-migratory,adult,2012.0,harness,trapped in Pennsylvania using padded-leg hold ...,52067-Irma,1 fix per hour,East Coast of North America,Microwave Telemetry,70.0,PTT100
2,42500,Schaumboch,Cathartes aura,2004-10-08 15:00:00.000,2006-03-29 17:00:01.000,migratory,adult,1951.0,harness,trapped in Pennsylvania using padded-leg hold ...,42500-Schaumboch,1 fix per hour,East Coast of North America,Microwave Telemetry,70.0,PTT100
3,52069,Disney,Cathartes aura,2004-10-11 14:00:00.000,2011-10-18 23:00:01.000,migratory,adult,2108.0,harness,trapped in Pennsylvania using padded-leg hold ...,52069-Disney,1 fix per hour,East Coast of North America,Microwave Telemetry,70.0,PTT100
4,57954,Prado,Cathartes aura,2005-11-02 15:00:00.000,2009-07-07 00:00:01.000,non-migratory,adult,1710.0,harness,trapped in California using walk-in traps,57954-Prado,1 fix per hour,West Coast of North America,Microwave Telemetry,70.0,PTT100


In [15]:
# Get column names
vulture_info_df.columns

Index(['tag-id', 'animal-id', 'animal-taxon', 'deploy-on-date',
       'deploy-off-date', 'animal-comments', 'animal-life-stage',
       'animal-mass', 'attachment-type', 'deployment-comments',
       'deployment-id', 'duty-cycle', 'study-site', 'tag-manufacturer-name',
       'tag-mass', 'tag-model'],
      dtype='object')

In [16]:
# Select columns 
new_vulture_info_df = vulture_info_df[['tag-id', 'animal-id', 'animal-taxon', 'deploy-on-date',
       'deploy-off-date', 'animal-comments', 'animal-life-stage',
       'animal-mass',  'deployment-comments',
        'study-site']].copy()
new_vulture_info_df.head(1)

Unnamed: 0,tag-id,animal-id,animal-taxon,deploy-on-date,deploy-off-date,animal-comments,animal-life-stage,animal-mass,deployment-comments,study-site
0,42500,Butterball,Cathartes aura,2003-11-14 16:00:00.000,2004-03-14 20:00:01.000,migratory,adult,2372.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America


In [17]:
# Change column names ('-' to '_')
new_vulture_info_df.columns = ['tag_id', 'animal_id', 'animal_taxon', 'deploy_on_date',
       'deploy_off_date', 'animal_comments', 'animal_life_stage',
       'animal_mass',  'deployment_comments',
        'study_site']
new_vulture_info_df.head()

Unnamed: 0,tag_id,animal_id,animal_taxon,deploy_on_date,deploy_off_date,animal_comments,animal_life_stage,animal_mass,deployment_comments,study_site
0,42500,Butterball,Cathartes aura,2003-11-14 16:00:00.000,2004-03-14 20:00:01.000,migratory,adult,2372.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
1,52067,Irma,Cathartes aura,2004-09-06 17:00:00.000,2013-03-18 22:00:01.000,non-migratory,adult,2012.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
2,42500,Schaumboch,Cathartes aura,2004-10-08 15:00:00.000,2006-03-29 17:00:01.000,migratory,adult,1951.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
3,52069,Disney,Cathartes aura,2004-10-11 14:00:00.000,2011-10-18 23:00:01.000,migratory,adult,2108.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
4,57954,Prado,Cathartes aura,2005-11-02 15:00:00.000,2009-07-07 00:00:01.000,non-migratory,adult,1710.0,trapped in California using walk-in traps,West Coast of North America


In [18]:
new_vulture_info_df.count()

tag_id                 19
animal_id              19
animal_taxon           19
deploy_on_date         19
deploy_off_date        19
animal_comments        19
animal_life_stage      19
animal_mass            12
deployment_comments    19
study_site             19
dtype: int64

In [19]:
# drop rows with duplicate animal_id (tag_id is not unique based on groupby resulte above)
new_vulture_info_df = new_vulture_info_df.drop_duplicates(["animal_id"], keep='first')
new_vulture_info_df.head()

Unnamed: 0,tag_id,animal_id,animal_taxon,deploy_on_date,deploy_off_date,animal_comments,animal_life_stage,animal_mass,deployment_comments,study_site
0,42500,Butterball,Cathartes aura,2003-11-14 16:00:00.000,2004-03-14 20:00:01.000,migratory,adult,2372.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
1,52067,Irma,Cathartes aura,2004-09-06 17:00:00.000,2013-03-18 22:00:01.000,non-migratory,adult,2012.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
2,42500,Schaumboch,Cathartes aura,2004-10-08 15:00:00.000,2006-03-29 17:00:01.000,migratory,adult,1951.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
3,52069,Disney,Cathartes aura,2004-10-11 14:00:00.000,2011-10-18 23:00:01.000,migratory,adult,2108.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
4,57954,Prado,Cathartes aura,2005-11-02 15:00:00.000,2009-07-07 00:00:01.000,non-migratory,adult,1710.0,trapped in California using walk-in traps,West Coast of North America


In [20]:
new_vulture_info_df.count()

tag_id                 19
animal_id              19
animal_taxon           19
deploy_on_date         19
deploy_off_date        19
animal_comments        19
animal_life_stage      19
animal_mass            12
deployment_comments    19
study_site             19
dtype: int64

In [21]:
new_vulture_info_df.dtypes

tag_id                   int64
animal_id               object
animal_taxon            object
deploy_on_date          object
deploy_off_date         object
animal_comments         object
animal_life_stage       object
animal_mass            float64
deployment_comments     object
study_site              object
dtype: object

## 1.3 Load to DB

### 1.3.1 Connect to local database

In [22]:
database_path = "vulture_etl"
engine = create_engine(f"sqlite:///{database_path}")

### 1.3.2 Check for tables and creat new tables

In [23]:
engine.table_names()

[]

In [24]:
# ??? SQLite supports a limited subset of ALTER TABLE, add primary keys (https://stackoverflow.com/questions/969187/altering-sqlite-column-type-and-adding-pk-constraint)
# trying to create a table with primary key then load it. But the load part (pd.to_sql) keep giving error
engine.execute('CREATE TABLE IF NOT EXISTS "migration_paths" ('
               'event_id INTEGER NOT NULL,'
               'timestamp TEXT,'
               'location_long REAL,'
               'location_lat REAL,'
               'animal_taxon TEXT,'
               'tag_id INTEGER,'
               'animal_id TEXT,'
               'PRIMARY KEY (event_id));')

engine.execute('CREATE TABLE IF NOT EXISTS "vulture_detail" ('
               'tag_id INTEGER NOT NULL,'
               'animal_id TEXT,'
               'animal_taxon TEXT,'
               'deploy_on_date TEXT,'
               'deploy_off_date TEXT,'
               'animal_comments TEXT,'
               'animal_life_stage TEXT,' 
               'animal_mass REAL,'
               'deployment_comments TEXT,'
               'study_site TEXT,'
               'PRIMARY KEY (animal_id));')

#??? set foreign key

<sqlalchemy.engine.result.ResultProxy at 0x178a7e86390>

### 1.3.3 Use pandas to load csv converted DataFrames (migration_paths & vulture_detail) into database

In [25]:
pd.read_sql_query('select * from migration_paths', con=engine).count()

event_id         0
timestamp        0
location_long    0
location_lat     0
animal_taxon     0
tag_id           0
animal_id        0
dtype: int64

In [26]:
pd.read_sql_query('select * from vulture_detail', con=engine).count()

tag_id                 0
animal_id              0
animal_taxon           0
deploy_on_date         0
deploy_off_date        0
animal_comments        0
animal_life_stage      0
animal_mass            0
deployment_comments    0
study_site             0
dtype: int64

In [27]:
new_vulture_data_df.to_sql(name='migration_paths', con=engine, if_exists='append',index=False)
pd.read_sql_query('select * from migration_paths', con=engine).head()


# try:
#     new_vulture_data_df.to_sql(name='migration_paths', con=engine, if_exists='append',index=False)
#     pd.read_sql_query('select * from migration_paths', con=engine).head()
# except IntegrityError:
#     print("Duplicate loading or same primary key alread exists")
    
    
#     except IntegrityError:
#        pass

# for row in csvfile:
#    try:
#        cursor.execute('INSERT INTO X (Y) VALUES (%s)' % row[rowdatapoint])
#    except IntegrityError:
#        pass

# sql.execute("INSERT OR IGNORE INTO foo(data) VALUES(?);" ("Some text.", ))
# engine.execute(INSERT OR IGNORE INTO 'migration_paths' ('event_id', 'timestamp', 'location_long', 'location_lat',
#     'animal_taxon', 'tag_id','animal_id') VALUES (?, ?, ?, ?, ?, ?, ?), new_vulture_data_df)

# engine.execute('INSERT OR IGNORE INTO migration_paths (event_id, timestamp, location_long, location_lat, animal_taxon, tag_id, animal_id) VALUES (?, ?, ?, ?, ?, ?, ?), new_vulture_data_df')

Unnamed: 0,event_id,timestamp,location_long,location_lat,animal_taxon,tag_id,animal_id
0,283038789,2004-09-06 17:00:00.000,-75.28533,40.778,Cathartes aura,52067,Irma
1,283038790,2004-09-06 18:00:00.000,-75.28533,40.77817,Cathartes aura,52067,Irma
2,283038791,2004-09-06 19:00:00.000,-75.28933,40.77433,Cathartes aura,52067,Irma
3,283038792,2004-09-06 20:00:00.000,-75.289,40.77433,Cathartes aura,52067,Irma
4,283038793,2004-09-07 00:00:00.000,-75.289,40.77417,Cathartes aura,52067,Irma


In [28]:
new_vulture_info_df.to_sql(name='vulture_detail', con=engine, if_exists='append',index=False)
pd.read_sql_query('select * from vulture_detail', con=engine).head()

Unnamed: 0,tag_id,animal_id,animal_taxon,deploy_on_date,deploy_off_date,animal_comments,animal_life_stage,animal_mass,deployment_comments,study_site
0,42500,Butterball,Cathartes aura,2003-11-14 16:00:00.000,2004-03-14 20:00:01.000,migratory,adult,2372.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
1,52067,Irma,Cathartes aura,2004-09-06 17:00:00.000,2013-03-18 22:00:01.000,non-migratory,adult,2012.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
2,42500,Schaumboch,Cathartes aura,2004-10-08 15:00:00.000,2006-03-29 17:00:01.000,migratory,adult,1951.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
3,52069,Disney,Cathartes aura,2004-10-11 14:00:00.000,2011-10-18 23:00:01.000,migratory,adult,2108.0,trapped in Pennsylvania using padded-leg hold ...,East Coast of North America
4,57954,Prado,Cathartes aura,2005-11-02 15:00:00.000,2009-07-07 00:00:01.000,non-migratory,adult,1710.0,trapped in California using walk-in traps,West Coast of North America


### Confirm data has been added by querying the table head and count

In [29]:
pd.read_sql_query('select * from migration_paths', con=engine).count()

event_id         215719
timestamp        215719
location_long    215719
location_lat     215719
animal_taxon     215719
tag_id           215719
animal_id        215719
dtype: int64

In [30]:
pd.read_sql_query('select * from vulture_detail', con=engine).count()

tag_id                 19
animal_id              19
animal_taxon           19
deploy_on_date         19
deploy_off_date        19
animal_comments        19
animal_life_stage      19
animal_mass            12
deployment_comments    19
study_site             19
dtype: int64

# list of cities

In [31]:
from citipy import citipy


In [34]:
lats = new_vulture_data_df['location_lat'].values.tolist()
lngs = new_vulture_data_df['location_long'].values.tolist()
type(lngs)

list

In [35]:
lat_lngs = []
cities = []

In [36]:
lat_lngs = zip(lats, lngs)

# Identify nearest city for each lat, lng combination
for lat_lng in lat_lngs:
    city = citipy.nearest_city(lat_lng[0], lat_lng[1]).city_name
    cities.append(city)



In [37]:
len(cities)
cities[-1]

'pottsville'

In [39]:
cities_df = new_vulture_data_df[['event_id', 'location_lat', 'location_long']]
cities_df["cities"] = pd.DataFrame({"cities" : cities})
cities_df[['event_id', 'location_lat', 'location_long', 'cities']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,event_id,location_lat,location_long,cities
0,283203879,40.48933,-75.39717,emmaus
1,283203880,40.48933,-75.39717,emmaus
2,283203881,40.32467,-75.33317,lansdale
3,283203882,40.33983,-75.35617,lansdale
4,283203883,40.31550,-75.42650,lansdale
5,283203884,40.30383,-75.40500,lansdale
6,283203885,40.29567,-75.41133,lansdale
7,283203886,40.29567,-75.41167,lansdale
8,283203887,40.29600,-75.41183,lansdale
9,283203888,40.29583,-75.41167,lansdale
