In [1]:
import pandas as pd
from sqlalchemy import MetaData, Table, create_engine, inspect, text

# 1. Create the initial DB (clean and load)

## 1.1 Prepare Migration Dataset using pandas

### 1.1.1 Read the data

In [None]:
# Load the migration workbook into a DataFrame (no column is used as the index).
excel_file = "Resources/Turkey vultures in North and South America - migration.xlsx"
vulture_data_df = pd.read_excel(io=excel_file, index_col=None)

# Show the first two rows as a quick check of the import.
vulture_data_df.head(n=2)

In [None]:
# List the column labels of the raw migration data
# (DataFrame.keys() is an alias for the same Index).
vulture_data_df.columns

### 1.1.2 Select columns, and change column names

In [None]:
# Keep only the columns needed for the migration_paths table; .copy() makes
# the result independent of the raw frame.
new_vulture_data_df = vulture_data_df.loc[
    :,
    [
        'event-id',
        'timestamp',
        'location-long',
        'location-lat',
        'individual-taxon-canonical-name',
        'tag-local-identifier',
        'individual-local-identifier',
    ],
].copy()
new_vulture_data_df.head()

In [None]:
# Rename the columns: '-' is awkward in SQL and many other systems, so it is
# replaced by '_'. The three identifier columns also take the names used by
# the reference (info) dataset: animal_taxon, tag_id and animal_id.
new_vulture_data_df = new_vulture_data_df.rename(
    columns={
        'event-id': 'event_id',
        'timestamp': 'timestamp',
        'location-long': 'location_long',
        'location-lat': 'location_lat',
        'individual-taxon-canonical-name': 'animal_taxon',
        'tag-local-identifier': 'tag_id',
        'individual-local-identifier': 'animal_id',
    }
)
new_vulture_data_df.head()

### 1.1.3 Clean DataFrame: drop NA, filter

In [None]:
# Non-null count per column before cleaning (reveals which columns have gaps).
new_vulture_data_df.count()

In [None]:
# Drop every row that has a missing value in ANY of the selected columns
# (how="any" with no subset), which includes rows lacking longitude/latitude.
new_vulture_data_df = new_vulture_data_df.dropna(how="any")
# Non-null count per column after the drop.
new_vulture_data_df.count()

In [None]:
# Keep only the turkey vulture (Cathartes aura) records.
new_vulture_data_df = new_vulture_data_df[
    new_vulture_data_df['animal_taxon'].eq("Cathartes aura")
]
new_vulture_data_df.count()

In [None]:
# new_vulture_data_df.set_index("event_id")

In [None]:
# Column dtypes of the cleaned frame, checked before it is written to the database.
new_vulture_data_df.dtypes

## 1.2 Prepare Vulture Info Dataset using pandas

In [None]:
# Load the reference (per-animal) data; low_memory=False makes pandas parse
# the file in one pass instead of in chunks.
csv_file = "Resources/Turkey vultures in North and South America-reference-data.csv"
vulture_info_df = pd.read_csv(filepath_or_buffer=csv_file, low_memory=False)
vulture_info_df.head()

In [None]:
# Get column names of the raw reference data (used to pick the columns kept below)
vulture_info_df.columns

In [None]:
# Keep only the reference columns needed for the vulture_detail table;
# .copy() makes the result independent of the raw frame.
new_vulture_info_df = vulture_info_df.loc[
    :,
    [
        'tag-id',
        'animal-id',
        'animal-taxon',
        'deploy-on-date',
        'deploy-off-date',
        'animal-comments',
        'animal-life-stage',
        'animal-mass',
        'deployment-comments',
        'study-site',
    ],
].copy()
new_vulture_info_df.head(n=1)

In [None]:
# Change column names ('-' to '_'): every selected reference column differs
# from its target name only by the hyphens, so a literal replace is enough.
new_vulture_info_df.columns = new_vulture_info_df.columns.str.replace('-', '_', regex=False)

# Preview the frame that was just renamed (this cell previously displayed
# new_vulture_data_df, the migration frame, by mistake).
new_vulture_info_df.head()

In [None]:
# Non-null count per column of the reference data.
new_vulture_info_df.count()

## 1.3 Create and Load to DB

### 1.3.1 Connect to local database

In [None]:
# SQLite database file; with the three-slash URL form the path is relative
# to the current working directory.
database_path = "vulture_etl"
engine = create_engine("sqlite:///" + database_path)

### 1.3.2 Check for tables and create new tables

In [None]:
# List the tables that already exist in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API works on both.
inspect(engine).get_table_names()

# TODO: SQLite supports only a limited subset of ALTER TABLE and cannot add a
# primary key to an existing table
# (https://stackoverflow.com/questions/969187/altering-sqlite-column-type-and-adding-pk-constraint).
# Creating the table with a primary key first and then loading it with
# DataFrame.to_sql kept raising an error.
# NOTE(review): the DDL below names its last three columns
# individual_taxon_canonical_name / tag_local_identifier /
# individual_local_identifier, while the DataFrame being loaded uses
# animal_taxon / tag_id / animal_id -- that mismatch is a likely cause of the
# to_sql error; confirm before re-enabling.

# engine.execute('CREATE TABLE IF NOT EXISTS "migration_paths" ('
#                'event_id INTEGER NOT NULL,'
#                'timestamp VARCHAR,'
#                'location_long DECIMAL(3,5),'
#                'location_lat DECIMAL(3,5),'
#                'individual_taxon_canonical_name VARCHAR,'
#                'tag_local_identifier INTEGER NOT NULL,'
#                'individual_local_identifier VARCHAR,'
#                'UNIQUE (event_id),'
#                'PRIMARY KEY (event_id));')

# pd.read_sql_query('select * from migration_paths', con=engine).head()

# TODO: add an if/else guard so the tables are not built twice.
# TODO: set foreign keys.

### 1.3.3 Use pandas to load csv converted DataFrame into database

In [None]:
# Write the cleaned migration records to the `migration_paths` table.
# if_exists='replace' rebuilds the table on every run of this initial load;
# with 'append', re-running the notebook stored every row a second time.
new_vulture_data_df.to_sql(name='migration_paths', con=engine, if_exists='replace', index=False)

In [None]:
# Write the reference data to the `vulture_detail` table.
# if_exists='replace' rebuilds the table on every run of this initial load;
# with 'append', re-running the notebook stored every row a second time.
new_vulture_info_df.to_sql(name='vulture_detail', con=engine, if_exists='replace', index=False)

In [None]:
# pd.read_sql_query('SELECT sql FROM sqlite_master WHERE name='migration_paths')
                

In [None]:

# SQLite's ALTER TABLE cannot add a PRIMARY KEY to an existing table, so the
# uniqueness of event_id is enforced with a unique index instead.
# (The previous statement was also a Python syntax error because of the nested
# single quotes, and it named the column 'event-id' although the table
# column is event_id.)
# IF NOT EXISTS keeps the cell safe to re-run; engine.begin() commits on success.
with engine.begin() as connection:
    connection.execute(
        text(
            "CREATE UNIQUE INDEX IF NOT EXISTS ux_migration_paths_event_id "
            "ON migration_paths (event_id)"
        )
    )

In [None]:
# Show the DDL SQLite has stored for migration_paths and for any index on it.
# This replaces an `ALTER TABLE ... ADD PRIMARY (...)` attempt, which is not
# valid SQLite (ALTER TABLE can only rename, or add/drop/rename columns) and
# raised on every run.
pd.read_sql_query(
    "SELECT type, name, sql FROM sqlite_master WHERE tbl_name = 'migration_paths'",
    con=engine,
)

In [None]:
# Read migration_paths back from the database and preview the first rows.
pd.read_sql_query('select * from migration_paths', con=engine).head()

In [None]:
# Non-null count per column of the stored table.
# Note: this reads the whole table into a DataFrame before counting.
pd.read_sql_query('select * from migration_paths', con=engine).count()

# 2. Load New Data from another Source (Acopian Center)

## 2.1 Migration path data

### 2.1.1 Read the data

In [None]:
# Load the Acopian Center export; this rebinds vulture_data_df, so the frame
# from section 1 is no longer reachable under that name.
csv_file = "Resources/Vultures_Acopian_Center_USA_2003-2016.csv"
vulture_data_df = pd.read_csv(filepath_or_buffer=csv_file, index_col=None, low_memory=False)

# Show the first two rows as a quick check of the import.
vulture_data_df.head(n=2)

In [None]:
# List the column labels of the raw Acopian Center data
# (DataFrame.keys() is an alias for the same Index).
vulture_data_df.columns

### 2.1.2 Select columns, and change column names

In [None]:
# Keep the same seven columns as for the first data source; .copy() makes the
# result independent of the raw frame.
new_vulture_data_df = vulture_data_df.loc[
    :,
    [
        'event-id',
        'timestamp',
        'location-long',
        'location-lat',
        'individual-taxon-canonical-name',
        'tag-local-identifier',
        'individual-local-identifier',
    ],
].copy()
new_vulture_data_df.head()

In [None]:
# Rename the columns: '-' is awkward in SQL and many other systems, so it is
# replaced by '_'. The three identifier columns also take the names used by
# the reference (info) dataset: animal_taxon, tag_id and animal_id.
new_vulture_data_df = new_vulture_data_df.rename(
    columns={
        'event-id': 'event_id',
        'timestamp': 'timestamp',
        'location-long': 'location_long',
        'location-lat': 'location_lat',
        'individual-taxon-canonical-name': 'animal_taxon',
        'tag-local-identifier': 'tag_id',
        'individual-local-identifier': 'animal_id',
    }
)
new_vulture_data_df.head()

### 2.1.3 Clean DataFrame: drop NA, filter

In [None]:
# Non-null count per column before cleaning (reveals which columns have gaps).
new_vulture_data_df.count()

In [None]:
# Drop every row that has a missing value in ANY of the selected columns
# (how="any" with no subset), which includes rows lacking longitude/latitude.
new_vulture_data_df = new_vulture_data_df.dropna(how="any")
# Non-null count per column after the drop.
new_vulture_data_df.count()

In [None]:
# Keep only the turkey vulture (Cathartes aura) records.
new_vulture_data_df = new_vulture_data_df[
    new_vulture_data_df['animal_taxon'].eq("Cathartes aura")
]
new_vulture_data_df.count()

In [None]:
# new_vulture_data_df.set_index("event_id")

In [None]:
# Column dtypes of the cleaned Acopian Center frame.
new_vulture_data_df.dtypes

### Confirm data has been added by querying the migration_paths table
* NOTE: can also check using a SQLite client (e.g. the `sqlite3` CLI or DB Browser for SQLite)

In [None]:
# Read migration_paths back from the database and preview the first rows.
pd.read_sql_query('select * from migration_paths', con=engine).head()

### Confirm data has been added by querying the vulture_detail table

In [None]:
# Read vulture_detail back from the database and preview the first rows.
pd.read_sql_query('select * from vulture_detail', con=engine).head()

In [None]:
# List the tables currently in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API works on both.
inspect(engine).get_table_names()

In [None]:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session

# Open a connection on the engine (kept in `conn`).
conn = engine.connect()

# Declarative base; no mapped classes are declared in this cell, so
# create_all() has no model tables to emit here.
Base = declarative_base()

# ORM session bound to the same engine.
session = Session(bind=engine)
Base.metadata.create_all(bind=engine)

In [None]:

# Commit the session's current transaction.
# NOTE(review): no objects are added to the session in the cells shown, so
# this is presumably a no-op here -- confirm whether it is still needed.
session.commit()

In [None]:
# Reflect the table written by pandas so it can be queried through the session.
# The bare name `migration_paths` was never defined (NameError), and the column
# is `animal_id` -- `migration_paths.animal-id` parses as a subtraction.
migration_paths = Table("migration_paths", MetaData(), autoload_with=engine)

# Print each distinct animal id once instead of once per stored GPS fix.
names = session.query(migration_paths.c.animal_id).distinct()
for name in names:
    print(name.animal_id)

## Data from Acopian Center

### List of nearest cities

In [None]:
from citipy import citipy


In [None]:
# Pull latitude and longitude out as plain Python lists.
# The columns were renamed to snake_case earlier, so the hyphenated originals
# ('location-lat', 'location-long') no longer exist and raised KeyError.
lats = new_vulture_data_df['location_lat'].tolist()
lngs = new_vulture_data_df['location_long'].tolist()
type(lngs)

In [None]:
# Start with empty containers for the coordinate pairs and the city names.
lat_lngs, cities = [], []

In [None]:
# Pair each latitude with its longitude; materialised as a list so the pairs
# can still be read after this cell (a bare zip is exhausted after one pass).
lat_lngs = list(zip(lats, lngs))

# Identify nearest city for each lat, lng combination.
# The list is built in one expression, so re-running this cell replaces
# `cities` instead of appending a second copy of every name.
cities = [citipy.nearest_city(lat, lng).city_name for lat, lng in lat_lngs]



In [None]:
# Show both the number of cities and the last one. A notebook cell displays
# only its final expression, so the bare `len(cities)` line was discarded.
# The guard avoids an IndexError when the list is empty.
len(cities), (cities[-1] if cities else None)

In [None]:
# Attach the nearest-city names to the event coordinates.
# - Uses the snake_case column names; the hyphenated ones raised KeyError.
# - .assign() with a list adds the values positionally and returns a new frame.
#   Assigning a DataFrame built from `cities` would align on index labels,
#   and the filtered frame's index no longer runs 0..n-1.
# - .assign() raises ValueError if len(cities) differs from the number of rows.
cities_df = new_vulture_data_df[['event_id', 'location_lat', 'location_long']].assign(
    cities=cities
)
cities_df.head()
