# Import libraries, set options, connect to DB

In [14]:
# Configuration code for datawrangling
import pandas as pd
import os
import numpy as np
from datetime import datetime
from geocode import geocode
import mapToPoly
from mapToPoly import mapToPoly
pd.set_option('display.max_row', 30000)
import csv

# Configuration code in order to connect to the database
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from database_setup import Itenerary, Base

# The password comes from the environment so it never lives in the notebook.
from urllib.parse import quote_plus

passWord = os.environ['my_password']

# URL-escape the password so characters such as '@', '/' or ':' cannot corrupt
# the connection URL when it is parsed.
# The dialect name must be "postgresql": the legacy "postgres" alias is not
# accepted by SQLAlchemy 1.4 and later.
# This commented out one was how I connected to the remote database
# DATABASE_URI = 'postgresql://maxcarey:' + quote_plus(passWord) + '@totago.cqfm37jhmjmk.ap-southeast-2.rds.amazonaws.com:5432/totago'
DATABASE_URI = 'postgresql+psycopg2://maxcarey:' + quote_plus(passWord) + '@localhost:5432/totago'
engine = create_engine(DATABASE_URI)

#engine = create_engine('sqlite:///totagoData.db')

# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance
Base.metadata.bind = engine

# Session factory bound to the engine, and the single session used by the
# rest of the notebook for queries and inserts.
DBSession = sessionmaker(bind=engine)

session = DBSession()

# Read in data as pandas data frame, selecting only certain fields

In [15]:
# Columns to load from the CSV export; every other column is ignored.
fields = [
    'distinct_id',
    'numItinerariesReturned',
    'departureDate',
    'startFromLocation',
    'selectedDestination_id',
    'selectedDestination_name',
    'time',
    'user_id',
]

In [16]:
# Load only the columns listed in `fields`. selectedDestination_id is read as
# a string so it can be cleaned before being converted to an integer.
df = pd.read_csv(
    'modified_iten.csv',
    usecols=fields,
    dtype={"selectedDestination_id": "str"},
)

# Wrangle field: destinationIDs

In [17]:
# Replace all of the NAs for destinationIDs with '0'.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the attribute accessor: that form operates on a
# temporary Series and does not update `df` when pandas Copy-on-Write is active.
# The fill value is the string '0' because the column was read as strings.
df = df.assign(selectedDestination_id=df['selectedDestination_id'].fillna('0'))

## Remove the cases where the string says null
## Great tutorial here: https://www.youtube.com/watch?v=2AFGPdNn4FM
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise SettingWithCopyWarning.
df = df[df['selectedDestination_id'] != 'null'].copy()

## Convert destinationIDs column to an integer value
df['selectedDestination_id'] = df['selectedDestination_id'].astype(int)






# Wrangle field: numItinerariesReturned

In [18]:
# Replace all of the NAs for numItinerariesReturned with 1, then convert the
# column from float to integer.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on the attribute accessor acts on a temporary Series and
# may leave the NaNs in `df`, which would make astype(int) fail.
df['numItinerariesReturned'] = df['numItinerariesReturned'].fillna(1).astype(int)

# NOTE(review): the original comment here says to select only observations
# where this field is greater than 0, but no such filter is applied.


# Wrangle Field: Destination Name

In [19]:
# Replace missing destination names with an empty string so the column holds
# only strings.
# The filled Series is assigned back to the frame: fillna(inplace=True) on the
# attribute accessor acts on a temporary Series and is not guaranteed to
# update `df`.
df['selectedDestination_name'] = df['selectedDestination_name'].fillna("")

# Wrangle Field: departureDate

In [20]:
# Force every departureDate value to a string: an "object" dtype column does
# not guarantee that each value is a str.
df['departureDate'] = df['departureDate'].astype(str)

print("number of rows before removal of anamoulous departureDate cases")
print(df.shape[0])

# Some rows have a departureDate that is blank, the text 'nan', the value
# '24503', or the placeholder '[masked]'. None of these is a usable date, so
# those rows are dropped from the data frame.
unusableDepartureDates = ['', 'nan', '24503', '[masked]']
df = df[~df['departureDate'].isin(unusableDepartureDates)]

print("number of rows after removal of anamoulous departureDate cases")
print(df.shape[0])

# Create a function extractDate that extracts the first ten characters of an input string
def extractDate(dateString):
    """Return the first ten characters of ``dateString``.

    When the input is shorter than ten characters, the truncated value is
    also printed so unusually short entries are visible in the cell output.
    """
    datePart = dateString[:10]
    isShort = len(datePart) < 10
    if isShort:
        print(datePart)
    return datePart

# Code to test if the extractDate function works (kept for reference, not run):
#
#   # Apply this function to create a new column
#   df['departureDateFixed'] = df.departureDate.apply(extractDate)
#
#   cols = ['distinct_id', 'departureDate', 'departureDateFixed', 'numItinerariesReturned', 'selectedDestination_id', 'selectedDestination_name', 'startFromLocation']
#
#   df = df[cols]

# Overwrite departureDate with its first ten characters, dropping the trailing
# time-stamp portion of each value.
df['departureDate'] = df['departureDate'].map(extractDate)


# Convert departure date into a time object in pandas
#See here: https://stackoverflow.com/questions/26763344/convert-pandas-column-to-datetime
# Though this actually might not need to be done
#df['departureDate'] = df.departureDate.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))




#df['departureDate'] = datetime.strptime(df['departureDate'], '%Y-%m-%-d')  
#df['departureDate'] = pd.to_datetime(df['departureDate'], format = '%Y-%m-%-d')



number of rows before removal of anamoulous departureDate cases
35772
number of rows after removal of anamoulous departureDate cases
35764


# Wrangle Field: distinctID

In [21]:
# Build a database primary key by joining distinct_id and the unix time stamp
# with a hyphen.
timeAsText = df["time"].astype(str)
df["primary_key"] = df["distinct_id"] + "-" + timeAsText

# Occurrence count per key, and the array of distinct keys.
vc = df["primary_key"].value_counts()
unique_keys = df["primary_key"].unique()

# Wrangle user_id field

In [22]:
# Inspect the column dtypes before cleaning user_id.
df.dtypes

departureDate                object
distinct_id                  object
numItinerariesReturned        int64
selectedDestination_id        int64
selectedDestination_name     object
startFromLocation            object
time                          int64
user_id                     float64
primary_key                  object
dtype: object

In [23]:
# Missing user ids become 0, and the integer cast removes the trailing ".0"
# left by the float representation. The final cast to string keeps user_id
# consistent with the other fields in the database.
df['user_id'] = (
    df['user_id']
    .fillna(0)
    .astype(np.int64)
    .astype(str)
)


# Create a subset of the data to test the geocode and database entry logic

In [24]:
# Take the last 500 rows of the frame as the working subset; these entries
# will be added to the database in the next section.
sampleDf = df.iloc[-500:]

# Display every row of the subset.
sampleDf

Unnamed: 0,departureDate,distinct_id,numItinerariesReturned,selectedDestination_id,selectedDestination_name,startFromLocation,time,user_id,primary_key
35272,2019-08-25,16cc9a577f022c-06c8c30d9e489b-241e7926-49a10-1...,1,55,Big Cedar and Kennedy Falls,"4170 Nanaimo Street, Vancouver, British Columb...",1566725997,1,16cc9a577f022c-06c8c30d9e489b-241e7926-49a10-1...
35273,2019-09-01,16cc9a033c76a8-066d5420024f4b-7373e61-1fa400-1...,1,55,Big Cedar and Kennedy Falls,"Broadway - City Hall SkyTrain Station, 490 Wes...",1566726031,1,16cc9a033c76a8-066d5420024f4b-7373e61-1fa400-1...
35274,2019-08-25,16ad3094c52134-06f3f479aa20d3-353166-1fa400-16...,1,137,Lions Bay Loop,"Lonsdale Quay Market, 123 Carrie Cates Court, ...",1566727284,0,16ad3094c52134-06f3f479aa20d3-353166-1fa400-16...
35275,2019-08-25,16cc68c300425d-0407dc47305423-7f642d12-3d10d-1...,1,125,Cypress Falls,"1646 Frances Street, Vancouver, British Columb...",1566728642,0,16cc68c300425d-0407dc47305423-7f642d12-3d10d-1...
35276,2019-08-25,16cc68c300425d-0407dc47305423-7f642d12-3d10d-1...,1,125,Cypress Falls,"1646 Frances Street, Vancouver, British Columb...",1566729609,0,16cc68c300425d-0407dc47305423-7f642d12-3d10d-1...
35277,2019-08-25,16cc7b01484580-029ca6e019b7ba8-7f642d12-5a900-...,1,136,Lighthouse Park,"3347 West 8th Avenue, Vancouver, British Colum...",1566730042,0,16cc7b01484580-029ca6e019b7ba8-7f642d12-5a900-...
35278,2019-08-25,16cc9e5b83ea5-0fed81069a00528-3a515f22-3d10d-1...,1,139,Lynn Loop,"49.26031865282362,-123.05917879192539",1566730190,0,16cc9e5b83ea5-0fed81069a00528-3a515f22-3d10d-1...
35279,2019-08-25,16cc9e5b83ea5-0fed81069a00528-3a515f22-3d10d-1...,1,139,Lynn Loop,"49.26031865282362,-123.05917879192539",1566730302,0,16cc9e5b83ea5-0fed81069a00528-3a515f22-3d10d-1...
35280,2019-08-25,16cc066511daf8-027a1e822523d58-49183400-4b9600...,1,1154,Little Si,Residence Inn by Marriott Seattle Downtown/Con...,1566731649,827,16cc066511daf8-027a1e822523d58-49183400-4b9600...
35281,2019-09-08,16cc541e74a174-0af3b5bd2978dc-7373e61-e1000-16...,1,131,Grouse Grind,"Inn at the Quay, 900 Quayside Drive, New Westm...",1566733185,0,16cc541e74a174-0af3b5bd2978dc-7373e61-e1000-16...


## Read in the destination data to allow the possibility to pull the correct names

 


In [25]:
# Inspect the column dtypes of the subset that will be written to the database.
sampleDf.dtypes

departureDate               object
distinct_id                 object
numItinerariesReturned       int64
selectedDestination_id       int64
selectedDestination_name    object
startFromLocation           object
time                         int64
user_id                     object
primary_key                 object
dtype: object

# Loop through the rows in the dataframe, geocode, add entry to database

In [None]:
# Loop through the subsetted pandas data frame. For each row that is not
# already in the database: geocode the start location, map the coordinates to
# the postal-code and barrio polygons, and insert an Itenerary entry.
# Rows that cannot be geocoded are still stored, with valid=False and only the
# fields that come from the data frame.
for index, row in sampleDf.iterrows():

    # Pull out the primary key into a variable
    testKey = row["primary_key"]

    # Check to see if that key is already in the database
    # See this post: https://stackoverflow.com/questions/6587879/how-to-elegantly-check-the-existence-of-an-object-instance-variable-and-simultan?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
    entryExists = session.query(exists().where(Itenerary.distinctkey == testKey)).scalar()

    if entryExists:
        print("Entry already inside database")
        continue

    # Get the string to be geocoded
    locationToGeocode = row["startFromLocation"]

    # Try to run the geocode function that returns a dictionary of information.
    # Only Exception is caught so that KeyboardInterrupt / SystemExit can still
    # stop a long-running load.
    try:
        geocodeInfo = geocode(locationToGeocode)
        valid = True
    except Exception:
        valid = False

    # Fields taken straight from the data frame; used for every entry.
    entryFields = dict(
        distinctkey=testKey,
        numberitinerariesreturned=row["numItinerariesReturned"],
        selecteddestination_id=row["selectedDestination_id"],
        selecteddestination_name=row["selectedDestination_name"],
        startfromlocation=row["startFromLocation"],
        departuredate=row["departureDate"],
        userid=row["user_id"],
        valid=valid,
    )

    # If geocoding worked, add the returned geocode information as well
    if valid:

        # Sometimes, such as when a generic city is sent to the geocode()
        # function, a geometric center is returned and there is no postal
        # code. In this case the postalCode is set to "none".
        if 'postalCode' not in geocodeInfo:
            geocodeInfo['postalCode'] = "none"

        # Map the gps coordinates returned to the zip code polygons
        zipCodeInfo = mapToPoly(geocodeInfo['lat'], geocodeInfo['lng'], 'postal')

        if zipCodeInfo:
            zipCodeMapped = zipCodeInfo[0]
            region = zipCodeInfo[1]
        else:
            # Both values must be reset here. Otherwise `region` is undefined
            # on the first unmapped row, or silently carries over the region
            # of the previously processed row.
            zipCodeMapped = 'outsideRegion'
            region = 'outsideRegion'

        # Map the gps coordinates returned to the barrio polygons
        barrioInfo = mapToPoly(geocodeInfo['lat'], geocodeInfo['lng'], 'barrio')
        print(barrioInfo)

        if barrioInfo:
            barrioMapped = barrioInfo
        else:
            barrioMapped = 'outsideRegion'

        ## Get selected Destination Names
        # Pull the selected destination name
        selectedDestinationName = row["selectedDestination_name"]

        # TODO: an earlier draft replaced an empty destination name by looking
        # up selectedDestination_id in a `destinations` dictionary, marking the
        # entry "DELETED" and invalid when the id was absent. That dictionary is
        # not defined in the visible notebook, so the lookup is not active.

        entryFields.update(
            selecteddestination_name=selectedDestinationName,
            # Data from the python dictionary returned by the geocode() function
            formatted_address=geocodeInfo['formatted_address'],
            lat=geocodeInfo['lat'],
            lng=geocodeInfo['lng'],
            postalcode=geocodeInfo['postalCode'],
            postalcodemapped=zipCodeMapped,
            barriomapped=barrioMapped,
            region=region,
        )

    databaseEntry = Itenerary(**entryFields)

    # Add the information to the database. Roll back on failure so the
    # session is left in a usable state, then re-raise so the error is visible.
    session.add(databaseEntry)
    try:
        session.commit()
    except Exception:
        session.rollback()
        raise

{'formatted_address': '4170 Nanaimo St, Vancouver, BC V5R 1K6, Canada', 'lat': 49.2475892, 'lng': -123.0561743, 'postalCode': 'V5R 1K6'}
RC
RC
{'formatted_address': 'Broadway-City Hall, Vancouver, BC V5Z 2V2, Canada', 'lat': 49.26284010000001, 'lng': -123.1145156, 'postalCode': 'V5Z 2V2'}
MP
MP
{'formatted_address': '123 Carrie Cates Ct, North Vancouver, BC V7M 3K7, Canada', 'lat': 49.3103686, 'lng': -123.0818115, 'postalCode': 'V7M 3K7'}
None
{'formatted_address': '1646 Frances St, Vancouver, BC V5L 1Z4, Canada', 'lat': 49.2792257, 'lng': -123.0717029, 'postalCode': 'V5L 1Z4'}
GW
GW
{'formatted_address': '1646 Frances St, Vancouver, BC V5L 1Z4, Canada', 'lat': 49.2792257, 'lng': -123.0717029, 'postalCode': 'V5L 1Z4'}
GW
GW
{'formatted_address': '3347 W 8th Ave, Vancouver, BC V6R 1Y3, Canada', 'lat': 49.26526310000001, 'lng': -123.1789923, 'postalCode': 'V6R 1Y3'}
KITS
KITS
{'formatted_address': '2232 E 11th Ave, Vancouver, BC V5N 1Z6, Canada', 'lat': 49.2602857, 'lng': -123.0592451, '

CBD
CBD
{'formatted_address': '40 Powell St, Vancouver, BC V6A 1E7, Canada', 'lat': 49.2831758, 'lng': -123.1032851, 'postalCode': 'V6A 1E7'}
CBD
CBD
{'formatted_address': '14505 84 Ave, Surrey, BC V3S 8X2, Canada', 'lat': 49.1568044, 'lng': -122.8202605, 'postalCode': 'V3S 8X2'}
None
{'formatted_address': 'Surrey, BC V3S 9K1, Canada', 'lat': 49.1546114, 'lng': -122.8172354, 'postalCode': 'V3S 9K1'}
None
{'formatted_address': 'Surrey, BC V3S 9K1, Canada', 'lat': 49.1546114, 'lng': -122.8172354, 'postalCode': 'V3S 9K1'}
None
{'formatted_address': 'Surrey, BC V3S 9K1, Canada', 'lat': 49.1546114, 'lng': -122.8172354, 'postalCode': 'V3S 9K1'}
None
{'formatted_address': 'Surrey, BC V3S 9K1, Canada', 'lat': 49.1546114, 'lng': -122.8172354, 'postalCode': 'V3S 9K1'}
None
{'formatted_address': '9855 Austin Ave, Burnaby, BC V3J 1N4, Canada', 'lat': 49.2502266, 'lng': -122.8958023, 'postalCode': 'V3J 1N4'}
None
{'formatted_address': 'Vancouver, BC V7Y 1C6, Canada', 'lat': 49.2830972, 'lng': -123.

{'formatted_address': 'Seattle, WA 98112, USA', 'lat': 47.6329523, 'lng': -122.2891887, 'postalCode': '98112'}
250149
250149
250149
{'formatted_address': 'Seattle, WA 98112, USA', 'lat': 47.6329523, 'lng': -122.2891887, 'postalCode': '98112'}
250149
250149
250149
{'formatted_address': 'Seattle, WA 98112, USA', 'lat': 47.6329523, 'lng': -122.2891887, 'postalCode': '98112'}
250149
250149
250149
{'formatted_address': '4219 S Othello St, Seattle, WA 98118, USA', 'lat': 47.5364491, 'lng': -122.2800925, 'postalCode': '98118'}
250146
250146
250146
{'formatted_address': '360 W Georgia St, Vancouver, BC V6B 6B2, Canada', 'lat': 49.2797729, 'lng': -123.1155495, 'postalCode': 'V6B 6B2'}
CBD
CBD
{'formatted_address': '2124 4th Ave, Seattle, WA 98121, USA', 'lat': 47.6145977, 'lng': -122.3421704, 'postalCode': '98121'}
271808
271808
271808
{'formatted_address': '1601 5th Ave, Seattle, WA 98101, USA', 'lat': 47.612441, 'lng': -122.337463, 'postalCode': '98101'}
271849
271849
271849
{'formatted_addre

None
{'formatted_address': 'Mercer Island, WA 98040, USA', 'lat': 47.5706548, 'lng': -122.2220674, 'postalCode': '98040'}
None
{'formatted_address': 'Mercer Island, WA 98040, USA', 'lat': 47.5706548, 'lng': -122.2220674, 'postalCode': '98040'}
None
{'formatted_address': 'Mercer Island, WA 98040, USA', 'lat': 47.5706548, 'lng': -122.2220674, 'postalCode': '98040'}
None
{'formatted_address': 'Mercer Island, WA 98040, USA', 'lat': 47.5706548, 'lng': -122.2220674, 'postalCode': '98040'}
None
{'formatted_address': 'Mercer Island, WA 98040, USA', 'lat': 47.5706548, 'lng': -122.2220674, 'postalCode': '98040'}
None
{'formatted_address': 'Mercer Island, WA 98040, USA', 'lat': 47.5706548, 'lng': -122.2220674, 'postalCode': '98040'}
None
{'formatted_address': '1505 E Jefferson St, Seattle, WA 98122, USA', 'lat': 47.605939, 'lng': -122.312454, 'postalCode': '98122'}
271921
271921
271921
{'formatted_address': '1505 E Jefferson St, Seattle, WA 98122, USA', 'lat': 47.605939, 'lng': -122.312454, 'post