# Import libraries, set options, connect to DB

In [118]:
# Configuration code for datawrangling
import pandas as pd
import os
import numpy as np
from datetime import datetime
from geocode import geocode
import mapToPoly
from mapToPoly import mapToPoly
# Show up to 30,000 rows when a frame is displayed. The option is spelled out
# in full ('display.max_rows'); abbreviated keys only work through pattern
# matching and break if another option ever matches the same prefix.
pd.set_option('display.max_rows', 30000)
import csv

# Configuration code in order to connect to the database
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from database_setup import Itenerary, Base

# The database password comes from the environment so it is never stored in the notebook
passWord = os.environ['my_password']
# This commented out one was how I connected to the remote database
# DATABASE_URI = 'postgresql://maxcarey:' + passWord + '@totago.cqfm37jhmjmk.ap-southeast-2.rds.amazonaws.com:5432/totago'

# Use the 'postgresql' dialect name: the legacy 'postgres' alias is rejected by
# SQLAlchemy 1.4+ (NoSuchModuleError), while 'postgresql' works on every version.
DATABASE_URI = 'postgresql+psycopg2://maxcarey:' + passWord + '@localhost:5432/totago'
engine = create_engine(DATABASE_URI)

#engine = create_engine('sqlite:///totagoData.db')

# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance
Base.metadata.bind = engine

# Session factory bound to the engine, and the single session used by the cells below
DBSession = sessionmaker(bind=engine)

session = DBSession()

# Read in data as pandas data frame, selecting only certain fields

In [119]:
# Columns to load from the itinerary export; every other column is ignored
fields = [
    'distinct_id',
    'numItinerariesReturned',
    'departureDate',
    'startFromLocation',
    'selectedDestination_id',
    'selectedDestination_name',
    'time',
    'user_id',
]

In [120]:
# Load the itinerary export, restricted to the columns named in `fields`
df = pd.read_csv(
    'generated_itineraries.csv',
    usecols=fields,
)

  interactivity=interactivity, compiler=compiler, result=result)


# Wrangle field: destinationIDs

In [121]:
# Replace all of the NAs for destinationIDs with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the attribute-accessed column is a chained in-place operation, which acts on
# a temporary under pandas copy-on-write and leaves df unchanged.
df['selectedDestination_id'] = df['selectedDestination_id'].fillna(0)

# Remove the 2 cases where the string says null
# Great tutorial here: https://www.youtube.com/watch?v=2AFGPdNn4FM
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger SettingWithCopyWarning.
df = df[df['selectedDestination_id'] != 'null'].copy()

# Convert destinationIDs column to an integer value
df['selectedDestination_id'] = df['selectedDestination_id'].astype(int)



# Wrangle field: numItinerariesReturned

In [122]:
# Replace all of the NAs for numItinerariesReturned with 1.
# Assigned back rather than using fillna(inplace=True) on the attribute-accessed
# column, which does not update df under pandas copy-on-write.
df['numItinerariesReturned'] = df['numItinerariesReturned'].fillna(1)

# Convert from float to integer
df['numItinerariesReturned'] = df['numItinerariesReturned'].astype(int)

# TODO: the original note here planned to keep only observations where this
# field is greater than 0, but no such filter was ever applied.


# Wrangle Field: Destination Name

In [123]:
# Replace missing destination names with an empty string so later cells can
# test the name for truthiness. Assigned back rather than using
# fillna(inplace=True) on the attribute-accessed column, which does not update
# df under pandas copy-on-write.
df['selectedDestination_name'] = df['selectedDestination_name'].fillna("")

# Wrangle Field: departureDate

In [124]:
# Cast departureDate to str so every value can be compared as text below
# (an `object` dtype column is not guaranteed to hold only strings).
df['departureDate'] = df['departureDate'].astype(str)

print("number of rows before removal of anamoulous departureDate cases")
print(len(df))

# Discard rows whose departureDate is unusable: blank, the text 'nan',
# the stray value '24503', or the placeholder '[masked]'.
df = df[~df['departureDate'].isin(['', 'nan', '24503', '[masked]'])]

print("number of rows after removal of anamoulous departureDate cases")
print(len(df))

# Create a function extractDate that extracts the first ten characters of an input string
def extractDate(dateString):
    """Return the first ten characters of ``dateString``.

    For a timestamp such as ``2019-08-18T07:00:00`` this is the calendar date.
    Inputs shorter than ten characters are returned unchanged and are also
    printed, so anomalous values show up in the cell output.
    """
    datePart = dateString[:10]
    if not len(datePart) >= 10:
        print(datePart)
    return datePart

# Disabled test scaffold: the triple-quoted string below is a bare expression
# that is never executed. It shows how extractDate was checked by writing its
# result into a separate departureDateFixed column next to the original.
''' Code to test if the extractDate function works

# Apply this function to create  a new column
df['departureDateFixed'] = df.departureDate.apply(extractDate)

cols = ['distinct_id', 'departureDate', 'departureDateFixed', 'numItinerariesReturned', 'selectedDestination_id', 'selectedDestination_name', 'startFromLocation']

df = df[cols]
'''

# Overwrite departureDate with its first ten characters, dropping whatever
# follows the date part of the original value
df['departureDate'] = df['departureDate'].map(extractDate)

# The column is left as a string. If a datetime dtype is ever needed, see
# https://stackoverflow.com/questions/26763344/convert-pandas-column-to-datetime
# e.g. pd.to_datetime(df['departureDate'], format='%Y-%m-%d')



number of rows before removal of anamoulous departureDate cases
35770
number of rows after removal of anamoulous departureDate cases
35762


# Wrangle Field: distinctID

In [125]:
# Build the database primary key: distinct_id and the unix timestamp joined by "-"
df["primary_key"] = (
    df["distinct_id"]
    + "-"
    + df["time"].map(str)
)

# Occurrence count per key, and the array of distinct keys
vc = df["primary_key"].value_counts()
unique_keys = df["primary_key"].unique()

# Wrangle user_id field

In [126]:
# Display the dtype of every column to check the conversions made so far
df.dtypes

departureDate                object
distinct_id                  object
numItinerariesReturned        int64
selectedDestination_id        int64
selectedDestination_name     object
startFromLocation            object
time                          int64
user_id                     float64
primary_key                  object
dtype: object

In [127]:
# Missing user ids become 0, the float values are cast to int64 (which removes
# the trailing ".0"), and the result is stored as a string to be consistent
# with the other fields in the database.
df['user_id'] = (
    df['user_id']
    .fillna(0)
    .astype(np.int64)
    .astype(str)
)


# Create a subset of the data (the last 1,000 rows) to test the geocode and database entry logic

In [128]:
# Take the last 1,000 rows of the frame (a fixed slice, not a random sample);
# these entries will be added to the database in the next section.
# .copy() makes the subset independent of df.
sampleDf = df.tail(1000).copy()

# Preview the first rows only instead of dumping all 1,000 into the output
sampleDf.head(10)

Unnamed: 0,departureDate,distinct_id,numItinerariesReturned,selectedDestination_id,selectedDestination_name,startFromLocation,time,user_id,primary_key
34772,2019-08-18,16c90966d38715-0fc7a6ac04dbb38-6427872-3d10d-1...,1,0,,"47.613858998711265,-122.31868693277117",1565768716,827,16c90966d38715-0fc7a6ac04dbb38-6427872-3d10d-1...
34773,2019-08-14,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...,1,0,,Panpacific hotel,1565771855,0,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...
34774,2019-08-14,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...,1,0,,"Canada Place Pier, 100 The Pointe, 999 Canada ...",1565772016,0,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...
34775,2019-08-14,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...,1,0,,"Canada Place Pier, 100 The Pointe, 999 Canada ...",1565772140,0,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...
34776,2019-08-14,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...,1,0,,"Pan Pacific Hotel Bar, Vancouver, British Colu...",1565773832,0,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...
34777,2019-08-18,16c9111838b4e-09fab60505ceac-7373e61-1fa400-16...,1,0,,"Metrotown, V5J 1C8, Burnaby, British Columbia,...",1565776771,0,16c9111838b4e-09fab60505ceac-7373e61-1fa400-16...
34778,2019-08-14,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...,1,0,,"Pan Pacific Hotel Bar, Vancouver, British Colu...",1565780699,0,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...
34779,2019-08-14,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...,1,0,,"Canada Place Pier, 100 The Pointe, 999 Canada ...",1565781738,0,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...
34780,2019-08-14,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...,1,0,,"Canada Place, 999 Canada Pl, Vancouver, Britis...",1565781975,0,16c90c54e451d1-0c3f7ff58d47518-10073e7b-c0000-...
34781,2019-08-14,16c91661d101bd-09ac8e84e9b3e38-4a5a67-13c680-1...,1,0,,"Portland, Oregon, United States",1565782393,949,16c91661d101bd-09ac8e84e9b3e38-4a5a67-13c680-1...


## Read in the destination data to allow the possibility to pull the correct names

 


In [129]:
# Build a lookup of destination id (as a string) -> {'name': destination name}
# from the destinations mapping export.
destinations = {}

# The with-block closes the file even if reading fails part-way through
with open("destinations_mapping_Jul-30-18.csv") as f:
    reader = csv.reader(f)

    # Skip the header row; the default keeps an empty file from raising StopIteration
    next(reader, None)

    for row in reader:
        # Blank or truncated lines carry no id/name pair, so skip them
        if len(row) < 2:
            continue
        destinations[row[0]] = {'name': row[1]}

print(destinations)

{'168': {'name': 'Mammoth Pass - Crater Meadow Trail'}, '178': {'name': 'Sport Climbing at Horseshoe Slabs'}, '181': {'name': 'Hazel Wolf Wetlands Loop'}, '183': {'name': 'Redwood Grove Loop Trail'}, '184': {'name': 'Birdwatching at Fowlsheugh Reserve'}, '12': {'name': 'Point Defiance Loop (OLD)'}, '112': {'name': 'Baden Powell Lynn Canyon to Grouse'}, '175': {'name': 'Rainbow Falls Trail'}, '174': {'name': 'Walking the Town Loop to Sherwins Vista'}, '20': {'name': 'The Lions Binkert Trail'}, '27': {'name': 'Tumamoc Hill'}, '8': {'name': 'Proximity Alert Test'}, '169': {'name': 'McLeod Lake Spur'}, '165': {'name': 'Siskiyou Mountain (White Rabbit Trail)'}, '22': {'name': 'Lake Blanca'}, '11': {'name': 'Carkeek Park'}, '162': {'name': 'Rouge Park Traverse'}, '172': {'name': 'Walking the Lakes Basin Path at Twin Lakes'}, '16': {'name': 'Golden Gardens to Carkeek Park Beach Walk'}, '111': {'name': 'Baden Powell Deep Cove to Lynn Canyon'}, '278': {'name': 'Tujunga Wash Path'}, '13': {'name

# Loop through the rows in the dataframe, geocode, add entry to database

In [None]:
# Walk the sample data frame row by row. For every itinerary that is not yet in
# the database: geocode its start location, map the coordinates onto the postal
# code and barrio polygons, resolve the destination name, and insert one
# Itenerary record. Rows that cannot be geocoded are stored with valid=False
# and only the fields available from the data frame.
for index, row in sampleDf.iterrows():

    # Pull out the primary key into a variable
    testKey = row["primary_key"]

    # Check to see if that key is already in the database, so the cell can be re-run
    # See this post: https://stackoverflow.com/questions/6587879/how-to-elegantly-check-the-existence-of-an-object-instance-variable-and-simultan?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
    entryExists = session.query(exists().where(Itenerary.distinctkey == testKey)).scalar()

    if entryExists:
        print("Entry already inside database")
        continue

    # Fields that come straight from the data frame; they are stored whether or
    # not geocoding succeeds
    entryFields = {
        'distinctkey': row["primary_key"],
        'numberitinerariesreturned': row["numItinerariesReturned"],
        'selecteddestination_id': row["selectedDestination_id"],
        'selecteddestination_name': row["selectedDestination_name"],
        'startfromlocation': row["startFromLocation"],
        'departuredate': row["departureDate"],
        'userid': row["user_id"],
    }

    # Get the string to be geocoded
    locationToGeocode = row["startFromLocation"]

    # Try to run the geocode function that returns a dictionary of information.
    # Only ordinary errors are absorbed (a bare except would also swallow
    # KeyboardInterrupt), and the failure is reported instead of hidden.
    try:
        geocodeInfo = geocode(locationToGeocode)
    except Exception as geocodeError:
        print("Geocoding failed for {!r}: {}".format(locationToGeocode, geocodeError))
        geocodeInfo = None

    # An empty or missing result is treated the same as a failed lookup
    valid = bool(geocodeInfo)

    if valid:

        # Sometimes, such as when a generic city is sent to the geocode() function,
        # a geometric center is returned; this means there is no postal code.
        # In this case the postalCode is set to the string "none".
        if 'postalCode' not in geocodeInfo:
            geocodeInfo['postalCode'] = "none"

        lat = geocodeInfo['lat']
        lng = geocodeInfo['lng']

        # Region defaults to 'outsideRegion' and is replaced by whichever polygon
        # layer matches. Previously the barrio branch always overwrote the region,
        # so a successful postal match was reset to 'outsideRegion' whenever the
        # barrio lookup missed.
        region = 'outsideRegion'

        # Map the gps coordinates returned to the zip code polygons
        zipCodeInfo = mapToPoly(lat, lng, 'postal')
        print(zipCodeInfo)

        if zipCodeInfo:
            zipCodeMapped = zipCodeInfo[0]
            region = zipCodeInfo[1]
        else:
            zipCodeMapped = 'outsideRegion'

        # Map the same coordinates to the barrio polygons; a barrio match takes
        # precedence for the region, as it did before
        barrioInfo = mapToPoly(lat, lng, 'barrio')

        if barrioInfo:
            barrioMapped = barrioInfo[0]
            region = barrioInfo[1]
        else:
            barrioMapped = 'outsideRegion'

        ## Get selected Destination Names
        # Pull the selected destination name
        selectedDestinationName = row["selectedDestination_name"]

        if not selectedDestinationName:

            # Fall back to the id -> name dictionary built in the cell above
            destination = destinations.get(str(row["selectedDestination_id"]))

            if destination is not None:
                selectedDestinationName = destination['name']

            # No destination corresponds to this id
            else:
                # TODO: CONSIDER CHANGING THE NAME OF THIS TO SOMETHING ELSE
                selectedDestinationName = "DELETED"
                # And overwrite valid to false at this point because there is no destination
                valid = False

        # Add the data returned from the geocode() function and the polygon mapping
        entryFields.update({
            'selecteddestination_name': selectedDestinationName,
            'formatted_address': geocodeInfo['formatted_address'],
            'lat': lat,
            'lng': lng,
            'postalcode': geocodeInfo['postalCode'],
            'postalcodemapped': zipCodeMapped,
            'barriomapped': barrioMapped,
            'region': region,
        })

    entryFields['valid'] = valid
    databaseEntry = Itenerary(**entryFields)

    # Add the information to the database. On a failed commit the session is
    # rolled back before re-raising, so it is not left in an unusable state.
    session.add(databaseEntry)
    try:
        session.commit()
    except Exception:
        session.rollback()
        raise

Entry already inside database
Entry already inside database
Entry already inside database
Entry already inside database
Entry already inside database
Entry already inside database
Entry already inside database
Entry already inside database
Entry already inside database
45.5051064
-122.6750261
None
45.5051064
-122.6750261
None
49.2663039
-123.1853944
['V6R', 'canada']
49.2663039
-123.1853944
['V6R', 'canada']
49.18899769999999
-122.9314717
['V3M', 'canada']
47.6168523
-122.2971789
['98122', 'washington']
47.6168523
-122.2971789
['98122', 'washington']
49.2492111
-123.078986
['V5V', 'canada']
34.1184341
-118.3003935
['90027', 'california']
47.4866554
-121.710027
['98045', 'washington']
47.662966
-122.1849364
['98033', 'washington']
47.6062095
-122.3320708
['98164', 'washington']
34.1184341
-118.3003935
['90027', 'california']
34.0410008
-118.5625053
['90272', 'california']
34.1007001
-118.3325175
['90028', 'california']
49.2376328
-123.0348774
['V5R', 'canada']
34.0410008
-118.5625053
['

49.2626542
-123.0789474
['V5T', 'canada']
47.6206042
-122.1496937
['98007', 'washington']
47.6738153
-122.2586117
['98115', 'washington']
49.2604563
-123.179153
['V6R', 'canada']
49.2291616
-123.1561648
['V6M', 'canada']
49.2604563
-123.179153
['V6R', 'canada']
49.2604563
-123.179153
['V6R', 'canada']
49.3055289
-123.0301343
['V7J', 'canada']
47.6542104
-122.3275757
['98105', 'washington']
47.65642070000001
-122.3196439
['98105', 'washington']
49.15911089999999
-123.1347939
['V6Y', 'canada']
49.15911089999999
-123.1347939
['V6Y', 'canada']
49.2892771
-123.1293831
['V6G', 'canada']
49.2892771
-123.1293831
['V6G', 'canada']
49.2603674
-123.1790389
['V6R', 'canada']
47.6062095
-122.3320708
['98164', 'washington']
47.6153987
-122.3537968
['98121', 'washington']
47.6153987
-122.3537968
['98121', 'washington']
47.6153987
-122.3537968
['98121', 'washington']
49.2330134
-123.0596668
['V5P', 'canada']
49.2330134
-123.0596668
['V5P', 'canada']
49.2603674
-123.1790389
['V6R', 'canada']
47.6153987

49.1234665
-122.8474553
['V3W', 'canada']
49.2393357
-123.1904698
['V6N', 'canada']
49.2888248
-123.1111209
['V6C', 'canada']
49.033033
-122.304926
['V2S', 'canada']
47.6062095
-122.3320708
['98164', 'washington']
47.6156862
-122.333027
['98101', 'washington']
47.6384586
-122.3674079
['98119', 'washington']
47.6492873
-122.3068507
['98195', 'washington']
47.6492873
-122.3068507
['98195', 'washington']
47.4514588
-122.826946
['98528', 'washington']
47.4502499
-122.3088165
['98158', 'washington']
49.248321
-123.055908
['V5R', 'canada']
47.6648149
-122.3343506
['98103', 'washington']
48.68687389999999
-123.4727842
['V8L', 'canada']
49.3227331
-122.9512704
['V7G', 'canada']
49.283162
-123.1159226
['V6B', 'canada']
49.283162
-123.1159226
['V6B', 'canada']
47.6156035
-122.2977477
['98122', 'washington']
49.2393357
-123.1904698
['V6N', 'canada']
47.608753
-122.2965286
['98122', 'washington']
47.6133451
-122.3231128
['98122', 'washington']
47.602958
-122.336717
['98104', 'washington']
47.31638

49.2827291
-123.1207375
['V6C', 'canada']
49.2827291
-123.1207375
['V6C', 'canada']
49.2827291
-123.1207375
['V6C', 'canada']
49.2827291
-123.1207375
['V6C', 'canada']
49.2827291
-123.1207375
['V6C', 'canada']
49.2827291
-123.1207375
['V6C', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.2831758
-123.1032851
['V6A', 'canada']
49.1568044
-122.8202605
['V3S', 'canada']
49.1546114
-122.8172354
['V3S', 'canada']
49.1546114
-122.8172354
['V3S', 'canada']
49.1546114
-122.8172354
['V3S', 'canada']
49.1546114
-122.8172354
['V3S', 'canada']
49.2502266
-122.8958023
['V3J', 'canada']
49.2830972
-123.1175032
['V6C', 'canada']
49.2502266
-122.8958023
['V3J', 'canada']
49.23829200000001
-123.0776062
['V5W', 'canada']
47.518674
-122.3482444
['98

47.4502499
-122.3088165
['98158', 'washington']
47.6336231
-122.3451713
['98109', 'washington']
49.2852945
-123.119563
['V6C', 'canada']
49.2852945
-123.119563
['V6C', 'canada']
49.2852945
-123.119563
['V6C', 'canada']
49.2852945
-123.119563
['V6C', 'canada']
49.2852945
-123.119563
['V6C', 'canada']
49.2852945
-123.119563
['V6C', 'canada']
47.9789848
-122.2020795
['98201', 'washington']
47.9789848
-122.2020795
['98201', 'washington']
47.6247835
-122.3609193
['98119', 'washington']
37.558613
-97.27443160000001
None
47.70273299999999
-122.3629475
['98177', 'washington']
47.6062095
-122.3320708
['98164', 'washington']
47.6148089
-122.3151473
['98122', 'washington']
49.2852945
-123.119563
['V6C', 'canada']
45.5382504
-122.6651619
None
45.5382504
-122.6651619
None
49.2584339
-123.152261
['V6J', 'canada']
49.2584339
-123.152261
['V6J', 'canada']
47.48362059999999
-122.2476724
['98178', 'washington']
47.48362059999999
-122.2476724
['98178', 'washington']
49.28499009999999
-123.1270634
['V6E',

49.1934169
-122.853878
['V3T', 'canada']
49.1934169
-122.853878
['V3T', 'canada']
49.1934169
-122.853878
['V3T', 'canada']
49.1934169
-122.853878
['V3T', 'canada']
49.1934169
-122.853878
['V3T', 'canada']
47.4956579
-121.7867775
['98045', 'washington']
47.6084287
-122.340532
['98101', 'washington']
47.6084287
-122.340532
['98101', 'washington']
47.6084287
-122.340532
['98101', 'washington']
47.615498
-122.1950012
['98004', 'washington']
49.283181
-123.103995
['V6A', 'canada']
47.6143443
-122.3419422
['98121', 'washington']
47.6143443
-122.3419422
['98121', 'washington']
49.1709306
-123.1416697
['V6X', 'canada']
47.6143443
-122.3419422
['98121', 'washington']
49.1709306
-123.1416697
['V6X', 'canada']
49.3055289
-123.0301343
['V7J', 'canada']
49.1709306
-123.1416697
['V6X', 'canada']
49.1709306
-123.1416697
['V6X', 'canada']
49.264332
-122.9896241
['V5B', 'canada']
49.264332
-122.9896241
['V5B', 'canada']
49.264332
-122.9896241
['V5B', 'canada']
34.048569
-118.2528917
['90013', 'californ