# Import libraries, set options, connect to DB

In [37]:
# Configuration code for datawrangling
import pandas as pd
import os
import numpy as np
from datetime import datetime
from geocode import geocode
import mapToPoly
from mapToPoly import mapToPoly
pd.set_option('display.max_row', 30000)
import csv

# Configuration code in order to connect to the database
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from database_setup import Itenerary, Base

# Read the database password from the environment so it is never stored in the notebook
passWord = os.environ['my_password']

# Use the 'postgresql' dialect name: the legacy 'postgres' alias was removed in
# SQLAlchemy 1.4, where create_engine() fails to load a dialect for it.
# NOTE(review): the password is concatenated into the URL as-is; if it can contain
# characters such as '@', ':' or '/', it needs to be URL-escaped first.
DATABASE_URI = 'postgresql+psycopg2://maxcarey:' + passWord + '@localhost:5432/totago'
engine = create_engine(DATABASE_URI)

#engine = create_engine('sqlite:///totagoData.db')

# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance
Base.metadata.bind = engine

DBSession = sessionmaker(bind=engine)

# Single session shared by the cells that read from / write to the database
session = DBSession()

# Read in data as pandas data frame, selecting only certain fields

In [38]:
# Columns to keep when reading the raw itineraries CSV; every other column is ignored
fields = [
    'distinct_id',
    'numItinerariesReturned',
    'departureDate',
    'startFromLocation',
    'selectedDestination_id',
    'selectedDestination_name',
    'time',
]

In [39]:
# Load the raw export, restricted to the columns named in `fields`
df = pd.read_csv(
    'generated_itineraries.csv',
    usecols=fields,
)

  interactivity=interactivity, compiler=compiler, result=result)


# Wrangle field: destinationIDs

In [40]:
# Replace all of the NAs for destinationIDs with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that form is a chained
# in-place update, which newer pandas warns about and which leaves df untouched
# when copy-on-write is enabled.
df['selectedDestination_id'] = df['selectedDestination_id'].fillna(0)

# Remove the 2 cases where the string says null
# Great tutorial here: https://www.youtube.com/watch?v=2AFGPdNn4FM
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning
df = df[df['selectedDestination_id'] != 'null'].copy()

# Convert destinationIDs column to an integer value
df['selectedDestination_id'] = df['selectedDestination_id'].astype(int)



# Wrangle field: numItinerariesReturned

In [41]:
# Replace all of the NAs for numItinerariesReturned with 1.
# Assigned back explicitly rather than via fillna(inplace=True) on the
# attribute-accessed Series, which is a chained in-place update that newer
# pandas warns about and that has no effect under copy-on-write.
df['numItinerariesReturned'] = df['numItinerariesReturned'].fillna(1)

# Convert from float to integer
df['numItinerariesReturned'] = df['numItinerariesReturned'].astype(int)

# TODO: select only observations where this field is greater than 0 (now that
# the NAs are gone) -- no such filter is applied yet


# Wrangle Field: Destination Name

In [42]:
# Replace all of the NAs in selectedDestination_name with an empty string.
# Assigned back explicitly rather than via fillna(inplace=True) on the
# attribute-accessed Series, which is a chained in-place update that newer
# pandas warns about and that has no effect under copy-on-write.
df['selectedDestination_name'] = df['selectedDestination_name'].fillna("")

print("Number of rows before departure date: ")
print(len(df))


Number of rows before departure date: 
28138


# Wrangle Field: departureDate

In [43]:
# Cast departureDate to str so that every value can be compared as text.
# The column's dtype was reported as object, but that turned out not to
# guarantee that every value is a string.
df['departureDate'] = df['departureDate'].astype(str)

print("number of rows before removal of anamoulous departureDate cases")
print(len(df))

# Some rows have a departureDate that is blank, says nan, is in format 24503,
# or is the literal "[masked]". Build one keep-mask that rejects all four and
# drop those rows from the data frame.
# NOTE: some blank rows were still seen in the printed output.
departureText = df['departureDate']
keepRow = (
    (departureText != '')
    & (departureText != 'nan')
    & (departureText != '24503')
    & (departureText != '[masked]')
)
df = df[keepRow]

print("number of rows after removal of anamoulous departureDate cases")
print(len(df))

# Create a function extractDate that extracts the first ten characters of an input string
def extractDate(dateString):
    """Return the first ten characters of ``dateString``.

    When the input holds fewer than ten characters the (shorter) result is
    also printed, so that unexpectedly short values show up in the output.
    """
    leadingTen = dateString[:10]
    # Anything shorter than ten characters cannot be a full date; surface it
    if not len(leadingTen) >= 10:
        print(leadingTen)
    return leadingTen

''' Code to test if the extractDate function works

# Apply this function to create  a new column
df['departureDateFixed'] = df.departureDate.apply(extractDate)

cols = ['distinct_id', 'departureDate', 'departureDateFixed', 'numItinerariesReturned', 'selectedDestination_id', 'selectedDestination_name', 'startFromLocation']

df = df[cols]
'''

# Overwrite departureDate with the output of extractDate, i.e. keep only the
# first ten characters of every value and drop whatever follows them
df['departureDate'] = df['departureDate'].apply(extractDate)

# Converting departureDate into a datetime object is left disabled below;
# it might not actually need to be done.
# See here: https://stackoverflow.com/questions/26763344/convert-pandas-column-to-datetime
#df['departureDate'] = df.departureDate.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
#df['departureDate'] = datetime.strptime(df['departureDate'], '%Y-%m-%-d')
#df['departureDate'] = pd.to_datetime(df['departureDate'], format = '%Y-%m-%-d')



number of rows before removal of anamoulous departureDate cases
28138
number of rows after removal of anamoulous departureDate cases
28130


# Wrangle Field: distinctID

In [44]:
# It turns out distinct_id corresponds to a user.
# Therefore, create a column that combines distinct_id with the unix time
# stamp so that we have a primary key for the database.
timeAsText = df["time"].map(str)
df["primary_key"] = df["distinct_id"] + "-" + timeAsText

# Print every key that occurs more than once
vc = df["primary_key"].value_counts()
repeatedKeys = vc[vc > 1]
print(repeatedKeys)

unique_keys = df["primary_key"].unique()
#print(len(unique_keys))

#df.head(n = len(df))

156c0427d8411b-089cb3f5e0f3f-1c114a5c-13c680-156c0427d85c5-1472086519       3
16017f5f592536-094b655d7b8ce4-7636321b-4a640-16017f5f5932bd-1512207331      3
15ae497624281a-06e6a097b99f59-5e4f2b18-ff000-15ae4976243a16-1489889250      3
68f3348e-32ff-4756-a6b9-fbf722d5bf76-1460184823                             3
2578f3c5-2bd8-4d7e-b05e-d82eb4e24632-1448989343                             3
16693ed1-fd53-4577-8cbe-5ca19ff74b89-1509430935                             2
ab4ac825-e153-4b5e-8296-7edb8c32c6ab-1541125487                             2
9a942f21-ad25-4129-b760-46bcec9e631d-1448989392                             2
2b9795d4-912a-4c5b-b628-1ccd366c488f-1466266682                             2
f67face6-b99b-4ddc-8326-3dbe6998c01c-1490958347                             2
cb9dcbce-753b-4fe5-a91a-1a58a779263c-1494111569                             2
c7ca4b64-84f0-46dd-b8e1-00530a09f7d9-1443084797                             2
156f5faf7931f1-0554d455a-6a03173d-3d10d-156f5faf7943b9-147298066

# Create a subset of the data with the sample method to test geocode and database entry logic

In [45]:
# Output the entire data frame
#df.head(len(df))

len(df)

28130

In [46]:
# Create a random sample of the data frame; these entries will be added to the
# database in the next section.
# The requested size is capped at the number of available rows because
# DataFrame.sample (without replacement) raises a ValueError when asked for
# more rows than the frame holds.
SAMPLE_SIZE = 2000
sampleDf = df.sample(n=min(SAMPLE_SIZE, len(df)))

# Output this random sample
sampleDf.head(len(sampleDf))

Unnamed: 0,departureDate,distinct_id,numItinerariesReturned,selectedDestination_id,selectedDestination_name,startFromLocation,time,primary_key
361,2016-01-28,e33ea58e-8188-4bdd-9007-9ce54b602cb7,1,4,,My Location,1453976104,e33ea58e-8188-4bdd-9007-9ce54b602cb7-1453976104
6742,2016-10-21,157e5c479b71e7-0d3e27670400e9-a33316d-100200-1...,1,122,,"Walter Gage Residence, Student Union Boulevard...",1477003480,157e5c479b71e7-0d3e27670400e9-a33316d-100200-1...
12294,2017-06-20,15cc41b9c581-0ffbbe27f0d24a-24293c49-38400-15c...,1,136,Lighthouse Park,"Georgia Street, Vancouver, BC, Canada",1497913636,15cc41b9c581-0ffbbe27f0d24a-24293c49-38400-15c...
17837,2017-11-04,15f7e5aa75bbdc-0159a3fcb028f-3b3e5906-140000-1...,1,4,West Tiger No. 3,"Highline College, Des Moines, WA, United States",1509714636,15f7e5aa75bbdc-0159a3fcb028f-3b3e5906-140000-1...
9256,2017-03-27,15b0df83afb15-0c3fca4e19aa4d-6f091501-38400-15...,1,118,,"Coquitlam, BC, Canada",1490564977,15b0df83afb15-0c3fca4e19aa4d-6f091501-38400-15...
475,2016-03-04,4ad65471-47b5-49ff-a508-bae704625604,1,3,,My Location,1457094911,4ad65471-47b5-49ff-a508-bae704625604-1457094911
4684,2016-09-03,156c768dfa8583-0902a522a3848e-7d2d6750-100200-...,1,130,,"Vancouver, BC V6G 1K3, Canada",1472199715,156c768dfa8583-0902a522a3848e-7d2d6750-100200-...
11115,2017-05-24,15c38df4e4c53-0198fc3f6fa73d-5d79342b-c0000-15...,1,141,,"199 Drake Street, Vancouver, BC, Canada",1495578981,15c38df4e4c53-0198fc3f6fa73d-5d79342b-c0000-15...
23890,2018-07-18,16480f1d2102ee-05da1bf05fabc4-16396952-fa000-1...,1,117,Brothers Creek Loop,"3984 Lillooet Street, Vancouver, BC, Canada",1531146529,16480f1d2102ee-05da1bf05fabc4-16396952-fa000-1...
1380,2016-06-22,1557a22d1631f5-0eef8db5f07166-37607b02-ca800-1...,1,124,,"Carvolth Exchange, Langley, BC, Canada",1466607918,1557a22d1631f5-0eef8db5f07166-37607b02-ca800-1...


## Read in the destination data to allow the possibility to pull the correct names

 


In [47]:
# Lookup table: destination id (first CSV column, kept as a string) ->
# {'name': <second CSV column>}
destinations = {}

# Open the file in a with-block so the handle is closed even if parsing fails.
# newline='' is what the csv module recommends for files handed to csv.reader.
with open("destinations_mapping_Jul-30-18.csv", newline='') as f:
    reader = csv.reader(f)

    # Skip the first row, which is the header in the csv file
    # (the None default avoids a StopIteration on an empty file)
    next(reader, None)
    for row in reader:
        # csv.reader yields short/empty lists for blank or truncated lines;
        # skip them rather than failing on the missing columns
        if len(row) < 2:
            continue
        destinations[row[0]] = {'name': row[1]}

print(destinations)

{'168': {'name': 'Mammoth Pass - Crater Meadow Trail'}, '178': {'name': 'Sport Climbing at Horseshoe Slabs'}, '181': {'name': 'Hazel Wolf Wetlands Loop'}, '183': {'name': 'Redwood Grove Loop Trail'}, '184': {'name': 'Birdwatching at Fowlsheugh Reserve'}, '12': {'name': 'Point Defiance Loop (OLD)'}, '112': {'name': 'Baden Powell Lynn Canyon to Grouse'}, '175': {'name': 'Rainbow Falls Trail'}, '174': {'name': 'Walking the Town Loop to Sherwins Vista'}, '20': {'name': 'The Lions Binkert Trail'}, '27': {'name': 'Tumamoc Hill'}, '8': {'name': 'Proximity Alert Test'}, '169': {'name': 'McLeod Lake Spur'}, '165': {'name': 'Siskiyou Mountain (White Rabbit Trail)'}, '22': {'name': 'Lake Blanca'}, '11': {'name': 'Carkeek Park'}, '162': {'name': 'Rouge Park Traverse'}, '172': {'name': 'Walking the Lakes Basin Path at Twin Lakes'}, '16': {'name': 'Golden Gardens to Carkeek Park Beach Walk'}, '111': {'name': 'Baden Powell Deep Cove to Lynn Canyon'}, '278': {'name': 'Tujunga Wash Path'}, '13': {'name

# Loop through the rows in the dataframe, geocode, add entry to database

In [48]:
# Loop through the sampled pandas data frame. For every row whose primary key is
# not yet in the database: geocode the start location, map the coordinates to
# the postal / barrio polygons, fill in a missing destination name from the
# `destinations` lookup, and store one Itenerary record.
for index, row in sampleDf.iterrows():

    # Pull out the primary key into a variable
    testKey = row["primary_key"]

    # Check to see if that key is already in the database
    # See this post: https://stackoverflow.com/questions/6587879/how-to-elegantly-check-the-existence-of-an-object-instance-variable-and-simultan?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
    entryExists = session.query(exists().where(Itenerary.distinctkey == testKey)).scalar()

    # Nothing to do for rows that were stored by an earlier run
    if entryExists:
        print("Entry already inside database")
        continue

    # Fields taken straight from the data frame; they are stored whether or not
    # geocoding succeeds
    entryFields = {
        'distinctkey': row["primary_key"],
        'numberitinerariesreturned': row["numItinerariesReturned"],
        'selecteddestination_id': row["selectedDestination_id"],
        'selecteddestination_name': row["selectedDestination_name"],
        'startfromlocation': row["startFromLocation"],
        'departuredate': row["departureDate"],
    }

    # Get the string to be geocoded
    locationToGeocode = row["startFromLocation"]

    # Try to run the geocode function that returns a dictionary of information.
    # Any ordinary failure marks the row as invalid instead of stopping the
    # loop; `except Exception` (rather than a bare except) still lets
    # KeyboardInterrupt / SystemExit through so the cell can be interrupted.
    try:
        geocodeInfo = geocode(locationToGeocode)
        valid = True
    except Exception:
        valid = False

    # If geocoding worked, extend the entry with the returned geocode information
    if valid:

        # Sometimes, such as when a generic city is sent to the geocode()
        # function, a geometric center is returned; this means there is no
        # postal code. In this case, set the postalCode to "none".
        if 'postalCode' not in geocodeInfo:
            geocodeInfo['postalCode'] = "none"

        # Map the gps coordinates returned to the zip code and barrio polygons
        zipCodeMapped = mapToPoly(geocodeInfo['lat'], geocodeInfo['lng'], 'postal')
        barrioMapped = mapToPoly(geocodeInfo['lat'], geocodeInfo['lng'], 'barrio')

        # Start from the destination name recorded in the data frame
        selectedDestinationName = row["selectedDestination_name"]

        # An empty name is looked up by destination id in the dictionary that
        # was created in the cell above
        if not selectedDestinationName:
            key = str(row["selectedDestination_id"])

            if key in destinations:
                selectedDestinationName = destinations[key]['name']
            else:
                # No destination corresponds to this id
                # TODO: CONSIDER CHANGING THE NAME OF THIS TO SOMETHING ELSE
                selectedDestinationName = "DELETED"
                # And overwrite valid to false at this point because there is
                # no destination (the geocode data is still stored below)
                valid = False

        entryFields.update(
            selecteddestination_name=selectedDestinationName,
            # Data from the python dictionary returned from the geocode() function
            formatted_address=geocodeInfo['formatted_address'],
            lat=geocodeInfo['lat'],
            lng=geocodeInfo['lng'],
            postalcode=geocodeInfo['postalCode'],
            postalcodemapped=zipCodeMapped,
            barriomapped=barrioMapped,
        )

    databaseEntry = Itenerary(valid=valid, **entryFields)

    # Add the information to the database. Roll back on failure so the shared
    # session stays usable when the cell is re-run, then re-raise the error.
    session.add(databaseEntry)
    try:
        session.commit()
    except Exception:
        session.rollback()
        raise

V6T
V6E
CBD
98198
762832
V3B
V6G
WE
V6Z
CBD
V5R
RC
V2Y
V6A
CBD
V6C
CBD
V6T
V7P
V6C
CBD
V6Y
V5G
V6A
CBD
98033
274599
98105
272001
98117
250788
V5E
V6E
WE
90005
268236
V6B
CBD
V6J
SHAU
V6J
SHAU
V5A
V6B
CBD
98122
250206
V7P
98105
272001
98121
343995
90027
115295
V6B
CBD
V6C
CBD
V5L
GW
90028
32059
V6C
CBD
V5V
RP
V6M
KERR
V6C
CBD
V5R
RC
V5N
GW
V3H
V7M
V3L
V5Z
SC
90046
763065
V6Y
90292
21056
V5R
RC
V7M
V5N
KC
V6S
DS
98104
271893
V5N
KC
V3T
V6E
WE
98121
343995
V6E
WE
V6C
CBD
V6H
FAIR
98004
271850
V5E
V4K
V3T
V6A
CBD
V6C
CBD
V3T
V5W
SUN
V5T
MP
V6C
CBD
V5Z
FAIR
V5A
V6B
CBD
V5C
V6K
KITS
V5M
RC
90028
32059
V6B
CBD
V5C
V6K
KITS
98115
251709
V3M
V5K
HS
V3C
98102
250206
V7L
V5R
RC
V6C
CBD
V3C
V7M
98105
252248
V7T
V5T
MP
V6E
WE
V5R
RC
V5R
RC
V5C
V6C
CBD
V6J
SHAU
V6A
CBD
V5N
GW
V5L
GW
V6J
KITS
98101
271869
V5V
RP
V5R
RC
92101
273349
98164
271849
V6B
CBD
98104
271849
V6P
MARP
90015
268118
V5E
V6B
CBD
V6G
WE
V5N
GW
V6B
CBD
V5Z
SC
V6C
CBD
V3M
V5Y
MP
V5R
RC
V6G
WE
V6B
CBD
98164
271849
98102
250206
V7M
981

98102
250206
V6C
CBD
98109
272018
98105
272001
V5A
V6E
WE
V6C
CBD
V6E
WE
V6C
CBD
V6P
MARP
98104
271849
V6B
CBD
V5Z
SC
V6T
94558
763797
V6X
V6T
V6G
WE
94704
762301
V7Y
CBD
V6J
KITS
V5R
RC
V7A
V6B
CBD
V6B
CBD
V6B
CBD
V6C
CBD
V7S
V6Y
98102
250206
98105
272001
V5E
V3H
V6E
WE
V6T
98101
343995
V6J
SHAU
V6Z
CBD
V6C
CBD
V6E
WE
V6S
DS
V6C
CBD
V6W
V5N
KC
V6P
KERR
V6C
CBD
V6P
OAK
V6C
CBD
98102
271857
V7S
V3M
V6B
CBD
98164
271849
98074
98119
272018
V6P
MARP
V6P
MARP
V6C
CBD
V6B
CBD
98109
272022
V6G
WE
V6G
WE
V6E
CBD
V6R
WPG
V6C
CBD
V6X
V6P
MARP
V5R
RC
92037
46087
V6R
KITS
V5V
KC
V6B
CBD
V6E
CBD
V6Y
V7M
V6K
KITS
V3M
10007
270951
V3A
93546
V5Y
OAK
V6B
CBD
V4A
90005
268236
V6N
DS
V3L
V6C
CBD
V6Z
CBD
V5H
V6E
WE
V3T
V5R
RC
V6P
MARP
98107
250017
94541
98164
271849
V5Z
OAK
V6Y
V6T
V5R
RC
14519
V6K
KITS
V5C
V3J
12401
V6G
WE
V6G
WE
V6S
V6B
CBD
V6C
CBD
V6E
WE
V6T
98027
762908
98164
271849
V5Y
MP
98105
272001
V5L
GW
V3B
V6T
V3V
V6E
CBD
V4C
V6Y
V6B
CBD
V6T
V6E
WE
V6H
FAIR
V5Y
OAK
V5Z
OAK
91750
191484
V6B
CBD
