# Import libraries, set options, connect to DB

In [1]:
# Configuration code for datawrangling
import pandas as pd
import os
import numpy as np
from datetime import datetime
from geocode import geocode
import mapToPoly
from mapToPoly import mapToPoly
pd.set_option('display.max_row', 30000)
import csv

# Configuration code in order to connect to the database
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from database_setup import Itenerary, Base

# Read the database password from the environment so it never appears in the notebook.
passWord = os.environ['my_password']

# Use the "postgresql" dialect name: the "postgres" alias was deprecated and
# then removed in SQLAlchemy 1.4, where create_engine() can no longer load it.
# NOTE(review): the password is concatenated into the URL as-is; if it can
# contain characters such as '@', ':' or '/', it must be URL-escaped -- confirm.
DATABASE_URI = 'postgresql+psycopg2://maxcarey:' + passWord + '@localhost:5432/totago'
engine = create_engine(DATABASE_URI)

#engine = create_engine('sqlite:///totagoData.db')

# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance
Base.metadata.bind = engine

# Session factory bound to the engine above
DBSession = sessionmaker(bind=engine)

# Session used by the database cells further down the notebook
session = DBSession()

  """)


True
True


# Read in data as pandas data frame, selecting only certain fields

In [2]:
# Columns of the itinerary export that this notebook works with.
fields = [
    'distinct_id',
    'numItinerariesReturned',
    'departureDate',
    'startFromLocation',
    'selectedDestination_id',
    'selectedDestination_name',
    'time',
]

In [3]:
# Load the itinerary export, keeping only the columns named in `fields`.
df = pd.read_csv(
    'generated_itineraries.csv',
    usecols=fields,
)

  interactivity=interactivity, compiler=compiler, result=result)


# Wrangle field: destinationIDs

In [4]:
# Replace all of the NAs for destinationIDs with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df.selectedDestination_id`: that chained
# in-place form triggers a warning in recent pandas and leaves `df` unchanged
# when Copy-on-Write is enabled.
df['selectedDestination_id'] = df['selectedDestination_id'].fillna(0)

# Remove the cases where the value is the literal string 'null'
# Great tutorial here: https://www.youtube.com/watch?v=2AFGPdNn4FM
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise SettingWithCopyWarning.
df = df[df['selectedDestination_id'] != 'null'].copy()

# Convert destinationIDs column to an integer value
df['selectedDestination_id'] = df['selectedDestination_id'].astype(int)



# Wrangle field: numItinerariesReturned

In [5]:
# Replace all of the NAs for numItinerariesReturned with 1.
# Assigned back to the column rather than using a chained
# fillna(..., inplace=True), which triggers a warning in recent pandas and
# leaves `df` unchanged when Copy-on-Write is enabled.
df['numItinerariesReturned'] = df['numItinerariesReturned'].fillna(1)

# Convert from float to integer
df['numItinerariesReturned'] = df['numItinerariesReturned'].astype(int)

# NOTE(review): a filter keeping only the observations where this field is
# greater than 0 was planned here but is not implemented -- confirm whether it
# is still needed.


# Wrangle Field: Destination Name

In [6]:
# Replace all of the NAs in selectedDestination_name with an empty string.
# Assigned back to the column rather than using a chained
# fillna(..., inplace=True), which triggers a warning in recent pandas and
# leaves `df` unchanged when Copy-on-Write is enabled.
df['selectedDestination_name'] = df['selectedDestination_name'].fillna("")

# Row count before the departureDate clean-up, for comparison with the later counts
print("Number of rows before departure date: ")
print(len(df))


Number of rows before departure date: 
28138


# Wrangle Field: departureDate

In [7]:
# Cast every departureDate value to text. The column is reported as dtype
# object, which does not guarantee that each value is actually a str, so the
# string comparisons below need this explicit cast first.
df['departureDate'] = df['departureDate'].astype(str)

print("number of rows before removal of anamoulous departureDate cases")
print(len(df))

# Some rows carry no usable date: the value is blank, 'nan', the bare number
# 24503, or the placeholder '[masked]'. Keep only the rows whose value is none
# of these.
df = df[
    (df['departureDate'] != '')
    & (df['departureDate'] != 'nan')
    & (df['departureDate'] != '24503')
    & (df['departureDate'] != '[masked]')
]

print("number of rows after removal of anamoulous departureDate cases")
print(len(df))

def extractDate(dateString):
    """Return the first ten characters of ``dateString``.

    When the input is shorter than ten characters it is returned unchanged,
    and the short value is also printed so that it shows up in the cell output.
    """
    head = dateString[:10]
    if len(head) >= 10:
        return head
    # Fewer than ten characters available: report the value, then return it as-is
    print(head)
    return head

# The string literal below is never used: it holds a disabled snippet that
# wrote the extracted dates into a separate departureDateFixed column so the
# result of extractDate could be compared against the raw value.
''' Code to test if the extractDate function works

# Apply this function to create  a new column
df['departureDateFixed'] = df.departureDate.apply(extractDate)

cols = ['distinct_id', 'departureDate', 'departureDateFixed', 'numItinerariesReturned', 'selectedDestination_id', 'selectedDestination_name', 'startFromLocation']

df = df[cols]
'''

# Overwrite departureDate with the value returned by extractDate, i.e. its
# first ten characters, dropping whatever follows them.
df['departureDate'] = df['departureDate'].apply(extractDate)

# Converting the column into datetime objects was tried but is left disabled;
# it may not be needed.
# See here: https://stackoverflow.com/questions/26763344/convert-pandas-column-to-datetime
#df['departureDate'] = df.departureDate.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
#df['departureDate'] = datetime.strptime(df['departureDate'], '%Y-%m-%-d')
#df['departureDate'] = pd.to_datetime(df['departureDate'], format = '%Y-%m-%-d')



number of rows before removal of anamoulous departureDate cases
28138
number of rows after removal of anamoulous departureDate cases
28130


# Wrangle Field: distinctID

In [8]:
# distinct_id turns out to correspond to a user, not to a single request,
# so it cannot be used as a database key on its own.

# Build a candidate primary key for the database by joining distinct_id and
# the unix time stamp with a hyphen.
df["primary_key"] = df["distinct_id"] + "-" + df["time"].map(str)

# Count the occurrences of each key and print the keys that appear more than once
vc = df["primary_key"].value_counts()
print(vc.loc[vc > 1])

# Distinct key values
unique_keys = df["primary_key"].unique()
#print(len(unique_keys))

#df.head(n = len(df))

68f3348e-32ff-4756-a6b9-fbf722d5bf76-1460184823                             3
2578f3c5-2bd8-4d7e-b05e-d82eb4e24632-1448989343                             3
156c0427d8411b-089cb3f5e0f3f-1c114a5c-13c680-156c0427d85c5-1472086519       3
15ae497624281a-06e6a097b99f59-5e4f2b18-ff000-15ae4976243a16-1489889250      3
16017f5f592536-094b655d7b8ce4-7636321b-4a640-16017f5f5932bd-1512207331      3
2578f3c5-2bd8-4d7e-b05e-d82eb4e24632-1448989361                             2
16693ed1-fd53-4577-8cbe-5ca19ff74b89-1509430935                             2
1657f9a517fcd8-0eec0990ac17ad-49183707-13c680-1657f9a5180e51-1535419115     2
2578f3c5-2bd8-4d7e-b05e-d82eb4e24632-1448989336                             2
2578f3c5-2bd8-4d7e-b05e-d82eb4e24632-1448989357                             2
6059a95c-9075-49c1-bfce-dc5420759ed3-1469199106                             2
9a942f21-ad25-4129-b760-46bcec9e631d-1448989377                             2
2578f3c5-2bd8-4d7e-b05e-d82eb4e24632-1448989332                 

# Create a subset of the data with the sample method to test the geocode and database entry logic

In [9]:
# Disabled: output the entire data frame
#df.head(len(df))

# Number of rows currently in the data frame
df.shape[0]

28130

In [10]:
# Create a random sample of the data frame; these entries will be added to the
# database in the next section.
# The size is capped at the number of available rows because DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling without
# replacement).
# NOTE(review): no random_state is set, so every run draws a different
# sample -- confirm that this is intended.
sampleDf = df.sample(min(2000, len(df)))

# Output this random sample
sampleDf.head(len(sampleDf))

Unnamed: 0,departureDate,distinct_id,numItinerariesReturned,selectedDestination_id,selectedDestination_name,startFromLocation,time,primary_key
23357,2018-06-27,164427cea4911-0bc23b8219960a-6c0d352f-13c680-1...,1,131,Grouse Grind,"557 Templeton Drive, Vancouver, BC, Canada",1530098661,164427cea4911-0bc23b8219960a-6c0d352f-13c680-1...
10344,2017-05-13,15bd0ab73792c5-06c8c0fc9f704f8-7d7d326e-3d10d-...,1,55,,"Kennedy Falls, North Vancouver, BC, Canada",1493829608,15bd0ab73792c5-06c8c0fc9f704f8-7d7d326e-3d10d-...
3818,2016-08-11,6b30679b-1076-45d2-8530-3cfc5e65345e,1,0,,My Location,1470877759,6b30679b-1076-45d2-8530-3cfc5e65345e-1470877759
11874,2017-06-09,15c8fa1154528b-0ada972baf01bd-30637509-13c680-...,1,140,,"1040 Pacific Street, Vancouver, BC, Canada",1497033269,15c8fa1154528b-0ada972baf01bd-30637509-13c680-...
5064,2016-09-01,156e7995d012e9-0d55cc6151dea48-4265008-144000-...,1,9,,"1000 4th Avenue, Seattle, WA, United States",1472739324,156e7995d012e9-0d55cc6151dea48-4265008-144000-...
1881,2016-07-02,155accc73471-00871e66c056de-716b6d-38400-155ac...,1,117,,"Nanaimo Station, Vancouver, BC, Canada",1467475228,155accc73471-00871e66c056de-716b6d-38400-155ac...
9356,2017-04-02,15b2d5e9c4712-04a17be661a38-361a515d-38400-15b...,1,146,,"3428 Wellington Avenue, Vancouver, BC, Canada",1491117263,15b2d5e9c4712-04a17be661a38-361a515d-38400-15b...
13544,2017-07-12,15d34e445741f0-02c529dd141576-3f63694b-fa000-1...,1,146,Quarry Rock,"930 Cambie Street, Vancouver, BC, Canada",1499805789,15d34e445741f0-02c529dd141576-3f63694b-fa000-1...
9343,2017-04-18,15b2c8ca9791b0-07138f9a39925-5b123312-100200-1...,1,110,,"455 West King Edward Avenue, Vancouver, BC, Ca...",1491075937,15b2c8ca9791b0-07138f9a39925-5b123312-100200-1...
14449,2017-07-27,116,1,426,Gateway Trail,"2222 54th Avenue Northeast, Seattle, WA, Unite...",1501150243,116-1501150243


## Read in the destination data so that missing destination names can be looked up by ID

 


In [11]:
# Lookup table: destination ID (first CSV column, kept as a string) ->
# {'name': destination name (second CSV column)}
destinations = {}

# The with-block closes the file even if parsing fails; newline='' is the
# mode the csv module documents for files handed to csv.reader.
with open("destinations_mapping_Jul-30-18.csv", newline='') as f:
    reader = csv.reader(f)

    # Skip the header row; the None default avoids StopIteration on an empty file
    next(reader, None)

    for row in reader:
        # Blank or truncated lines have no ID/name pair to record
        if len(row) < 2:
            continue
        destinations[row[0]] = {'name': row[1]}

print(destinations)

{'168': {'name': 'Mammoth Pass - Crater Meadow Trail'}, '178': {'name': 'Sport Climbing at Horseshoe Slabs'}, '181': {'name': 'Hazel Wolf Wetlands Loop'}, '183': {'name': 'Redwood Grove Loop Trail'}, '184': {'name': 'Birdwatching at Fowlsheugh Reserve'}, '12': {'name': 'Point Defiance Loop (OLD)'}, '112': {'name': 'Baden Powell Lynn Canyon to Grouse'}, '175': {'name': 'Rainbow Falls Trail'}, '174': {'name': 'Walking the Town Loop to Sherwins Vista'}, '20': {'name': 'The Lions Binkert Trail'}, '27': {'name': 'Tumamoc Hill'}, '8': {'name': 'Proximity Alert Test'}, '169': {'name': 'McLeod Lake Spur'}, '165': {'name': 'Siskiyou Mountain (White Rabbit Trail)'}, '22': {'name': 'Lake Blanca'}, '11': {'name': 'Carkeek Park'}, '162': {'name': 'Rouge Park Traverse'}, '172': {'name': 'Walking the Lakes Basin Path at Twin Lakes'}, '16': {'name': 'Golden Gardens to Carkeek Park Beach Walk'}, '111': {'name': 'Baden Powell Deep Cove to Lynn Canyon'}, '278': {'name': 'Tujunga Wash Path'}, '13': {'name

# Loop through the rows in the dataframe, geocode, add entry to database

In [None]:
# Loop through the sampled data frame. Every row whose primary key is not yet
# in the database is geocoded and stored as an Itenerary entry; rows that are
# already present are reported and skipped.
for index, row in sampleDf.iterrows():

    # Pull out the primary key into a variable
    testKey = row["primary_key"]

    # Check to see if that key is already in the database
    # See this post: https://stackoverflow.com/questions/6587879/how-to-elegantly-check-the-existence-of-an-object-instance-variable-and-simultan?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
    entryExists = session.query(exists().where(Itenerary.distinctkey == testKey)).scalar()

    if entryExists:
        print("Entry already inside database")
        continue

    # Get the string to be geocoded
    locationToGeocode = row["startFromLocation"]

    # Try to run the geocode function that returns a dictionary of information.
    # Only Exception is caught: a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit, so interrupting the kernel during a lookup
    # would store the row as invalid and carry on instead of stopping.
    try:
        geocodeInfo = geocode(locationToGeocode)
        # If geocoding works, set valid to true
        valid = True
    except Exception:
        # If the geocode function doesn't work, set valid to false
        valid = False

    # If valid is true, create a database entry with information from the
    # data frame and the returned geocode information
    if valid:

        # Sometimes, such as when a generic city is sent to the geocode()
        # function, a geometric center is returned and there is no postal code.
        # In this case the postal code is stored as the string "none".
        if 'postalCode' not in geocodeInfo:
            geocodeInfo['postalCode'] = "none"

        # Map the GPS coordinates returned to the zip code polygons
        # (presumably what the 'postal' / 'barrio' arguments select -- confirm
        # against mapToPoly)
        zipCodeMapped = mapToPoly(geocodeInfo['lat'], geocodeInfo['lng'], 'postal')

        barrioMapped = mapToPoly(geocodeInfo['lat'], geocodeInfo['lng'], 'barrio')

        ## Get selected destination name
        selectedDestinationName = row["selectedDestination_name"]

        # An empty name is filled in from the destinations dictionary, keyed by
        # the destination ID as a string
        if not selectedDestinationName:

            key = str(row["selectedDestination_id"])

            if key in destinations:
                selectedDestinationName = destinations[key]['name']

            # In the case that no destination corresponds to the ID, mark it
            else:
                # TODO: CONSIDER CHANGING THE NAME OF THIS TO SOMETHING ELSE
                selectedDestinationName = "DELETED"
                # And overwrite valid to false at this point because there is no destination
                valid = False

        databaseEntry = Itenerary(distinctkey=row["primary_key"],
                                  numberitinerariesreturned=row["numItinerariesReturned"],
                                  selecteddestination_id=row["selectedDestination_id"],
                                  selecteddestination_name=selectedDestinationName,
                                  startfromlocation=row["startFromLocation"],
                                  departuredate=row["departureDate"],
                                  # Get data from python dictionary returned from geocode() function
                                  formatted_address=geocodeInfo['formatted_address'],
                                  lat=geocodeInfo['lat'],
                                  lng=geocodeInfo['lng'],
                                  postalcode=geocodeInfo['postalCode'],
                                  postalcodemapped=zipCodeMapped,
                                  barriomapped=barrioMapped,
                                  valid=valid)

    # If valid is false, just fill in the information that we have from the pandas data frame
    else:
        databaseEntry = Itenerary(distinctkey=row["primary_key"],
                                  numberitinerariesreturned=row["numItinerariesReturned"],
                                  selecteddestination_id=row["selectedDestination_id"],
                                  selecteddestination_name=row["selectedDestination_name"],
                                  startfromlocation=row["startFromLocation"],
                                  departuredate=row["departureDate"],
                                  valid=valid)

    # Add the information to the database. Committing per row keeps the rows
    # stored so far if a later row fails. On a failed commit the session is
    # rolled back so it stays usable, and the error is re-raised.
    session.add(databaseEntry)
    try:
        session.commit()
    except Exception:
        session.rollback()
        raise

V5L
GW
V7R
V6E
WE
98104
271849
V5R
RC
Entry already inside database
V6B
CBD
V5Y
RP
98105
251100
99352
V5L
GW
V3B
V6A
CBD
Entry already inside database
V6T
V5K
HS
98027
98102
250206
V6R
KITS
V6K
KITS
V5N
KC
V5L
GW
V3M
90036
274517
V5K
HS
V6B
CBD
Entry already inside database
V6K
KITS
98372
98103
250788
V5Y
MP
V6E
CBD
V6B
CBD
V6C
CBD
V6B
CBD
V6H
FAIR
V5H
92840
V5V
KC
V6C
CBD
V5B
98052
271083
V5Y
MP
V5A
V8B
V5R
RC
Entry already inside database
V6R
KITS
V4E
V0N
V6B
CBD
V6B
CBD
V7Y
CBD
V5A
V6T
98164
271849
V3J
V5W
RP
V6T
98109
272022
V6T
V6T
V3J
V7E
90027
115295
V6G
WE
V6B
CBD
V3C
V5A
Entry already inside database
V5H
V6B
CBD
98105
272001
V9R
V3J
V3M
V6T
Entry already inside database
V6C
CBD
V5H
V7J
91731
762605
90024
118920
98104
271893
91030
V5N
GW
V5Z
SC
V6E
CBD
Entry already inside database
V7M
V7R
98106
344029
V7M
V3T
V7H
V6T
Entry already inside database
V6E
CBD
V6J
KITS
V6A
STR
V6B
CBD
V6T
V5R
RC
V6B
CBD
V6E
WE
V5N
KC
98109
272022
90503
V6G
WE
V6T
V5Y
RP
V6B
CBD
90401
763776
V7M
9001