### The goal of this notebook is to provide a basic example on how to access the USDOT Intelligest Transportation System (ITS) Joint Program Office (JPO) Wyoming Connected Vehcile Pilot data throught the Data Program Sandbox.

Objective of the notebook
- Show how to access WY pilot data on the sandbox
- Conduct basic analysis with the data

----

### Load key libraries for the work 
**You will need install these library before running this notebook**
- boto3: AWS python library (http://boto3.readthedocs.io/en/latest/)
- pandas: Python Data Analysis Library (http://pandas.pydata.org)

In [1]:
import boto3
import pandas as pd

### Session variable for accessing S3

In [2]:
session = boto3.Session(profile_name='public')  #note provide is not need
s3 = session.resource('s3')
client = session.client('s3')

### List sub folders to determine which dates and times are avaliable

In [3]:
result =  client.list_objects(Bucket = 'usdot-its-cvpilot-public-data', Delimiter='/', Prefix='wydot/BSM/20170814')
for o in result.get('CommonPrefixes'):
        print ('subfolder : ', o.get('Prefix'))

subfolder :  wydot/BSM/20170814T194035Z/
subfolder :  wydot/BSM/20170814T194036Z/
subfolder :  wydot/BSM/20170814T194037Z/
subfolder :  wydot/BSM/20170814T194038Z/
subfolder :  wydot/BSM/20170814T194039Z/
subfolder :  wydot/BSM/20170814T194040Z/
subfolder :  wydot/BSM/20170814T194041Z/
subfolder :  wydot/BSM/20170814T194042Z/
subfolder :  wydot/BSM/20170814T194043Z/
subfolder :  wydot/BSM/20170814T194044Z/
subfolder :  wydot/BSM/20170814T194045Z/
subfolder :  wydot/BSM/20170814T194046Z/
subfolder :  wydot/BSM/20170814T194047Z/
subfolder :  wydot/BSM/20170814T194048Z/
subfolder :  wydot/BSM/20170814T194049Z/
subfolder :  wydot/BSM/20170814T194050Z/
subfolder :  wydot/BSM/20170814T194051Z/
subfolder :  wydot/BSM/20170814T194052Z/
subfolder :  wydot/BSM/20170814T194053Z/
subfolder :  wydot/BSM/20170814T194054Z/
subfolder :  wydot/BSM/20170814T194055Z/
subfolder :  wydot/BSM/20170814T194056Z/
subfolder :  wydot/BSM/20170814T194057Z/
subfolder :  wydot/BSM/20170814T194058Z/
subfolder :  wyd

### Function for pulling file keys from the ASW S3 bucket

In [4]:
def dir_keys(client, bucket, prefix='', filekeys=[]):
    """
    Lists all file keys from a given prefix in an S3 bucket.  If no prefix is given all file keys are returned

    :param client: S3 connection object
    :param bucket: Name of bucket to search
    :param prefix: Prefix for a given folder
    :param filekeys: list for filekeys
    :return: updated filekey list with added files from search
    """
    paginator = client.get_paginator('list_objects_v2')
    for result in paginator.paginate(Bucket=bucket, Delimiter='/', Prefix=prefix):
        if result.get('Contents') is not None:
            for file in result.get('Contents'):
                if file.get('Key') != 'unknownDataType':
                    filekeys.append(file.get('Key'))
        if result.get('CommonPrefixes') is not None:
            for subdir in result.get('CommonPrefixes'):
                dir_keys(client, bucket, subdir.get('Prefix'), filekeys)
    return filekeys


### Pull all file key for Aug 14 2017 at 19:43 and determine the number of files

In [5]:
filekeys = dir_keys(client, 'usdot-its-cvpilot-public-data', 'wydot/BSM/20170814T1943')
print('Total number of files:', str(len(filekeys)))

Total number of files: 114


### Download files to locate Directory to do analysis on

In [6]:
import os
cwd = os.getcwd()
local_directory = cwd + os.sep + 'tmp' + os.sep
if not os.path.exists(local_directory):
    os.makedirs(local_directory)
print('Directory Created: {}'.format(local_directory))

Directory Created: /Users/m28050/Projects/Current/RDE Enhancement/Tasks/Task4_New_System/Sandbox/notebooks/tmp/


### Download files to local folder

In [7]:
for file in filekeys:
    client.download_file('usdot-its-cvpilot-public-data', file, local_directory + file.split('/')[-1])
print('{} Files loaded to {}'.format(str(len(os.listdir(local_directory))), local_directory))

115 Files loaded to /Users/m28050/Projects/Current/RDE Enhancement/Tasks/Task4_New_System/Sandbox/notebooks/tmp/


### Look at one of the JSON files

In [8]:
!cat /tmp/wydot-filtered-bsm-1501782536777.json

{"coreData":{"msgCnt":109,"id":"D5960000","secMark":18000,"position":{"latitude":41.2970116,"longitude":-105.5945026,"elevation":2194.5},"accelSet":{"accelYaw":0.00},"accuracy":{},"speed":19.04,"heading":351.4500,"brakes":{"wheelBrakes":{"leftFront":false,"rightFront":false,"unavailable":true,"leftRear":false,"rightRear":false},"traction":"unavailable","abs":"unavailable","scs":"unavailable","brakeBoost":"unavailable","auxBrakes":"unavailable"},"size":{}},"partII":[{"id":"VEHICLESAFETYEXT","value":{"pathHistory":{"crumbData":[{"elevationOffset":-2.7,"latOffset":0.0014126,"lonOffset":-0.0002730,"timeOffset":7.80},{"elevationOffset":-11.1,"latOffset":0.0032465,"lonOffset":-0.0006066,"timeOffset":16.70}]},"pathPrediction":{"confidence":0.0,"radiusOfCurve":0.0}}}]}

### Combine the JSON files into a single JSON file

In [9]:
import glob

read_files = glob.glob(local_directory + "*.json")
with open(local_directory + "merged_file.json", "w") as outfile:
    data = []
    for f in read_files:
        data.append(open(f, "r").read())
    outfile.write("[" + ','.join(data[1:]) + "]")

### Load file into data structure and remove partII tag as that information will not be used in this analysis
*Data is in J2735 format http://standards.sae.org/j2735_201603/*

In [10]:
from pandas.io.json import json_normalize
import json

file_json = json.load(open(local_directory + "merged_file.json","r"))

for element in file_json: 
    del element['partII'] 

result = json_normalize(file_json)
result.head()

Unnamed: 0,coreData.accelSet.accelLong,coreData.accelSet.accelYaw,coreData.brakes.abs,coreData.brakes.auxBrakes,coreData.brakes.brakeBoost,coreData.brakes.scs,coreData.brakes.traction,coreData.brakes.wheelBrakes.leftFront,coreData.brakes.wheelBrakes.leftRear,coreData.brakes.wheelBrakes.rightFront,coreData.brakes.wheelBrakes.rightRear,coreData.brakes.wheelBrakes.unavailable,coreData.heading,coreData.id,coreData.msgCnt,coreData.position.elevation,coreData.position.latitude,coreData.position.longitude,coreData.secMark,coreData.speed
0,,0.0,unavailable,unavailable,unavailable,unavailable,unavailable,False,False,False,False,True,289.725,16C50000,85,2188.9,41.300245,-105.604298,37600,27.24
1,-0.03,0.05,unavailable,unavailable,unavailable,unavailable,unavailable,False,False,False,False,True,289.7,16C50000,86,2188.9,41.300253,-105.604329,37700,27.24
2,0.01,0.15,unavailable,unavailable,unavailable,unavailable,unavailable,False,False,False,False,True,289.775,16C50000,90,2188.8,41.300286,-105.604451,38100,27.26
3,,0.0,unavailable,unavailable,unavailable,unavailable,unavailable,False,False,False,False,True,289.85,16C50000,93,2188.8,41.300311,-105.604543,38400,27.28
4,0.15,0.02,unavailable,unavailable,unavailable,unavailable,unavailable,False,False,False,False,True,289.7875,16C50000,94,2188.7,41.300319,-105.604574,38500,27.32


### Do some basic analysis on the speed

In [11]:
result['coreData.speed'].describe()

count    114.000000
mean      26.838772
std        4.330284
min       21.060000
25%       22.895000
50%       26.130000
75%       29.910000
max       34.540000
Name: coreData.speed, dtype: float64