<a href="https://colab.research.google.com/github/swilsonmfc/computational/blob/master/DataStructuring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Structuring
* Parsing & processing data for machine learning
  * Needs can move past pandas
  * Nested structures
  * Real-time systems
* Options - Lots of structures to choose from
  * Dictionaries
  * Tuples, Named Tuples
  * Classes
  * Data Classes
  * Attrs
* Evaluation Concerns
  * Type Safety  - Can we enforce numeric / string
  * Readability  - Degree to which we can understand the code
  * Immutable    - Can we guard against accidental changes
  * Construction - Is it easy to build our structure

# Setup

In [1]:
pip install attrs



In [2]:
import sys
from pprint import pprint as pp
from collections import namedtuple

from dataclasses import dataclass
from dataclasses import field
from typing import List

import attr

# Trip Model
* A trip has flights, hotels and car rentals
* Flight
  * Departure Airport, Date, Time
  * Arrival Airport, Date, Time
  * Airline, Flight Number
* Hotel
  * Property
  * Room Type
  * Checkin, Checkout
* Car
  * Company
  * Location
  * Car type
  * Pickup, Dropoff

# Tuples
* Tuples offer a way of grouping data together
* Tuples are immutable
* They lack type safety
* They lack readability

In [3]:
# Plain tuples: the meaning of each field lives only in its position.
flight = (
  'PDX',               # departure airport
  'LAX',               # arrival airport
  '2020-10-01 12:00',  # departs
  '2020-10-01 15:00',  # arrives
  'AS',                # airline
  '231',               # flight number
)
hotel = (
  'Marriott',    # property
  'King',        # room type
  '2020-10-01',  # check-in
  '2020-10-05',  # check-out
)
trip = [flight, hotel]

In [4]:
# Pretty-print the list of tuples; the fields are identified only by position.
pp(trip)

[('PDX', 'LAX', '2020-10-01 12:00', '2020-10-01 15:00', 'AS', '231'),
 ('Marriott', 'King', '2020-10-01', '2020-10-05')]


In [5]:
# Add a car

# Dictionary
* Advantages
  * Allow for flexible modeling
  * Easy to capture new / different requirements
  * Key - Value pairs add to expressiveness
* Disadvantages
  * Construction defines structure
  * Doesn't offer type safety
  * Mutable data type
* Fine for prototyping
* Avoid when complexity increases

In [6]:
# One flight as a nested dictionary: each endpoint is its own sub-dictionary.
flight = {
  'Departure': {'Airport': 'PDX', 'Departs': '2020-10-01 12:00'},
  'Arrival': {'Airport': 'LAX', 'Arrives': '2020-10-01 15:00'},
  'Airline': 'AS',
  'FlightNumber': '231',
}

# One hotel stay as a flat dictionary.
hotel = {
  'Property': 'Marriott',
  'Room': 'King',
  'Checkin': '2020-10-01',
  'Checkout': '2020-10-05',
}

# The trip groups its parts by kind; no car has been added yet.
trip = {
  'Flights': [flight],
  'Hotels': [hotel],
  'Cars': [],
}

In [7]:
# Pretty-print the nested dictionary. pprint sorts dictionary keys by default,
# so the keys are displayed alphabetically rather than in insertion order.
pp(trip)

{'Cars': [],
 'Flights': [{'Airline': 'AS',
              'Arrival': {'Airport': 'LAX', 'Arrives': '2020-10-01 15:00'},
              'Departure': {'Airport': 'PDX', 'Departs': '2020-10-01 12:00'},
              'FlightNumber': '231'}],
 'Hotels': [{'Checkin': '2020-10-01',
             'Checkout': '2020-10-05',
             'Property': 'Marriott',
             'Room': 'King'}]}


In [8]:
# Add a car

# Named Tuples
* Named tuples allow us to merge readability with immutability
* They're more friendly for construction

In [9]:
# Named-tuple types for the trip model; field names are listed explicitly.
Departure = namedtuple('Departure', ['Airport', 'Departs'])
Arrival = namedtuple('Arrival', ['Airport', 'Arrives'])
Flight = namedtuple(
  'Flight',
  ['Departure', 'Arrival', 'Airline', 'FlightNumber'],
)
Hotel = namedtuple(
  'Hotel',
  ['Property', 'Room', 'Checkin', 'Checkout'],
)

In [10]:
# Keyword construction makes each field's role visible at the call site.
flight = Flight(
  Departure=Departure(Airport='PDX', Departs='2020-10-01 12:00'),
  Arrival=Arrival(Airport='LAX', Arrives='2020-10-01 15:00'),
  Airline='AS',
  FlightNumber='231',
)
hotel = Hotel(
  Property='Marriott',
  Room='King',
  Checkin='2020-10-01',
  Checkout='2020-10-05',
)
trip = [flight, hotel]

In [11]:
# Pretty-print the named tuples; unlike plain tuples, their repr shows field names.
pp(trip)

[Flight(Departure=Departure(Airport='PDX', Departs='2020-10-01 12:00'), Arrival=Arrival(Airport='LAX', Arrives='2020-10-01 15:00'), Airline='AS', FlightNumber='231'),
 Hotel(Property='Marriott', Room='King', Checkin='2020-10-01', Checkout='2020-10-05')]


In [12]:
# Add a car

# Classes
* Classes allow us to dramatically increase our readability
* We can add behaviors (with functions)
* We can avoid duplication (Departure / Arrival)
* Amount of code increases 

In [13]:
class Airport:
  """An airport, represented by its code."""

  def __init__(self, code):
    # The code is stored exactly as given; nothing is validated here.
    self.code = code

class Location:
  """An airport together with a date; one type serves both ends of a flight."""

  def __init__(self, airport, date):
    # Both values are stored as given, without conversion.
    self.airport, self.date = airport, date

class Flight:
  """A flight: departure and arrival endpoints plus airline and flight number."""

  def __init__(self, departure, arrival, airline, flight_number):
    # Endpoints first, then the carrier details; all are stored as given.
    self.departure, self.arrival = departure, arrival
    self.airline, self.flight_number = airline, flight_number

class Hotel:
  """A hotel stay: property, room type, and check-in / check-out dates."""

  def __init__(self, property, room, checkin, checkout):
    # `property` shadows the builtin only inside this method; the parameter
    # name is kept because callers may pass it by keyword.
    self.property, self.room = property, room
    self.checkin, self.checkout = checkin, checkout

class Trip:
  """A trip that accumulates flights and hotels in two separate lists."""

  def __init__(self):
    # Every trip starts empty and owns its own fresh lists.
    self.flights, self.hotels = [], []

  def add_flight(self, flight):
    """Append `flight` to this trip's flights."""
    self.flights.append(flight)

  def add_hotel(self, hotel):
    """Append `hotel` to this trip's hotels."""
    self.hotels.append(hotel)

In [14]:
# Build the same itinerary with the hand-written classes, naming every field.
flight = Flight(
  departure=Location(airport='PDX', date='2020-10-01 12:00'),
  arrival=Location(airport='LAX', date='2020-10-01 15:00'),
  airline='AS',
  flight_number='231',
)
hotel = Hotel(
  property='Marriott',
  room='King',
  checkin='2020-10-01',
  checkout='2020-10-05',
)

trip = Trip()
trip.add_flight(flight)
trip.add_hotel(hotel)

In [15]:
# The hand-written Trip class defines no __repr__, so this prints only the
# default object repr (class name and memory address), not the trip contents.
pp(trip)

<__main__.Trip object at 0x7fcae44d0278>


In [16]:
# Add a car

# Data Classes
* Reduces Python class boilerplate to easy-to-follow field declarations
* Easy to annotate (appear more like structures)
* Type safety hints
* Collections require new syntax

In [17]:
@dataclass
class Location:
  """An airport and a date; used for both ends of a flight."""
  airport: str  # airport code, e.g. 'PDX'
  date: str     # date and time kept as text, e.g. '2020-10-01 12:00'

@dataclass
class Flight:
  """A flight described by its two endpoints, airline and flight number."""
  # Endpoints share the Location type instead of separate departure/arrival types.
  departure: Location
  arrival: Location
  # Carrier details are kept as text.
  airline: str
  flight_number: str

@dataclass
class Hotel:
  """A hotel stay: property, room type, and check-in / check-out dates."""
  property: str  # e.g. 'Marriott'
  room: str      # e.g. 'King'
  checkin: str   # date kept as text, e.g. '2020-10-01'
  checkout: str  # date kept as text, e.g. '2020-10-05'

@dataclass
class Trip:
  """A trip that collects flights and hotels."""
  # default_factory hands every Trip its own new list; a literal [] default
  # is rejected by dataclasses because it would be shared between instances.
  flights: List[Flight] = field(default_factory=list)
  hotels: List[Hotel] = field(default_factory=list)

  def add_flight(self, flight: Flight) -> None:
    """Append `flight` to this trip's flights."""
    self.flights.append(flight)

  def add_hotel(self, hotel: Hotel) -> None:
    """Append `hotel` to this trip's hotels."""
    self.hotels.append(hotel)

In [18]:
# Build the itinerary with the dataclasses, naming every field explicitly.
flight = Flight(
  departure=Location(airport='PDX', date='2020-10-01 12:00'),
  arrival=Location(airport='LAX', date='2020-10-01 15:00'),
  airline='AS',
  flight_number='231',
)
hotel = Hotel(
  property='Marriott',
  room='King',
  checkin='2020-10-01',
  checkout='2020-10-05',
)

trip = Trip()
trip.add_flight(flight)
trip.add_hotel(hotel)

# The generated __repr__ shows the whole nested structure.
pp(trip)

Trip(flights=[Flight(departure=Location(airport='PDX', date='2020-10-01 12:00'), arrival=Location(airport='LAX', date='2020-10-01 15:00'), airline='AS', flight_number='231')], hotels=[Hotel(property='Marriott', room='King', checkin='2020-10-01', checkout='2020-10-05')])


In [19]:
# Add car

# Attrs
* Classes without boilerplate
* Similar to dataclasses
* Support immutable (Frozen)
* Support for validators

In [20]:
@attr.s
class Trip:
  """A trip that collects flights and hotels."""
  # factory=list is attrs shorthand for default=attr.Factory(list):
  # each Trip gets its own new list.
  flights = attr.ib(factory=list)
  hotels = attr.ib(factory=list)

  def add_flight(self, flight):
    """Append `flight` to this trip's flights."""
    self.flights.append(flight)

  def add_hotel(self, hotel):
    """Append `hotel` to this trip's hotels."""
    self.hotels.append(hotel)

@attr.s
class Location:
  """An airport and a date; used for both ends of a flight."""
  # Both attributes are mandatory and untyped.
  airport = attr.ib()
  date = attr.ib()

@attr.s
class Flight:
  """A flight described by its two endpoints, airline and flight number.

  All four attributes are mandatory.
  """
  # attr.ib's first positional parameter is `default`, not the type. Passing
  # the class positionally would make the class object itself the attribute's
  # default value, so the declared type goes in the `type` keyword instead.
  departure = attr.ib(type=Location)
  arrival = attr.ib(type=Location)
  airline = attr.ib(type=str)
  flight_number = attr.ib(type=str)

@attr.s
class Hotel:
  """A hotel stay: property, room type, and check-in / check-out dates."""
  # All four attributes are mandatory and untyped.
  property = attr.ib()
  room = attr.ib()
  checkin = attr.ib()
  checkout = attr.ib()

In [21]:
# Build the itinerary with the attrs classes, naming every field explicitly.
flight = Flight(
  departure=Location(airport='PDX', date='2020-10-01 12:00'),
  arrival=Location(airport='LAX', date='2020-10-01 15:00'),
  airline='AS',
  flight_number='231',
)
hotel = Hotel(
  property='Marriott',
  room='King',
  checkin='2020-10-01',
  checkout='2020-10-05',
)

trip = Trip()
trip.add_flight(flight)
trip.add_hotel(hotel)

# attrs generates a __repr__ that shows the whole nested structure.
pp(trip)

Trip(flights=[Flight(departure=Location(airport='PDX', date='2020-10-01 12:00'), arrival=Location(airport='LAX', date='2020-10-01 15:00'), airline='AS', flight_number='231')], hotels=[Hotel(property='Marriott', room='King', checkin='2020-10-01', checkout='2020-10-05')])


In [22]:
# Add car

## Immutable

In [23]:
@attr.s(frozen=True)
class ImmutableFlight:
  """A flight whose attributes cannot be reassigned after construction.

  frozen=True makes attrs reject attribute assignment on instances.
  All four attributes are mandatory.
  """
  # attr.ib's first positional parameter is `default`, not the type. Passing
  # the class positionally would make the class object itself the attribute's
  # default value, so the declared type goes in the `type` keyword instead.
  departure = attr.ib(type=Location)
  arrival = attr.ib(type=Location)
  airline = attr.ib(type=str)
  flight_number = attr.ib(type=str)

# Build one frozen flight, naming every field.
f = ImmutableFlight(
  departure=Location('PDX', '2020-10-01 12:00'),
  arrival=Location('LAX', '2020-10-01 15:00'),
  airline='AS',
  flight_number='231',
)
# Uncomment the assignment below to see the frozen class reject it
# (attrs raises attr.exceptions.FrozenInstanceError):
# f.flight_number = '2231'

## Validation

In [24]:

@attr.s(frozen=True)
class ImmutableValidatingFlight:
  """A frozen flight whose flight number is range-checked on construction.

  All four attributes are mandatory.
  """
  # attr.ib's first positional parameter is `default`, not the type. Passing
  # the class positionally would make the class object itself the attribute's
  # default value, so the declared type goes in the `type` keyword instead.
  departure = attr.ib(type=Location)
  arrival = attr.ib(type=Location)
  airline = attr.ib(type=str)
  flight_number = attr.ib(type=str)

  @flight_number.validator
  def valid_flight(self, attribute, value):
    """Reject flight numbers outside 1..9999.

    Called by attrs with the instance, the attribute definition and the
    value being set.

    Raises:
      ValueError: if the number is below 1 or above 9999. int() also raises
        on its own when `value` cannot be converted to an integer.
    """
    num = int(value)
    # The lower bound is 1, matching the error message; 0 is not accepted.
    if num < 1 or num > 9999:
      raise ValueError('Flight must be integer from 1 and 9999')

# Build a flight that passes validation. The flight number is given as text,
# matching the attribute's declared str type and the other examples; the
# validator converts it with int() before checking the range.
f = ImmutableValidatingFlight(Location('PDX', '2020-10-01 12:00'),
                              Location('LAX', '2020-10-01 15:00'),
                              airline='AS',
                              flight_number='231')