### Goal 1:

Create an independent iterator for each of the four files. Each iterator yields cleaned-up rows whose fields are converted to the correct type (e.g. string, int, date) and are represented as a named tuple.

In [1]:
import csv
from datetime import datetime
from collections import namedtuple
from itertools import islice, chain, compress, groupby

> Read the files

In [2]:
def csv_reader(file_path, include_header=False):
    """Lazily yield the rows of a CSV file as lists of strings.

    Args:
        file_path: Path of the CSV file to read.
        include_header: When true, the first row (the header) is yielded
            too; when false (the default) it is skipped.

    Yields:
        One list of column strings per parsed CSV row.

    The file is opened when iteration starts and is closed when the
    generator is exhausted or closed.
    """
    # newline='' lets the csv module do its own line-ending handling, which
    # the csv documentation requires for quoted fields with embedded newlines.
    with open(file_path, newline='') as f:
        rows = csv.reader(f)
        if not include_header:
            # Skip one *parsed* row rather than one physical line, so a header
            # containing a quoted line break is dropped in full. The default
            # avoids StopIteration (a RuntimeError inside a generator) when
            # the file is empty.
            next(rows, None)
        yield from rows

In [3]:
# Input CSV files: employment, personal info, update status, vehicles.
file_paths = (
    './employment.csv',
    './personal_info.csv',
    './update_status.csv',
    './vehicles.csv',
)

> Parser

In [4]:
def parse_date(data, fmt='%Y-%m-%dT%H:%M:%SZ'):
    """Convert a timestamp string into a ``datetime``.

    Args:
        data: The text to parse.
        fmt: ``strptime`` pattern describing ``data``; the default matches
            strings such as ``2017-10-07T00:14:42Z``.

    Returns:
        The ``datetime`` produced by ``datetime.strptime``.

    Raises:
        ValueError: If ``data`` does not match ``fmt``.
    """
    parsed = datetime.strptime(data, fmt)
    return parsed

In [5]:
# One converter per column, in column order, for each input file.
employment_parser = (str,) * 4
personal_info_parser = (str,) * 5
update_status_parser = (str,) + (parse_date,) * 2
vehicles_parser = (str,) * 3 + (int,)

# All per-file parsers grouped in one tuple.
parsers = (
    employment_parser,
    personal_info_parser,
    update_status_parser,
    vehicles_parser,
)

> Header

In [6]:
def extract_header(file_path):
    """Return the first (header) row of a CSV file.

    Args:
        file_path: Path of the CSV file.

    Returns:
        The first row produced by ``csv_reader`` with the header included.

    Raises:
        StopIteration: If the reader yields no rows (empty file).
    """
    reader = csv_reader(file_path=file_path, include_header=True)
    try:
        return next(reader)
    finally:
        # After the first row the generator is suspended inside its
        # ``with open(...)`` block; closing it runs that block's exit and
        # releases the file handle now instead of at garbage collection.
        reader.close()

> Namedtuple

In [7]:
def create_namedtuple(file_path, typename):
    """Build a namedtuple class whose fields are the file's header columns.

    Args:
        file_path: Path of the CSV file whose header supplies the field names.
        typename: Name given to the generated namedtuple class.

    Returns:
        The new namedtuple class.
    """
    header = extract_header(file_path=file_path)
    record_cls = namedtuple(typename, header)
    return record_cls

In [8]:
# namedtuple class names: employment, personal info, update status, vehicles.
typenames = ('Employment', 'Personal', 'UpdateStatus', 'Vehicle')

# Individual names, unpacked from the tuple above.
employment_name, personal_info_name, update_status_name, vehicle_name = typenames

> Iterator

In [9]:
def iter_file(file_path, typename, parser):
    """Yield every data row of a CSV file as a typed namedtuple.

    Args:
        file_path: Path of the CSV file.
        typename: Name of the namedtuple class built from the file's header.
        parser: Sequence of one-argument converters, applied to the row's
            columns position by position.

    Yields:
        One namedtuple instance per data row (the header is skipped).
    """
    record_cls = create_namedtuple(file_path, typename)

    for raw_row in csv_reader(file_path):
        # Pair each converter with the column at the same position.
        converted = [convert(value) for convert, value in zip(parser, raw_row)]
        yield record_cls(*converted)

In [10]:
# One lazy, independent iterator per input file.
employment_iter = iter_file(
    file_path='./employment.csv',
    typename=employment_name,
    parser=employment_parser,
)
personal_info_iter = iter_file(
    file_path='./personal_info.csv',
    typename=personal_info_name,
    parser=personal_info_parser,
)
update_status_iter = iter_file(
    file_path='./update_status.csv',
    typename=update_status_name,
    parser=update_status_parser,
)
vehicle_iter = iter_file(
    file_path='./vehicles.csv',
    typename=vehicle_name,
    parser=vehicles_parser,
)

---

### Goal 2:

Create a single iterable that combines all the columns from all four iterators.

In [11]:
# Column selectors: one flag per column of each file, True = keep.
# Every file after the first drops its first column.
employment_field = [True] * 4
personal_info_field = [False] + [True] * 4
update_status_field = [False] + [True] * 2
vehicle_field = [False] + [True] * 3

# All selectors flattened into one tuple, in file order.
compressed_fields = (
    *employment_field,
    *personal_info_field,
    *update_status_field,
    *vehicle_field,
)

In [12]:
def extract_compressed_header(file_paths=file_paths, selectors=None):
    """Return the combined header of several CSV files, filtered by selectors.

    Args:
        file_paths: Paths of the CSV files whose headers are concatenated in
            order. Defaults to the module-level ``file_paths``.
        selectors: Truthy/falsy flags, one per concatenated column, choosing
            which names to keep. ``None`` (the default) uses the module-level
            ``compressed_fields``, as the function always did before this
            parameter existed.

    Returns:
        A tuple of the selected column names.

    Raises:
        StopIteration: If one of the readers yields no rows (empty file).
    """
    if selectors is None:
        selectors = compressed_fields

    names = []
    for file_path in file_paths:
        reader = csv_reader(file_path=file_path, include_header=True)
        try:
            names.extend(next(reader))
        finally:
            # Only the first row is needed; close the suspended generator so
            # the underlying file is released immediately.
            reader.close()
    return tuple(compress(names, selectors))

In [13]:
def iter_compressed_file(file_paths, typenames, parsers, compressed_fields):
    """Yield one combined namedtuple per row across several CSV files.

    Row ``i`` of every file is converted with that file's parser, the
    resulting tuples are concatenated in file order, and only the columns
    whose flag in ``compressed_fields`` is truthy are kept.

    Args:
        file_paths: Paths of the CSV files to combine.
        typenames: namedtuple class name for each file, in the same order.
        parsers: Per-file sequence of column converters, in the same order.
        compressed_fields: One flag per concatenated column; truthy = keep.

    Yields:
        ``Compressed`` namedtuple instances. Iteration stops as soon as the
        shortest file is exhausted.
    """
    # Materialise the flags once so they can be reused for every row.
    selectors = tuple(compressed_fields)

    # Build the header from the files and selectors passed in, not from
    # module-level defaults, so header and data always describe the same
    # columns.
    full_header = chain.from_iterable(extract_header(file_path) for file_path in file_paths)
    compressed_nt = namedtuple('Compressed', tuple(compress(full_header, selectors)))

    # One typed-row generator per file.
    file_iters = [
        iter_file(file_path, typename, parser)
        for file_path, typename, parser in zip(file_paths, typenames, parsers)
    ]

    try:
        for records in zip(*file_iters):
            # Flatten the per-file tuples into a single sequence of values.
            merged = chain.from_iterable(records)
            yield compressed_nt(*compress(merged, selectors))
    finally:
        # zip stops at the shortest input and the consumer may stop early;
        # close every generator so no file handle is left open.
        for file_iter in file_iters:
            file_iter.close()

In [14]:
# Lazy iterator over the combined, column-filtered records.
compressed_iter = iter_compressed_file(
    file_paths=file_paths,
    typenames=typenames,
    parsers=parsers,
    compressed_fields=compressed_fields,
)

In [15]:
# Preview the first five combined records; islice consumes them from the iterator.
for row in islice(compressed_iter, 5):
    print(row, '\n')

Compressed(employer='Stiedemann-Bailey', department='Research and Development', employee_id='29-0890771', ssn='100-53-9824', first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic', last_updated=datetime.datetime(2017, 10, 7, 0, 14, 42), created=datetime.datetime(2016, 1, 24, 21, 19, 30), vehicle_make='Oldsmobile', vehicle_model='Bravada', model_year=1993) 

Compressed(employer='Nicolas and Sons', department='Sales', employee_id='41-6841359', ssn='101-71-4702', first_name='Cayla', last_name='MacDonagh', gender='Female', language='Lao', last_updated=datetime.datetime(2017, 1, 23, 11, 23, 17), created=datetime.datetime(2016, 1, 27, 4, 32, 57), vehicle_make='Ford', vehicle_model='Mustang', model_year=1997) 

Compressed(employer='Connelly Group', department='Research and Development', employee_id='98-7952860', ssn='101-84-0356', first_name='Nomi', last_name='Lipprose', gender='Female', language='Yiddish', last_updated=datetime.datetime(2017, 10, 4, 11, 21, 30), cre

---
### Goal 3:

Identify any stale records, where stale simply means the record has not been updated since 3/1/2017 (i.e. last update date < 3/1/2017). Create an iterator that only contains current records (i.e. not stale) based on the `last_updated` field from the `update_status` file.

In [16]:
# Build a fresh combined iterator for this goal.
compressed_iter = iter_compressed_file(
    file_paths=file_paths,
    typenames=typenames,
    parsers=parsers,
    compressed_fields=compressed_fields,
)

In [17]:
# A record is stale when last_updated < 2017-03-01, so a current record is one
# with last_updated >= the cutoff; the comparison must include the cutoff.
stale_cutoff = datetime(2017, 3, 1)
non_stale_iter = filter(lambda x: x.last_updated >= stale_cutoff, compressed_iter)

In [18]:
# Show the last_updated timestamp of the first five records that pass the filter.
for row in islice(non_stale_iter, 5):
    print(row.last_updated, '\n')

2017-10-07 00:14:42 

2017-10-04 11:21:30 

2017-03-28 12:38:29 

2018-02-19 01:34:33 

2017-07-24 08:58:52 



---
### Goal 4:

For non-stale records, count the number of records per car make, separately for each gender.

> Male iterator

In [19]:
# Fresh combined iterator for the male pipeline.
compressed_iter = iter_compressed_file(
    file_paths=file_paths,
    typenames=typenames,
    parsers=parsers,
    compressed_fields=compressed_fields,
)
# Stale means last_updated < 2017-03-01, so current records include the cutoff.
stale_cutoff = datetime(2017, 3, 1)
non_stale_iter = filter(lambda x: x.last_updated >= stale_cutoff, compressed_iter)
male = filter(lambda x: x.gender == 'Male', non_stale_iter)

In [20]:
# Mapping filled in by the next cell: vehicle make -> record count.
group_m_car_make = dict()

# groupby only merges adjacent items, so the records are sorted by make first.
group_m = groupby(
    sorted(male, key=lambda rec: rec.vehicle_make),
    key=lambda rec: rec.vehicle_make,
)

In [21]:
for key, group in group_m:
    # Count the members of each group without building an intermediate list.
    group_m_car_make[key] = sum(1 for _ in group)

In [22]:
# Display the per-make record counts for non-stale male records.
group_m_car_make

{'Acura': 7,
 'Aptera': 1,
 'Aston Martin': 3,
 'Audi': 14,
 'Austin': 1,
 'BMW': 12,
 'Bentley': 3,
 'Buick': 13,
 'Cadillac': 9,
 'Chevrolet': 30,
 'Chrysler': 3,
 'Corbin': 1,
 'Daewoo': 1,
 'Dodge': 22,
 'Eagle': 1,
 'Ford': 40,
 'GMC': 28,
 'Geo': 2,
 'Honda': 9,
 'Hyundai': 8,
 'Infiniti': 7,
 'Isuzu': 3,
 'Jaguar': 4,
 'Jeep': 7,
 'Jensen': 1,
 'Kia': 5,
 'Lamborghini': 4,
 'Land Rover': 3,
 'Lexus': 6,
 'Lincoln': 5,
 'Lotus': 5,
 'Maserati': 3,
 'Maybach': 2,
 'Mazda': 13,
 'Mercedes-Benz': 19,
 'Mercury': 11,
 'Mitsubishi': 28,
 'Nissan': 6,
 'Oldsmobile': 5,
 'Panoz': 2,
 'Plymouth': 4,
 'Pontiac': 11,
 'Porsche': 4,
 'Rolls-Royce': 1,
 'Saab': 8,
 'Saturn': 3,
 'Scion': 1,
 'Smart': 1,
 'Subaru': 8,
 'Suzuki': 2,
 'Toyota': 21,
 'Volkswagen': 16,
 'Volvo': 10}

> Female iterator

In [23]:
# Fresh combined iterator for the female pipeline.
compressed_iter = iter_compressed_file(
    file_paths=file_paths,
    typenames=typenames,
    parsers=parsers,
    compressed_fields=compressed_fields,
)
# Stale means last_updated < 2017-03-01, so current records include the cutoff.
stale_cutoff = datetime(2017, 3, 1)
non_stale_iter = filter(lambda x: x.last_updated >= stale_cutoff, compressed_iter)
female = filter(lambda x: x.gender == 'Female', non_stale_iter)

In [24]:
# Mapping filled in by the next cell: vehicle make -> record count.
group_f_car_make = dict()

# groupby only merges adjacent items, so the records are sorted by make first.
group_f = groupby(
    sorted(female, key=lambda rec: rec.vehicle_make),
    key=lambda rec: rec.vehicle_make,
)

In [25]:
for key, group in group_f:
    # Count the members of each group without building an intermediate list.
    group_f_car_make[key] = sum(1 for _ in group)

In [26]:
# Display the per-make record counts for non-stale female records.
group_f_car_make

{'Acura': 9,
 'Aston Martin': 2,
 'Audi': 13,
 'Austin': 1,
 'BMW': 12,
 'Bentley': 4,
 'Bugatti': 1,
 'Buick': 11,
 'Cadillac': 6,
 'Chevrolet': 42,
 'Chrysler': 6,
 'Dodge': 17,
 'Eagle': 1,
 'Ford': 42,
 'GMC': 22,
 'Geo': 1,
 'Honda': 8,
 'Hyundai': 4,
 'Infiniti': 9,
 'Isuzu': 3,
 'Jaguar': 3,
 'Jeep': 5,
 'Kia': 9,
 'Lamborghini': 2,
 'Land Rover': 8,
 'Lexus': 15,
 'Lincoln': 4,
 'Lotus': 5,
 'Mazda': 13,
 'Mercedes-Benz': 17,
 'Mercury': 5,
 'Mitsubishi': 22,
 'Morgan': 1,
 'Nissan': 12,
 'Oldsmobile': 8,
 'Panoz': 1,
 'Plymouth': 3,
 'Pontiac': 14,
 'Porsche': 3,
 'Rolls-Royce': 1,
 'Saab': 3,
 'Saturn': 3,
 'Scion': 2,
 'Subaru': 6,
 'Suzuki': 12,
 'Toyota': 20,
 'Volkswagen': 10,
 'Volvo': 13}