# More Unit Testing

In [1]:
import numpy as np
import matplotlib
%matplotlib inline
import pandas as pd
import sys
# (display name, module) pairs for the third-party libraries this notebook relies on.
libraries = (('Matplotlib', matplotlib), ('Numpy', np), ('Pandas', pd))

# Report the interpreter and library versions so the run can be reproduced later.
print("Python Version:", sys.version, '\n')
for label, module in libraries:
    print('{0} Version: {1}'.format(label, module.__version__))

Python Version: 3.7.1 (default, Dec 14 2018, 13:28:58) 
[Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 3.0.2
Numpy Version: 1.15.4
Pandas Version: 0.23.4


In [9]:
# Download the three weekly MTA turnstile files used by the exercises below.
# `-P data` saves the files under the data/ directory, and `-nc` (no-clobber) skips
# any file that is already present, so this cell can be re-run safely.
# !mkdir data
!wget -nc -P data https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160903.txt https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160910.txt https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160917.txt

--2019-10-16 23:57:08--  https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160903.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.26.230
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.26.230|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25301340 (24M) [text/plain]
Saving to: ‘data/mta_turnstile_160903.txt’


2019-10-16 23:57:21 (2.06 MB/s) - ‘data/mta_turnstile_160903.txt’ saved [25301340/25301340]

--2019-10-16 23:57:21--  https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160910.txt
Reusing existing connection to s3.amazonaws.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 25529149 (24M) [text/plain]
Saving to: ‘data/mta_turnstile_160910.txt’


2019-10-16 23:57:25 (6.86 MB/s) - ‘data/mta_turnstile_160910.txt’ saved [25529149/25529149]

--2019-10-16 23:57:25--  https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160917.txt
Reusing existing connection to s3.amazonaws.com:443.
HTTP request sent, awaiting

## Exercise: Unit Testing with Real Data

We're going to revisit the MTA data and get started with building some unit tests together. I'm providing the tests in the `TestDataLoader` class; you need to write a function that:
* takes in a list of week IDs as input
* loads the dataframe corresponding to those week IDs (check out the data folder) and combines them
* returns the single dataframe

Your function should pass all of the tests. Note that some of them require minimal cleaning (for example, stripping whitespace from the column names) before the dataframe is returned!

In [3]:
def load_data_into_dataframe():
    """Exercise placeholder that does nothing and returns ``None``.

    Replace it with a loader that accepts a list of week IDs and returns the
    combined dataframe, as described in the exercise text.
    """

In [4]:
def clean_column_names(df):
    """Strip leading and trailing whitespace from every column name of ``df``.

    The column labels are replaced on ``df`` itself, and that same dataframe
    is returned so the call can be used in an assignment.
    """
    df.columns = [label.strip() for label in df.columns]
    return df

def load_data_into_dataframe(week_nums, data_dir="./data"):
    """Load one or more weekly MTA turnstile files into a single dataframe.

    Each week ID is substituted into ``<data_dir>/mta_turnstile_<week>.txt``,
    the file is read with ``pd.read_csv``, and the per-week frames are
    concatenated. Column names are stripped of surrounding whitespace by
    ``clean_column_names`` before the result is returned.

    Parameters
    ----------
    week_nums : list, tuple or set
        Week IDs (e.g. ``160903``) identifying the files to load.
    data_dir : str, optional
        Directory that holds the turnstile files. Defaults to ``"./data"``.

    Returns
    -------
    pandas.DataFrame
        All requested weeks stacked row-wise with a fresh 0..n-1 index.

    Raises
    ------
    TypeError
        If ``week_nums`` is not a list, tuple or set.
    ValueError
        If ``week_nums`` is empty.

    Any error raised by ``pd.read_csv`` (for example when a file is missing)
    propagates unchanged.
    """
    if not isinstance(week_nums, (tuple, list, set)):
        raise TypeError("Input Files must be a list")
    if not week_nums:
        # pd.concat([]) would raise a ValueError with an unhelpful message;
        # fail early with one that names the actual problem.
        raise ValueError("week_nums must contain at least one week ID")
    filename = "{}/mta_turnstile_{}.txt"
    dfs = [pd.read_csv(filename.format(data_dir, week_num)) for week_num in week_nums]
    # Every file is read with its own 0..n-1 index; rebuild the index so row
    # labels stay unique once several weeks are stacked together.
    df = pd.concat(dfs, ignore_index=True)
    return clean_column_names(df)

In [5]:
import unittest

class TestDataLoader(unittest.TestCase):
    
    def test_fails_without_file_list(self):
        with self.assertRaises(TypeError):
            load_data_into_dataframe()
        with self.assertRaises(TypeError):
            load_data_into_dataframe(160903)
    
    def test_output_type(self):
        self.assertIs(type(load_data_into_dataframe([160903])), type(pd.DataFrame()))
        
    def test_column_names(self):
        df = load_data_into_dataframe([160903])
        bool_cols = (df.columns == ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES','EXITS'])
        self.assertTrue(bool_cols.all())
        
    def test_multiple_files_of_data(self):
        df = load_data_into_dataframe([160903,160910])
        self.assertIs(type(df), type(pd.DataFrame()))

# Run only the TestDataLoader tests: `defaultTest` names the test class to run so
# no other tester defined in the notebook is picked up. `argv` is overridden so
# unittest does not parse the kernel's own command-line arguments, and `exit=False`
# stops unittest from calling sys.exit() when the run finishes.
unittest.main(argv=['first-arg-is-ignored'], defaultTest='TestDataLoader', exit=False)

E.EE
ERROR: test_column_names (__main__.TestDataLoader)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-2ea43de4ec22>", line 15, in test_column_names
    df = load_data_into_dataframe([160903])
  File "<ipython-input-4-d7ba1a220dbe>", line 13, in load_data_into_dataframe
    dfs.append(pd.read_csv(file_url))
  File "/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 678, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 440, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 787, in __init__
    self._make_engine(self.engine)
  File "/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/anaconda3/lib/python3.7/site-package

<unittest.main.TestProgram at 0x116bb3a58>

## Exercise 2: Writing the Function and the Tests

Now your goal is to write both the function and the tests. We're going to write a function to clean and prepare our data. The function should:

* Take in a dataframe
* Create a DATE_TIME column using the DATE and TIME columns
* Make sure that each grouping of ["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"] is unique

Write tests that check the output types of the columns, that the uniqueness requirement is being enforced, and anything else you can think of.

In ~15 minutes, we'll have someone come up and present both their code and their tests and other folks can chime in about the types of tests they've written as well.

In [6]:
# Load the week-160917 turnstile file into the notebook-level `df` that the
# TestDataCleaner tests below read. The file data/mta_turnstile_160917.txt must
# already exist (run the download cell first); otherwise pd.read_csv raises
# FileNotFoundError.
df = load_data_into_dataframe([160917])

FileNotFoundError: File b'./data/mta_turnstile_160917.txt' does not exist

In [None]:
def clean_dataframe(df):
    """Return a cleaned copy of a turnstile dataframe.

    A ``DATE_TIME`` column is built by parsing the ``DATE`` and ``TIME`` string
    columns together, then rows that repeat the same
    ("C/A", "UNIT", "SCP", "STATION", "DATE_TIME") combination are dropped,
    keeping the first occurrence.

    The input dataframe is left untouched, so the same raw frame can be
    cleaned repeatedly (for example once per test) with identical results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "C/A", "UNIT", "SCP", "STATION", "DATE"
        (``MM/DD/YYYY``) and "TIME" (``HH:MM:SS``).

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the extra ``DATE_TIME`` column and no duplicate
        key combinations.
    """
    key_columns = ["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"]
    date_time = pd.to_datetime(df["DATE"] + " " + df["TIME"], format="%m/%d/%Y %H:%M:%S")
    # assign() and drop_duplicates() both return new frames, so the caller's
    # dataframe is never modified.
    return df.assign(DATE_TIME=date_time).drop_duplicates(subset=key_columns)

In [7]:
class TestDataCleaner(unittest.TestCase):
    
    def test_fails_without_df(self):
        with self.assertRaises(TypeError):
            clean_dataframe()
    
    def test_output_type(self):
        self.assertIs(type(clean_dataframe(df)), type(pd.DataFrame()))
        
    def test_column_types(self):
        clean_df = clean_dataframe(df)
        self.assertTrue("DATE_TIME" in clean_df.columns)
        self.assertTrue(clean_df['DATE_TIME'].dtype == '<M8[ns]')
        
    def test_row_uniqueness(self):
        max_entries = (clean_dataframe(df)
                         .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
                         .ENTRIES.count()
                         .reset_index()
                         .sort_values("ENTRIES", ascending=False)).head(1)['ENTRIES'].iloc[0]
        self.assertTrue(max_entries == 1)
        
    def test_date_time_conversion(self):
        converted_date_of_known_test_row = clean_dataframe(df)['DATE_TIME'].iloc[0]
        self.assertTrue(converted_date_of_known_test_row == pd.to_datetime('2016-09-10 00:00:00'))

# Run only the TestDataCleaner tests. `defaultTest` names the test class to run,
# `argv` is overridden so unittest ignores the kernel's own command-line arguments,
# and `exit=False` stops unittest from calling sys.exit() when the run finishes.
unittest.main(argv=['first-arg-is-ignored'], defaultTest='TestDataCleaner', exit=False)

EEEEE
ERROR: test_column_types (__main__.TestDataCleaner)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-7-77906038626d>", line 11, in test_column_types
    clean_df = clean_dataframe(df)
NameError: name 'clean_dataframe' is not defined

ERROR: test_date_time_conversion (__main__.TestDataCleaner)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-7-77906038626d>", line 24, in test_date_time_conversion
    converted_date_of_known_test_row = clean_dataframe(df)['DATE_TIME'].iloc[0]
NameError: name 'clean_dataframe' is not defined

ERROR: test_fails_without_df (__main__.TestDataCleaner)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-7-77906038626d>", line 5, in test_fails_without_df
    clean_dataframe()
NameError: name 'clean_dataframe' is not define

<unittest.main.TestProgram at 0x116b5acc0>

>Instructor note: Flat files are large but kept locally in the repo to support this exercise. They are [also](https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160903.txt) [hosted](https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160910.txt) [here](https://s3.amazonaws.com/gamma-datasets/P2/mta_turnstile_160917.txt) in the Gamma S3 bucket as backup.