# T81-558: Applications of Deep Neural Networks
**Module 2: Python for Machine Learning**
* Instructor: [Jeff Heaton](https://sites.wustl.edu/jeffheaton/), School of Engineering and Applied Science, [Washington University in St. Louis](https://engineering.wustl.edu/Programs/Pages/default.aspx)
* For more information visit the [class website](https://sites.wustl.edu/jeffheaton/t81-558/).

# Module Video Material

Main video lecture:

* [Part 2.1: Dealing with Data in Python with Pandas](https://www.youtube.com/watch?v=Bj2m6hvRoNk&index=6&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN)
* [Part 2.2: Machine Learning Background for Deep Learning, Keras and Tensorflow](https://www.youtube.com/watch?v=WCXzchgxi9c&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN)
* [Part 2.3: Pandas and Machine Learning](https://www.youtube.com/watch?v=eZGunTjrHyA&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN)

# Helpful Functions

You will see these at the top of every module.  These are simply a set of reusable functions that we will make use of.  Each of them will be explained in greater detail as the course progresses.  Class 4 contains a complete overview of these functions.

In [1]:
import base64
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn import preprocessing


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    For every distinct value that ``pd.get_dummies`` reports for the column,
    a new column called ``"<name>-<value>"`` is added to ``df``.  The
    original column is then dropped.  Nothing is returned; ``df`` itself is
    modified.
    """
    indicator_frame = pd.get_dummies(df[name])
    for level in indicator_frame.columns:
        df[f"{name}-{level}"] = indicator_frame[level]
    # The indicator columns now carry the information of the source column.
    df.drop(columns=name, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    """Add one 0/1 indicator column per requested value of column ``name``.

    For every entry ``tv`` of ``target_values`` a column named
    ``"<name>-<tv>"`` is added to ``df`` (in place).  It holds 1 where the
    string form of the original value equals ``str(tv)`` and 0 elsewhere.
    The original column is left untouched and nothing is returned.
    """
    # The string form of the column is the same for every target value, so
    # build it once instead of once per loop iteration.
    as_text = list(df[name].astype(str))
    for tv in target_values:
        wanted = str(tv)
        df[f"{name}-{tv}"] = [1 if value == wanted else 0 for value in as_text]


# Encode text values to indexes (i.e. 0, 1, 2 for blue, green, red; LabelEncoder numbers the sorted classes from 0).
def encode_text_index(df, name):
    """Replace column ``name`` of ``df`` with integer class codes, in place.

    The codes are produced by a freshly fitted scikit-learn
    ``LabelEncoder``.

    Returns the encoder's ``classes_`` attribute (the values the encoder
    learned from the column).
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Replace column ``name`` of ``df`` with its z-scores, in place.

    ``mean`` and ``sd`` may be supplied by the caller.  Whichever is left as
    ``None`` is taken from the column itself via ``Series.mean()`` /
    ``Series.std()``.  Nothing is returned.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill missing values of column ``name`` with that column's median.

    The column is replaced in ``df``; nothing is returned.
    """
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill missing values of column ``name`` with ``default_value``.

    The column is replaced in ``df``; nothing is returned.
    """
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split ``df`` into float32 feature and target arrays.

    Every column other than ``target`` becomes a feature.  When the target
    column's dtype is ``np.int64`` or ``np.int32`` the target is one-hot
    encoded with ``pd.get_dummies`` (classification).  Otherwise it is
    returned as a single-column array (regression).

    Returns a tuple ``(x, y)`` of ``np.float32`` arrays.
    """
    feature_names = [column for column in df.columns if column != target]

    # ``dtypes`` is normally a single dtype here; if it turns out to be a
    # collection of dtypes instead, use the first entry.
    target_type = df[target].dtypes
    if hasattr(target_type, '__iter__'):
        target_type = target_type[0]

    # TensorFlow likes 32 bits, so both outputs are cast to float32.
    if target_type in (np.int64, np.int32):
        # Classification
        one_hot = pd.get_dummies(df[target])
        x = df[feature_names].values.astype(np.float32)
        return x, one_hot.values.astype(np.float32)

    # Regression
    x = df[feature_names].values.astype(np.float32)
    y = df[[target]].values.astype(np.float32)
    return x, y

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Hours are not padded, minutes are zero-padded to two digits, and seconds
    are shown with two decimals, zero-padded to a width of five.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"


# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot expected and predicted values as two lines with matplotlib.

    ``y`` is flattened and paired with ``pred`` in a dataframe.  When
    ``sort`` is true, the rows are ordered by the expected value before
    plotting.  The chart is shown with ``plt.show()`` and nothing is
    returned.
    """
    frame = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])
    expected = frame['y'].tolist()
    predicted = frame['pred'].tolist()
    plt.plot(expected, label='expected')
    plt.plot(predicted, label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is at least sd standard deviations away from its mean.
def remove_outliers(df, name, sd):
    """Drop, in place, the rows whose ``name`` value is an outlier.

    A row is dropped when the absolute distance between its value and the
    column mean is greater than or equal to ``sd`` times the column's
    standard deviation.  Nothing is returned.
    """
    column = df[name]
    distance = np.abs(column - column.mean())
    threshold = sd * column.std()
    outlier_labels = df.index[distance >= threshold]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Rescale column ``name`` of ``df`` linearly, in place.

    Values are mapped so that ``data_low`` lands on ``normalized_low`` and
    ``data_high`` lands on ``normalized_high``.

    Parameters:
        df: dataframe to modify.
        name: column to rescale.
        normalized_low, normalized_high: bounds of the output range.
        data_low, data_high: bounds of the input range.  Each bound that is
            left as ``None`` is taken from the column (its minimum or
            maximum, respectively).

    Nothing is returned.  There is no guard for ``data_high == data_low``;
    the division is left to pandas in that case.
    """
    column = df[name]
    # Resolve the two bounds independently so that a caller may supply just
    # one of them; the other one is then derived from the data.
    if data_low is None:
        data_low = column.min()
    if data_high is None:
        data_high = column.max()

    df[name] = ((column - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low


# This function submits an assignment.  You can submit an assignment as many times as you like; only the final
# submission counts.  The parameters are as follows:
# data - Pandas dataframe output.
# key - Your student key that was emailed to you.
# no - The assignment class number, should be 1 through 1.
# source_file - The full path to your Python or IPYNB file.  This must have "_class1" as part of its name.  
# .             The number must match your assignment number.  For example "_class2" for class assignment #2.
def submit(data, key, no, source_file=None):
    """Post an assignment (CSV output plus source file) to the submit API.

    Parameters:
        data: object whose ``to_csv(index=False)`` output is submitted.
        key: value sent in the ``x-api-key`` request header.
        no: assignment number; ``_class<no>`` must occur in the file name.
        source_file: path of the ``.py`` or ``.ipynb`` file to upload.
            Defaults to ``__file__`` when that name is defined.

    Raises ``Exception`` when no source file can be determined, when the
    file name lacks the ``_class<no>`` suffix, or when the extension is
    neither ``.py`` nor ``.ipynb``.  The server's reply is printed, prefixed
    with "Success" for HTTP 200 and "Failure" otherwise.
    """
    if source_file is None:
        if '__file__' not in globals():
            raise Exception('Must specify a filename when a Jupyter notebook.')
        source_file = __file__

    suffix = '_class{}'.format(no)
    if suffix not in source_file:
        raise Exception('{} must be part of the filename.'.format(suffix))

    # The source file is sent base64-encoded.
    with open(source_file, "rb") as source_handle:
        encoded_python = base64.b64encode(source_handle.read()).decode('ascii')

    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb', '.py']:
        raise Exception("Source file is {} must be .py or .ipynb".format(ext))

    csv_bytes = data.to_csv(index=False).encode('ascii')
    payload = {
        'csv': base64.b64encode(csv_bytes).decode("ascii"),
        'assignment': no,
        'ext': ext,
        'py': encoded_python,
    }
    r = requests.post("https://api.heatonresearch.com/assignment-submit",
                      headers={'x-api-key': key}, json=payload)

    if r.status_code == 200:
        print("Success: {}".format(r.text))
    else:
        print("Failure: {}".format(r.text))

# Pandas

[Pandas](http://pandas.pydata.org/) is an open source library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.  It is based on the [dataframe](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) concept found in the [R programming language](https://www.r-project.org/about.html).  For this class, Pandas will be the primary means by which data is manipulated in conjunction with neural networks.

The dataframe is a key component of Pandas.  We will use it to access the [auto-mpg dataset](https://archive.ics.uci.edu/ml/datasets/Auto+MPG).  This dataset can be found on the UCI machine learning repository.  For this class we will use a version of the Auto MPG dataset where I added column headers.  You can find my version [here](https://raw.githubusercontent.com/jeffheaton/t81_558_deep_learning/master/data/auto-mpg.csv).

This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University. The dataset was used in the 1983 American Statistical Association Exposition.  It contains data for 398 cars, including [mpg](https://en.wikipedia.org/wiki/Fuel_economy_in_automobiles), [cylinders](https://en.wikipedia.org/wiki/Cylinder_(engine)), [displacement](https://en.wikipedia.org/wiki/Engine_displacement), [horsepower](https://en.wikipedia.org/wiki/Horsepower) , weight, acceleration, model year, origin and the car's name.

The following code loads the MPG dataset into a dataframe:

In [2]:
# Simple dataframe
import os
import pandas as pd

path = "./data/"

# Build the location of the CSV file and load it with pandas' defaults.
filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read)

# Show the first five rows.
print(df.head(5))

    mpg  cylinders  displacement horsepower  weight  acceleration  year  \
0  18.0          8         307.0        130    3504          12.0    70   
1  15.0          8         350.0        165    3693          11.5    70   
2  18.0          8         318.0        150    3436          11.0    70   
3  16.0          8         304.0        150    3433          12.0    70   
4  17.0          8         302.0        140    3449          10.5    70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [3]:
# Perform basic statistics on a dataframe.

import os
import pandas as pd

path = "./data/"

# 'NA' and '?' are read as missing values.
filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Strip non-numerics
df = df.select_dtypes(include=['int', 'float'])

# One summary record (mean, variance, standard deviation) per column.
headers = list(df.columns.values)
fields = [
    {
        'name': field,
        'mean': df[field].mean(),
        'var': df[field].var(),
        'sdev': df[field].std(),
    }
    for field in headers
]

for field in fields:
    print(field)

{'name': 'mpg', 'mean': 23.514572864321607, 'var': 61.089610774274405, 'sdev': 7.815984312565782}
{'name': 'cylinders', 'mean': 5.454773869346734, 'var': 2.893415439920003, 'sdev': 1.7010042445332119}
{'name': 'displacement', 'mean': 193.42587939698493, 'var': 10872.199152247384, 'sdev': 104.26983817119591}
{'name': 'horsepower', 'mean': 104.46938775510205, 'var': 1481.5693929745814, 'sdev': 38.49115993282849}
{'name': 'weight', 'mean': 2970.424623115578, 'var': 717140.9905256763, 'sdev': 846.8417741973268}
{'name': 'acceleration', 'mean': 15.568090452261307, 'var': 7.604848233611383, 'sdev': 2.757688929812676}
{'name': 'year', 'mean': 76.01005025125629, 'var': 13.672442818627143, 'sdev': 3.697626646732623}
{'name': 'origin', 'mean': 1.5728643216080402, 'var': 0.6432920268850549, 'sdev': 0.8020548777266148}


## Sorting and Shuffling Dataframes
It is possible to sort and shuffle dataframes.  

In [4]:
import os
import pandas as pd
import numpy as np

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

#np.random.seed(42) # Uncomment this line to get the same shuffle each time
# Reorder the rows by a random permutation of the index, then renumber the
# rows and discard the old labels.
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index).reset_index(drop=True)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,6,232.0,100.0,2945,16.0,73,1,amc hornet
1,26.8,6,173.0,115.0,2700,12.9,79,1,oldsmobile omega brougham
2,17.0,8,305.0,130.0,3840,15.4,79,1,chevrolet caprice classic
3,13.0,8,400.0,175.0,5140,12.0,71,1,pontiac safari (sw)
4,43.4,4,90.0,48.0,2335,23.7,80,2,vw dasher (diesel)
5,20.2,6,200.0,85.0,2965,15.8,78,1,ford fairmont (auto)
6,16.0,8,318.0,150.0,4498,14.5,75,1,plymouth grand fury
7,21.6,4,121.0,115.0,2795,15.7,78,2,saab 99gle
8,21.5,6,231.0,115.0,3245,15.4,79,1,pontiac lemans v6
9,21.5,3,80.0,110.0,2720,13.5,77,3,mazda rx-4


In [5]:
import os
import pandas as pd

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Sort by car name (ascending is the default); the original row labels are
# kept, so the index is no longer in order afterwards.
df = df.sort_values(by='name')
first_car = df['name'].iloc[0]
print(f"The first car is: {first_car}")
print(df.head(5))

The first car is: amc ambassador brougham
      mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
96   13.0          8         360.0       175.0    3821          11.0    73   
9    15.0          8         390.0       190.0    3850           8.5    70   
66   17.0          8         304.0       150.0    3672          11.5    72   
315  24.3          4         151.0        90.0    3003          20.1    80   
257  19.4          6         232.0        90.0    3210          17.2    78   

     origin                     name  
96        1  amc ambassador brougham  
9         1       amc ambassador dpl  
66        1       amc ambassador sst  
315       1              amc concord  
257       1              amc concord  


## Saving a Dataframe

Many of the assignments in this course will require that you save a dataframe to submit to the instructor.  The following code performs a shuffle and then saves a new copy.

In [6]:
import os
import pandas as pd
import numpy as np

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
filename_write = os.path.join(path, "auto-mpg-shuffle.csv")

df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Shuffle the rows before saving.
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# index=False keeps the row numbers out of the written file.
df.to_csv(filename_write, index=False)
print("Done")

Done


## Dropping Fields

Some fields are of no value to the neural network and can be dropped.  The following code removes the name column from the MPG dataset.

In [7]:
import os
import pandas as pd

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

print(f"Before drop: {df.columns}")
# Remove the 'name' column in place.  The axis is passed by keyword because
# newer pandas releases no longer accept it as a positional argument.
df.drop('name', axis=1, inplace=True)
print(f"After drop: {df.columns}")

Before drop: Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name'],
      dtype='object')
After drop: Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin'],
      dtype='object')


## Calculated Fields

It is possible to add new fields to the dataframe that are calculated from the other fields.  We can create a new column that gives the weight in kilograms.  The equation to calculate a metric weight, given a weight in pounds is:

$ m_{(kg)} = m_{(lb)} \times 0.45359237 $

This can be used with the following Python code:

In [8]:
import os
import pandas as pd

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Convert pounds to kilograms and cast the result to integers.
weight_kg = (df['weight'] * 0.45359237).astype(int)

# Place the new column at position 1, right after the first column.
df.insert(1, 'weight_kg', weight_kg)
df

Unnamed: 0,mpg,weight_kg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,1589,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,1675,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,1558,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,1557,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,1564,8,302.0,140.0,3449,10.5,70,1,ford torino
5,15.0,1969,8,429.0,198.0,4341,10.0,70,1,ford galaxie 500
6,14.0,1974,8,454.0,220.0,4354,9.0,70,1,chevrolet impala
7,14.0,1955,8,440.0,215.0,4312,8.5,70,1,plymouth fury iii
8,14.0,2007,8,455.0,225.0,4425,10.0,70,1,pontiac catalina
9,15.0,1746,8,390.0,190.0,3850,8.5,70,1,amc ambassador dpl


# Field Transformation & Preprocessing

The data fed into a machine learning model rarely bears much similarity to the data that the data scientist originally received. One common transformation is to normalize the inputs.  A normalization allows numbers to be put in a standard form so that two values can easily be compared.  Consider if a friend told you that he received a $10 discount.  Is this a good deal?  Maybe.  But the value is not normalized.  If your friend purchased a car, then the discount is not that good.  If your friend purchased dinner, this is a very good discount!

Percentages are a very common form of normalization.  If your friend tells you they got 10% off, we know that this is a better discount than 5%.  It does not matter how much the purchase price was.  One very common machine learning normalization is the Z-Score:

$z = \frac{x - \mu}{\sigma} $

To calculate the Z-Score you need to also calculate the mean($\mu$) and the standard deviation ($\sigma$).  The mean is calculated as follows:

$\mu = \bar{x} = \frac{x_1+x_2+\cdots +x_n}{n}$

The standard deviation is calculated as follows:

$\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \mu)^2}, {\rm \ \ where\ \ } \mu = \frac{1}{N} \sum_{i=1}^N x_i$

The following Python code replaces the mpg with a z-score.  Cars with average MPG will be near zero, above zero is above average, and below zero is below average.  Z-Scores above 3 or below -3 are very rare; these are outliers.

In [9]:
import os
import pandas as pd
from scipy.stats import zscore

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Replace the raw mpg values with their z-scores.
mpg_zscores = zscore(df['mpg'])
df['mpg'] = mpg_zscores
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,-0.706439,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,-1.090751,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,-0.706439,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,-0.962647,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,-0.834543,8,302.0,140.0,3449,10.5,70,1,ford torino
5,-1.090751,8,429.0,198.0,4341,10.0,70,1,ford galaxie 500
6,-1.218855,8,454.0,220.0,4354,9.0,70,1,chevrolet impala
7,-1.218855,8,440.0,215.0,4312,8.5,70,1,plymouth fury iii
8,-1.218855,8,455.0,225.0,4425,10.0,70,1,pontiac catalina
9,-1.090751,8,390.0,190.0,3850,8.5,70,1,amc ambassador dpl


## Missing Values

Missing values are a reality of machine learning.  Ideally every row of data will have values for all columns.  However, this is rarely the case.  Most of the values are present in the MPG database.  However, there are missing values in the horsepower column.  A common practice is to replace missing values with the median value for that column.  The median is calculated as described [here](https://www.mathsisfun.com/median.html).  The following code replaces any NA values in horsepower with the median:

In [10]:
import os
import pandas as pd

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Fill the gaps in horsepower with the column's median.
horsepower = df['horsepower']
med = horsepower.median()
df['horsepower'] = horsepower.fillna(med)
# df = df.dropna() # you can also simply drop NA values

# Confirm that no missing values remain in the column.
print(f"horsepower has na? {df['horsepower'].isnull().values.any()}")

horsepower has na? False


## Concatenating Rows and Columns
Rows and columns can be concatenated together to form new data frames.

In [11]:
# Create a new dataframe from name and horsepower

import os
import pandas as pd

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Pick the two columns of interest ...
col_name = df['name']
col_horsepower = df['horsepower']

# ... and place them side by side (axis=1 concatenates columns).
result = pd.concat([col_name, col_horsepower], axis=1)
result

Unnamed: 0,name,horsepower
0,chevrolet chevelle malibu,130.0
1,buick skylark 320,165.0
2,plymouth satellite,150.0
3,amc rebel sst,150.0
4,ford torino,140.0
5,ford galaxie 500,198.0
6,chevrolet impala,220.0
7,plymouth fury iii,215.0
8,pontiac catalina,225.0
9,amc ambassador dpl,190.0


In [12]:
import os
import pandas as pd
import numpy as np

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Usually a good idea to shuffle
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# Draw one uniform random number per row; rows whose number is below 0.8
# go to the training set, the remaining rows to the validation set.
mask = np.random.rand(len(df)) < 0.8
trainDF = pd.DataFrame(df[mask])
validationDF = pd.DataFrame(df[~mask])

for label, frame in (("Training", trainDF), ("Validation", validationDF)):
    print(f"{label} DF: {len(frame)}")

Training DF: 321
Validation DF: 77


## Training and Validation

It is very important that we evaluate a machine learning model based on its ability to predict data that it has never seen before.  Because of this we often divide the training data into a validation and training set.  The machine learning model will learn from the training data, but ultimately be evaluated based on the validation data.

* **Training Data** - **In Sample Data** - The data that the machine learning model was fit to/created from. 
* **Validation Data** - **Out of Sample Data** - The data that the machine learning model is evaluated upon after it is fit to the training data.

There are two predominant means of dealing with training and validation data:

* **Training/Validation Split** - The data are split according to some ratio between a training and validation (hold-out) set.  Common ratios are 80% training and 20% validation.
* **K-Fold Cross Validation** - The data are split into a number of folds, and one model is trained per fold.  Because every fold serves as the validation set for exactly one of these models, out-of-sample predictions can be generated for the entire dataset.

### Training/Validation Split

The code below performs a split of the MPG data into a training and validation set.  The training set uses roughly 80% of the data and the validation set roughly 20%; because each row is assigned at random, the exact counts vary from run to run.

The following image shows how a model is trained on 80% of the data and then validated against the remaining 20%.

![Training and Validation](https://raw.githubusercontent.com/jeffheaton/t81_558_deep_learning/master/images/class_1_train_val.png "Training and Validation")


In [13]:
import os
import pandas as pd
import numpy as np

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Usually a good idea to shuffle
row_order = np.random.permutation(df.index)
df = df.reindex(row_order)

# Each row independently has an 80% chance of landing in the training set;
# rows that are not selected form the validation set.
mask = np.random.rand(len(df)) < 0.8
trainDF = pd.DataFrame(df[mask])
validationDF = pd.DataFrame(df[~mask])

training_size = len(trainDF)
validation_size = len(validationDF)
print(f"Training DF: {training_size}")
print(f"Validation DF: {validation_size}")

Training DF: 307
Validation DF: 91


### K-Fold Cross Validation

There are several types of cross validation; however, k-fold is the most common.  The value K specifies the number of folds.  The two most common values for K are either 5 or 10.  For this course we will always use a K value of 5, or a 5-fold cross validation. A 5-fold validation is illustrated by the following diagram:

![K-Fold Crossvalidation](https://raw.githubusercontent.com/jeffheaton/t81_558_deep_learning/master/images/class_1_kfold.png "K-Fold Crossvalidation")

First, the data are split into 5 equal (or close to, due to rounding) folds.  These folds are used to generate 5 training/validation set combinations.  Each of the folds becomes the validation set once, and the remaining folds become the training sets.  This allows the validated results to be appended together to produce a final out-of-sample prediction for the entire dataset.  


The following code demonstrates a 5-fold cross validation:

In [14]:
import os
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

path = "./data/"

filename_read = os.path.join(path, "auto-mpg.csv")
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Shuffle the rows before splitting them into folds.
df = df.reindex(np.random.permutation(df.index))
kf = KFold(n_splits=5)

# kf.split yields one (training positions, validation positions) pair per
# fold; enumerate numbers the folds starting at 1.
for fold, (train_index, validate_index) in enumerate(kf.split(df), start=1):
    trainDF = pd.DataFrame(df.iloc[train_index])
    validateDF = pd.DataFrame(df.iloc[validate_index])
    print(f"Fold #{fold}, Training Size: {len(trainDF)}, Validation Size: {len(validateDF)}")


Fold #1, Training Size: 318, Validation Size: 80
Fold #2, Training Size: 318, Validation Size: 80
Fold #3, Training Size: 318, Validation Size: 80
Fold #4, Training Size: 319, Validation Size: 79
Fold #5, Training Size: 319, Validation Size: 79


# Accessing Files Directly
It is possible to access files directly, rather than using Pandas.  For class assignments you should use Pandas; however, direct access is possible.  Using the CSV package, you can read the files in, line-by-line and process them.  Accessing a file line-by-line can allow you to process very large files that would not fit into memory.  For the purposes of this class, all files will fit into memory, and you should use Pandas for all class assignments.  

In [15]:
# Read a raw text file (avoid this)
import codecs
import os

path = "./data"

# Always specify your encoding! There is no such thing as "it's just a text file".
# See... http://www.joelonsoftware.com/articles/Unicode.html
# Also see... http://www.utf8everywhere.org/
encoding = 'utf-8'
filename = os.path.join(path, "auto-mpg.csv")

c = 0

with codecs.open(filename, "r", encoding) as fh:
    # Iterate over the file line by line; c counts the lines read so far.
    for c, line in enumerate(fh, start=1):
        # Only the first 5 lines
        if c > 5:
            break
        print(line.strip())

mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
18,8,307,130,3504,12,70,1,chevrolet chevelle malibu
15,8,350,165,3693,11.5,70,1,buick skylark 320
18,8,318,150,3436,11,70,1,plymouth satellite
16,8,304,150,3433,12,70,1,amc rebel sst


In [16]:
# Read a CSV file
import codecs
import os
import csv

encoding = 'utf-8'
path = "./data/"
filename = os.path.join(path, "auto-mpg.csv")

c = 0

with codecs.open(filename, "r", encoding) as fh:
    reader = csv.reader(fh)
    # Each row arrives as a list of strings; c counts the rows read so far.
    for c, row in enumerate(reader, start=1):
        # Stop after the first 5 rows (the header row counts as one).
        if c > 5:
            break
        print(row)

['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']
['18', '8', '307', '130', '3504', '12', '70', '1', 'chevrolet chevelle malibu']
['15', '8', '350', '165', '3693', '11.5', '70', '1', 'buick skylark 320']
['18', '8', '318', '150', '3436', '11', '70', '1', 'plymouth satellite']
['16', '8', '304', '150', '3433', '12', '70', '1', 'amc rebel sst']


In [17]:
# Read a CSV, symbolic headers
import codecs
import os
import csv

path = "./data"

encoding = 'utf-8'
filename = os.path.join(path, "auto-mpg.csv")

c = 0

with codecs.open(filename, "r", encoding) as fh:
    reader = csv.reader(fh)

    # The first row holds the column names.  Map each name to its position
    # so that fields can be looked up symbolically.
    # Comprehension is cool, but not necessarily a beginners feature of Python.
    header_idx = {column: position
                  for position, column in enumerate(next(reader))}
    name_position = header_idx['name']

    # The header has been consumed already, so these are the first 5 cars.
    for c, row in enumerate(reader, start=1):
        if c > 5:
            break
        print(f"Car Name: {row[name_position]}")

Car Name: chevrolet chevelle malibu
Car Name: buick skylark 320
Car Name: plymouth satellite
Car Name: amc rebel sst
Car Name: ford torino


In [18]:
# Read a CSV, manual stats
import codecs
import os
import csv
import math

path = "./data/"

encoding = 'utf-8'
filename_read = os.path.join(path, "auto-mpg.csv")

with codecs.open(filename_read, "r", encoding) as fh:
    reader = csv.reader(fh)

    # The first row holds the column names; map each name to its position.
    header_idx = {key: position for position, key in enumerate(next(reader))}
    headers = list(header_idx)

    # One accumulator per column.
    fields = {key: {'count': 0, 'sum': 0, 'variance': 0} for key in headers}

    # Pass 1, means
    row_count = 0
    for row in reader:
        row_count += 1
        for name in headers:
            try:
                value = float(row[header_idx[name]])
            except ValueError:
                # Missing or non-numeric entry: leave it out of the stats.
                continue
            field = fields[name]
            field['count'] += 1
            field['sum'] += value

    # Calculate means, toss sums (part of pass 1)
    for field in fields.values():
        # If 90% are not missing (or non-numeric) calculate a mean.  The
        # row_count test avoids a division by zero for a header-only file.
        if row_count and (field['count'] / row_count) > 0.9:
            field['mean'] = field['sum'] / field['count']
            del field['sum']

    # Pass 2, standard deviation & variance.  Rewind the file, start a fresh
    # reader and skip the header row explicitly, so that column names are
    # never treated as data.
    fh.seek(0)
    reader = csv.reader(fh)
    next(reader)
    for row in reader:
        for name in headers:
            try:
                value = float(row[header_idx[name]])
            except ValueError:
                continue
            field = fields[name]
            # If we failed to calculate a mean, no variance.
            if 'mean' in field:
                field['variance'] += (value - field['mean'])**2

    # Calculate standard deviation, keep variance (part of pass 2)
    for field in fields.values():
        # If no variance, then no standard deviation
        if 'mean' in field:
            # The sum of squared deviations is divided by the number of
            # values, matching the 1/N formula used earlier in this module.
            field['variance'] /= field['count']
            field['sdev'] = math.sqrt(field['variance'])
        else:
            del field['variance']

    # Print summary stats
    for key in sorted(fields):
        print(f"{key}:{fields[key]}")

acceleration:{'count': 398, 'variance': 7.585740574732961, 'mean': 15.568090452261291, 'sdev': 2.7542223175940177}
cylinders:{'count': 398, 'variance': 2.8861455518799946, 'mean': 5.454773869346734, 'sdev': 1.698865960539558}
displacement:{'count': 398, 'variance': 10844.882068950259, 'mean': 193.42587939698493, 'sdev': 104.13876352708563}
horsepower:{'count': 392, 'variance': 1477.7898792169979, 'mean': 104.46938775510205, 'sdev': 38.442032714425984}
mpg:{'count': 398, 'variance': 60.93611928991693, 'mean': 23.514572864321615, 'sdev': 7.806159061274433}
name:{'count': 0, 'sum': 0}
origin:{'count': 398, 'variance': 0.6416757152597181, 'mean': 1.5728643216080402, 'sdev': 0.801046637381194}
weight:{'count': 398, 'variance': 715339.1287404363, 'mean': 2970.424623115578, 'sdev': 845.7772335198177}
year:{'count': 398, 'variance': 13.638089947223559, 'mean': 76.01005025125629, 'sdev': 3.6929784655780975}


# Module 2 Assignment

You can find the assignment for this module here: [assignment 2](https://github.com/jeffheaton/t81_558_deep_learning/blob/master/assignments/assignment_yourname_class2.ipynb)