# Basic TensorFlow

About

Intro

In [1]:
import io
import os
import pickle
from urllib.request import urlretrieve

from zipfile import ZipFile

import numpy as np
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

print('All modules imported.')

All modules imported.


Download the notMNIST training and test datasets

In [2]:
def download(url, file):
    """
    Download <url> to the local path <file> unless <file> already exists.

    The data is first written to '<file>.part' and only renamed to <file>
    once the transfer has completed. An interrupted or failed download
    therefore never leaves a truncated <file> behind that a later run would
    mistake for a finished download.

    :param url: Location to download from
    :param file: Local path to store the download at
    :raises: Whatever urlretrieve raises for an invalid URL or failed transfer
    """
    if os.path.isfile(file):
        return

    print('Downloading ' + file + '...')
    partial_file = file + '.part'
    try:
        urlretrieve(url, partial_file)
        # Rename into place only after the whole file has arrived
        os.replace(partial_file, file)
    finally:
        # Still present only if the transfer or the rename failed
        if os.path.isfile(partial_file):
            os.remove(partial_file)
    print('Download Finished')

# ToDo: Add URL
# Fetch both notMNIST archives: the training set first, then the test set
download(url='', file='notMNIST_train.zip')
download(url='', file='notMNIST_test.zip')

print('All files downloaded.')

All files downloaded.


Get the features and labels from the zip files

In [3]:
def uncompress_features_labels(file):
    """
    Read every image stored in a zip archive into flat feature vectors.

    :param file: Path of the zip archive to read
    :return: Tuple (features, labels) of numpy arrays holding, for each
        non-directory archive entry, the flattened image data and the first
        character of the entry's file name
    """
    features, labels = [], []

    with ZipFile(file) as archive:
        for entry in tqdm(archive.namelist(), unit='files'):
            # Entries ending in '/' are directories, not images
            if entry.endswith('/'):
                continue

            with archive.open(entry) as image_file:
                image = Image.open(image_file)
                # Read the pixel data while the archive entry is still open
                image.load()
                # Collapse the image into a 1 dimensional array
                pixels = np.array(image).flatten()

            # The letter is the first character of the entry's base name
            letter = os.path.basename(entry)[0]

            features.append(pixels)
            labels.append(letter)

    return np.array(features), np.array(labels)

# Read the images and their letter labels out of each downloaded archive
train_features, train_labels = uncompress_features_labels(file='notMNIST_train.zip')
test_features, test_labels = uncompress_features_labels(file='notMNIST_test.zip')

print('All features and labels uncompressed.')

100%|██████████| 210001/210001 [00:34<00:00, 6100.99files/s]
100%|██████████| 10001/10001 [00:01<00:00, 6392.77files/s]

All features and labels uncompressed.





In [8]:
# Normalize the features
# Scale the image features by their maximum value and shift them by -0.5
def normalize_data(data):
    """
    Scale data by its maximum value and shift the result down by 0.5.

    Computes data / max(data) - 0.5. For non-negative data whose minimum is 0
    (e.g. 0-255 pixel intensities) the result lies in [-0.5, 0.5].

    :param data: Array-like of numbers; the input itself is not modified
    :return: New float32 numpy array with the same shape as data
    :raises ValueError: If data is empty or its maximum is 0
    """
    # Always work on a float32 copy: the caller's array stays untouched and
    # integer pixel data is not silently promoted to float64
    scaled = np.array(data, dtype=np.float32)
    peak = scaled.max()
    if peak == 0:
        raise ValueError('Cannot normalize data whose maximum value is 0')

    # In-place arithmetic avoids further full-size temporaries
    scaled /= peak
    scaled -= 0.5
    return scaled

# Normalize both feature sets
train_features = normalize_data(data=train_features)
test_features = normalize_data(data=test_features)

# Sanity checks: known inputs must map to known normalized outputs
np.testing.assert_array_almost_equal(
    normalize_data(np.arange(1, 11)),
    np.array([-0.4, -0.3, -0.2, -0.099, 0.0, 0.099, 0.199, 0.3, 0.4, 0.5]),
    decimal=3)
np.testing.assert_array_almost_equal(
    normalize_data(np.array([-100, -30, -1000, -20, -20, -10, -10, -20, -10, -10])),
    np.array([9.5, 2.5, 99.5, 1.5, 1.5, 0.5, 0.5, 1.5, 0.5, 0.5]),
    decimal=6)

print('Tests Passed!')

Tests Passed!


In [9]:
# Shuffle the training data and hold out 5% of it as a validation set.
# The fixed random_state makes the split reproducible.
train_features, valid_features, train_labels, valid_labels = train_test_split(
    train_features, train_labels, test_size=0.05, random_state=832289)

print('Training features and labels randomized and split.')

Training features and labels randomized and split.


In [8]:
# Save the data for easy access
print('Saving data to pickle file...')
pickle_file = 'notMNIST.pickle'
# An existing pickle file is treated as a valid cache and is not rewritten
if not os.path.isfile(pickle_file):
    try:
        # Write to the very path that was checked above, so the existence
        # check and the write can never refer to different files
        with open(pickle_file, 'wb') as pfile:
            pickle.dump(
                {
                    'train_dataset': train_features,
                    'train_labels': train_labels,
                    'valid_dataset': valid_features,
                    'valid_labels': valid_labels,
                    'test_dataset': test_features,
                    'test_labels': test_labels,
                },
                pfile, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        # A truncated pickle would pass the existence check on the next run
        # and be mistaken for a complete cache, so remove it
        if os.path.isfile(pickle_file):
            os.remove(pickle_file)
        raise

print('Data cached in pickle file.')

Saving data to pickle file...
File saved


In [None]:
# Todo: Create a simple example with tensorflow
# Hyperparameters of the training run below
epochs = 25            # Number of passes over the training data
learning_rate = 0.01   # Step size handed to the gradient descent optimizer
batch_size = 100       # Number of samples fed per optimization step


def logistic_classifier(input):
    """
    Build a softmax (multinomial logistic) classifier on top of `input`.

    Creates a zero-initialized [784, 10] weight variable and a
    zero-initialized [10] bias variable.

    :param input: Tensor that is matrix-multiplied with the [784, 10] weights
    :return: Tensor holding softmax(input * weights + biases)
    """
    weights = tf.Variable(tf.zeros([784, 10]))
    biases = tf.Variable(tf.zeros([10]))

    # Linear scores XW + b, turned into class probabilities by the softmax
    return tf.nn.softmax(tf.matmul(input, weights) + biases)

# The input is the images of letters. The images are 28 by 28 pixels, so each
# flattened image holds 28 * 28 = 784 values.
x = tf.placeholder(tf.float32, [None, 784])
# The labels are A-J, which make a total of 10 classes
y = tf.placeholder(tf.float32, [None, 10])


def one_hot_encode(labels, classes):
    """
    Turn a 1-D array of class labels into a float32 one-hot matrix.

    Column i of the result is 1.0 where the label equals classes[i] and 0.0
    elsewhere. Labels that are already 2-D are assumed to be one-hot encoded
    and are only cast to float32.
    """
    labels = np.asarray(labels)
    if labels.ndim == 2:
        return labels.astype(np.float32)
    return (labels[:, None] == np.asarray(classes)[None, :]).astype(np.float32)


# `y` expects one float column per class, but the labels read from the
# archives are single letters, so they are one-hot encoded before being fed.
# The encoded labels get their own names so that re-running this cell never
# encodes the labels twice.
label_classes = np.unique(train_labels)
train_labels_one_hot = one_hot_encode(train_labels, label_classes)
test_labels_one_hot = one_hot_encode(test_labels, label_classes)

prediction = logistic_classifier(x)

# Cross entropy. The probabilities are clipped away from 0 because log(0)
# would turn the loss, and with it every gradient, into NaN.
cross_entropy = -tf.reduce_sum(
    y * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)),
    reduction_indices=1)

# Training loss
cost = tf.reduce_mean(cross_entropy)

# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Create an operation that initializes all variables
init = tf.initialize_all_variables()

# Number of batches per epoch, rounded up so a final partial batch is still
# used. Ceiling division on integers needs no import, and the same count
# drives both the loop and the cost average.
batch_count = -(-len(train_features) // batch_size)

with tf.Session() as sess:
    sess.run(init)

    # The training cycle
    for epoch in range(epochs):
        avg_cost = 0.

        for batch_i in range(batch_count):
            front_batch = batch_i * batch_size
            feature_batch = train_features[front_batch: front_batch + batch_size]
            label_batch = train_labels_one_hot[front_batch: front_batch + batch_size]

            # Run optimization, then run the cost operation
            _, c = sess.run(
                [optimizer, cost],
                feed_dict={x: feature_batch, y: label_batch})

            avg_cost += c / batch_count
        print('Epoch: {:>2} Cost: {}'.format(epoch, avg_cost))

    # A prediction is correct when the most probable class is the labeled one
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))

    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy:", accuracy.eval({x: test_features, y: test_labels_one_hot}))