In [1]:
#############################################Ethan L. Mines##############################
#############################################VandyHacks IV: October 20-22###############
import numpy as np
import cntk as C

####################################Hyperparameters#####################################
numType = np.float32
numLayers = 1
learning_rate = 0.1
training_ratio = 3 / 4
learnerFunction = C.sgd
mb_size = 25

np.random.seed(0)

In [None]:
def shuffle_arrays(*arrays):
    rand_state = np.random.get_state()
    for array in arrays:
        np.random.set_state(rand_state)
        np.random.shuffle(array)

In [6]:
def classifier(output_dim, input_var, num_layers):
    z = Dense(output_dim)(input_var) #FIXME: Change initial variable name
    for i in range(num_layers - 1):
        z = Dense(output_dim)(z)
    return z

In [2]:
def my_trainer(z, labels):    
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = learnerFunction(z.parameters, lr_schedule)
    loss = C.cross_entropy_with_softmax(z, labels)
    eval_error = c.classification_error(z, labels)
    trainer = C.Trainer(z, (loss, eval_error), [learner])
    return trainer

In [None]:
#######################Modified from Microsoft CNTK101 Tutorial#####################
def print_progress(trainer, mb_num, frequency, display = True):
    loss = "NA"
    if mb_num % frequency == 0:
        loss = trainer.previous_minibatch_loss_average
        eval_error = trainer.previous_minibatch_evaluation_average
        if display:
            print("Minibatch {0:04n}: Average Loss = {1:.4f}, Average Error = {2:.4f}".format(mb_num, loss, eval_error))
    return mb_num, loss, eval_error

In [None]:
###########################Modified from Microsoft CNTK101 Tutorial#################
from collections import defaultdict

def train_model(trainer, training_size, mb_size, features, labels, input_var, label_var):
    training_plot_data = defaultdict(list)
    progress_output_freq = 1000
    training_features = features[:training_size]
    training_labels = labels[:training_size]
    
    num_training_mbs = training_size // mb_size
    
    for i in range(num_training_mbs):
        start_index = i * training_minibatch_size
        end_index = start_index + training_minibatch_size
        feature_batch = training_features[start_index : end_index]
        label_batch = training_labels[start_index : end_index]
        trainer.train_minibatch({input_var : feature_batch, label_var : label_batch})
    
        mb_num, loss, error = print_progress(trainer, i, progress_output_freq)
        if loss != "NA":
            training_plotdata["Batch Number"].append(mb_num)
            training_plotdata["Loss"].append(loss)
            training_plotdata["Error"].append(error)
            
    return training_plotdata

def test_model(trainer, training_size, mb_size, features, labels, input_var, label_var):
    testing_plotdata = defaultdict(list)
    testing_features = features[training_size:]
    testing_labels = labels[training_size:]
    
    num_testing_mbs = len(features) - training_size
    for i in range(num_testing_mbs):
        start_index = i * mb_size
        end_index = start_index + mb_size
        feature_batch = testing_features[start_index : end_index]
        label_batch = testing_labels[start_index : end_index]
    
        eval_error = trainer.test_minibatch({input_var : feature_batch, label_var : label_batch})
    
        if eval_error != "NA":
            testing_plotdata["Batch Number"].append(i)
            testing_plotdata["Error"].append(eval_error)
            
    return testing_plotdata

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def graphCriteria(training_plotdata, testing_plotdata):

    training_plotdata["Average Loss"] = moving_average(training_plotdata["Loss"])
    training_plotdata["Average Error"] = moving_average(training_plotdata["Error"])

    testing_plotdata["Average Error"] = moving_average(testing_plotdata["Error"])

    plt.plot(training_plotdata["Batch Number"], training_plotdata["Average Loss"])
    plt.title("Cumulative Average Loss")
    plt.xlabel("Minibatch Number")
    plt.ylabel("Loss")
    plt.show()

    print("Average Training Loss = {0:.2f}".format(training_plotdata["Average Loss"][-1]))


    plt.hist(testing_plotdata["Loss"])
    plt.title("Testing Error")
    plt.xlabel("Minibatch Average Error")
    plt.ylabel("Frequency")
    plt.show()
    
    print("Average Testing Loss = {0:.2f}".format(testing_plotdata["Average Loss"][-1]))

In [None]:
def moving_average(loss_list):
    averages = np.cumsum(np.asarray(loss_list))
    for i in range(len(averages)):
        averages[i] = averages[i] / (i + 1)
    return averages

#Expects number of possible categories "num_categories" and category index "category" in range 0...num_categories-1
def toOneHot(column, indexDict):
    oneHot = np.zeros( (len(column), len(indexDict)), dtype = numType)
    
    for i in range(len(column))
        oneHot[i][ indexDict[column[i]] ] = 1
    
    return oneHot

#############Returns str->int dictionary where the int's are indices for one-hot vectors of categories in column
def extractCategories(column, categPath):
    dictionary = {}
    i = 0
    with open(categPath, "w") as categDict:
        for element in column:
            if element not in dictionary:
                dictionary[element] = i
                i += 1
                categDict.write(element + "\n")
    return dictionary

#One-hot encodes all the CSV data
#Also writes simple text files that can than be used to reconstruct the index dictionaries
def process_csv(source, model_prefix):
    data = np.genfromtxt(source, delimiter = ",", dtype = str)
    dictDir = "dicts/"
    
    #print(data[:5, 1])
    
    column_indices = {"month": 1,
                     "hour": 2,
                     "weekday": 3,
                     "zip_code": 4,
                     "crime": 7
                     }
    
    months = data[:, column_indices["month"]]
    hours = data[:, column_indices["hour"]]
    weekdays = data[:, column_indices["weekday"]]
    zips = data[:, column_indices["zip_code"]]
    crimes = data[:, column_indices["crime"]]
    
    monthsDict = extractCategories(months, dictDir + prefix + "_month_dict")
    hoursDict = extractCategories(hours, dictDir + prefix + "_hours_dict")
    weekdaysDict = extractCategories(weekdays, dictDir + prefix + "_weekdays_dict")
    zipsDict = extractCategories(zips, dictDir + prefix + "_zips_dict")
    crimesDict = extractCategories(crimes, dictDir + prefix + "_crimes_dict")
    
    months = toOneHot(months, monthsDict)
    hours = toOneHot(hours, hoursDict)
    weekdays toOneHot(weekdays, weekdaysDict)
    zips = toOneHot(zips, zipsDict)
    crimes = toOneHot(crimes, crimesDict)
    
    shuffle_arrays(months, hours, weekdays, zips, crimes)
    features = np.column_stack(months, hours, weekdays)
    
    return features, zips, crimes
    

In [14]:

def gen_models(source, model_prefix):

    features, zips, crimes = process_csv(source, model_prefix)
    
    #Model variables
    input = C.input_variable(num_features, dtype = numType)    
    zip_label = C.input_variable(num_zips, dtype = numType)
    crime_label = C.input_variable(num_crimes, dtype = numType)
    
    #Actual models
    zone_classifier = classifier(zip_label.shape[0], training_data, num_layers)
    crime_classifier = classifier(crime_label.shape[0], training_data, num_layers)
    
    zip_trainer = my_trainer(num_zone_classes, zone_classifier)
    crime_trainer = my_trainer(num_crime_classes, crime_classifier)
    
    num_samples = len(zips)
    num_training = int(num_samples * training_ratio)
    
    train_model(zip_trainer, num_training, mb_size, features, zips, input, zip_label)
    train_model(crime_trainer, num_training, mb_size, features, crimes, input, crime_label)
    
    test_model(zip_trainer, num_training, mb_size, features, zips, input, zip_label)
    test_model(crime_trainer, num_training, mb_size, features, crimes, input, crime_label)
    
    return zone_classifier, crime_classifier
    
gen_models("Dataset/cleanedArizona.csv")

['5' '5' '5' '5' '5']
['5' '5' '5' '5' '5' '5' '5' '5' '5' '5' '5' '5' '5' '5' '5' '6' '6' '6'
 '6' '6']


In [None]:
def write_model_files(source, model_prefix):

    zone_model, crime_model = gen_models(source)
    zone_model_path = model_prefix + "_zones.cmf"
    crime_model_path = model_prefix + "_crimes.cmf"
    
    zone_model.save(zone_model_path)
    print("Wrote " + zone_model_path)
    crime_model.save(crime_model_path)
    print("Wrote " + crime_model_path)
    


In [None]:
dataPath = "Dataset"
datafiles = ["cleanedArizona.csv"]
model_prefixes =["arizona"] # For naming saved model files

for i in len(datafiles):
    write_model_files(dataPath + "/" + datafiles[i], model_prefixes[i])