In [None]:
# Classification exercise: predict ocean_proximity from the housing.txt dataset

In [None]:
#1a) Input file
import pandas as pd

# Raw string literal: the Windows path contains backslash sequences (\M, \C, \h) that are
# not valid escapes. A plain literal only works because Python currently passes unknown
# escapes through unchanged, and it emits a warning for them on recent versions.
path = r'G:\My Drive\CST461 Audit\housing.txt' #set the path to the datafile

# The file is read with header=None, so the column names are supplied explicitly here.
column_names = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]
data = pd.read_csv(path, header=None, names=column_names) #input the data into a dataframe

In [None]:
#1b) separate features and labels
import numpy as np

# Discard every row that has a missing value (NaN/None) in any column.
data = data.dropna(axis="index")
# Column count, kept so the feature slice below can be written relative to it.
cols = len(data.columns)
# The features are every column except the last two; convert them to a float32 numpy array.
feature_frame = data.iloc[:, :cols - 2]
features = np.array(feature_frame, dtype=np.float32)
# Number of samples left after dropping incomplete rows, reused by later cells.
rows = len(features)

In [None]:
#2a) convert labels to integers
# The class label of each row is its ocean_proximity string.
labels = data['ocean_proximity']
# np.unique(..., return_inverse=True) returns the sorted distinct strings together with, for
# every row, the position of its string in that sorted array -- i.e. an integer encoding.
labels_strings, labels_ints = np.unique(labels, return_inverse=True)
print(labels_strings) # distinct category names; the position of a name is its integer code

# Spot check: show the same ten positions from the integer codes and from the original strings.
spot_check = slice(905, 915)
print(labels_ints[spot_check])
print(labels[spot_check])

In [None]:
#2b) convert integer labels to categorical
from keras.utils import to_categorical

# One-hot encode the integer class codes (one column per class).
# The float32 cast is done on the returned array instead of through a dtype= keyword:
# NOTE(review): newer Keras releases appear to have dropped the dtype parameter of
# to_categorical -- confirm against the installed version. Casting afterwards yields the
# same float32 matrix either way.
categorical_labels = to_categorical(labels_ints).astype("float32")
print(categorical_labels)

In [None]:
#3) Separate data into training, validation, and test
from numpy import random
from numpy.random import default_rng

SEED = 1000 # one seed for both the shuffle and the split sizes, for reproducibility

# Shuffle the rows so that the split is not biased by the order of the file.
# The generator is seeded explicitly: np.random.seed() only seeds the legacy global
# generator and has no effect on default_rng(), so an unseeded default_rng() would
# produce a different split on every run.
rand_idx = default_rng(SEED).permutation(rows)
features = features[rand_idx] #all the features, shuffled
categorical_labels = categorical_labels[rand_idx] #all the labels, in the same shuffled order

# The amount of data that goes into each set is somewhat arbitrary, so the held-out
# fraction is itself drawn at random (reproducibly, via the seeded global generator).
np.random.seed(SEED)
small = random.uniform(0, .5) #fraction of the rows held out, between 0 and .5
# That fraction is divided evenly between the validation and test sets;
# the rest goes to training.
train_size = int((1 - small) * rows) #index where the training rows end
val_size = train_size + int(small / 2 * rows) #index where the validation rows end

# Set up the feature sets
features_train = features[:train_size] #the biggest chunk for training data
features_val = features[train_size:val_size] #half of what's left for validation
features_test = features[val_size:] #and the rest for testing
# Set up the label sets with exactly the same boundaries
categorical_labels_train = categorical_labels[:train_size]
categorical_labels_val = categorical_labels[train_size:val_size]
categorical_labels_test = categorical_labels[val_size:]

# Every row must land in exactly one of the three sets.
assert len(features_train) + len(features_val) + len(features_test) == rows
assert len(categorical_labels_train) + len(categorical_labels_val) + len(categorical_labels_test) == rows

In [None]:
#4) Convert features to float32
# Sanity check of the element types: print the type of each value in the first row,
# using that row as a proxy for the rest of each column.
for first_row_value in features[0]:
    print(type(first_row_value))
# The output shows that at least the first value in each column is a float.

In [None]:
#5) Assemble/create ANN
from keras import models
from keras import layers

# The input width and the number of output classes are read from the training arrays
# instead of being hard-coded, so the network keeps matching the data if the feature
# columns or the set of categories change. input_shape must be a tuple, hence the
# trailing comma in (num_features,).
num_features = features_train.shape[1]
num_classes = categorical_labels_train.shape[1]

# The number of hidden layers and their sizes are a free design choice here.
model = models.Sequential()
#input dense layer
model.add(layers.Dense(33, activation='tanh', input_shape=(num_features,)))
#hidden layer 2
model.add(layers.Dense(22, activation='relu'))
#hidden layer 3
model.add(layers.Dense(15, activation='tanh'))
#hidden layer 4
model.add(layers.Dense(10, activation='relu'))
#output softmax layer: one unit per possible classification
model.add(layers.Dense(num_classes, activation='softmax'))

In [None]:
#6) Compile network.
# Training configuration: the rmsprop optimizer, categorical cross-entropy as the loss,
# and accuracy as the reported metric.
compile_options = {
    'optimizer': 'rmsprop',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
#Let's see a summary
# model.summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the table.
model.summary()

In [None]:
#7) fit network.
    #decide batch size
    #num epochs
# randint(n) draws from 0..n-1 (the upper bound is exclusive), so the batch size
# ends up in 128..1023 and the number of epochs in 5..9.
batch_low, batch_high = 128, 1024
size_batch = batch_low + random.randint(batch_high - batch_low)
print("batch size = ", size_batch)
epochs_low, epochs_span = 5, 5
num_epochs = epochs_low + random.randint(epochs_span)
print("num epochs = ", num_epochs)
# Train on the training split and evaluate on the validation split after every epoch.
history = model.fit(
    features_train,
    categorical_labels_train,
    batch_size=size_batch,
    epochs=num_epochs,
    validation_data=(features_val, categorical_labels_val),
)

In [None]:
#8) Plot loss and accuracy
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit, keyed by metric name.
history_dict = history.history
acc, val_acc, loss, val_loss = (
    history_dict[metric_name]
    for metric_name in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training), 'b' draws a solid blue line (validation).
loss_curves = (
    (loss, 'bo', 'Training loss'),
    (val_loss, 'b', 'Validation loss'),
)
for curve_values, curve_format, curve_label in loss_curves:
    plt.plot(epochs, curve_values, curve_format, label=curve_label)
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.clf() # clear the current figure so the next plot starts clean

# Final evaluation on the held-out test split.
test_loss, test_acc = model.evaluate(features_test, categorical_labels_test)
print('test_loss:', test_loss)

In [None]:
# Accuracy per epoch: 'ro' draws red dots (training), 'r' draws a solid red line (validation).
accuracy_curves = (
    (acc, 'ro', 'Training Accuracy'),
    (val_acc, 'r', 'Validation Accuracy'),
)
for curve_values, curve_format, curve_label in accuracy_curves:
    plt.plot(epochs, curve_values, curve_format, label=curve_label)
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
plt.clf() # clear the current figure after it has been shown
# Accuracy on the held-out test split, computed in the previous cell.
print('test_acc:', test_acc)