!!!! IMPORTANT NOTE !!!!
Phase 1 and the first 3 code cells of Phase 2 must be run before running Phase 3, because those cells import the data and normalize it. It is not necessary to run the rest of Phase 2 in order to run Phase 3.

# PHASE I




In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
import tensorflow as tf
from google.colab import files
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from keras.optimizers import SGD
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from keras.utils.vis_utils import plot_model



Bringing in the data file to assign as the dataset

---



In [None]:
# Alternative (disabled): upload a local copy of the CSV through Colab.
#file_name = 'adult1.csv'
#files.upload()

# Load the CSV straight from the project's GitHub repository into `dataset`.
url = 'https://raw.githubusercontent.com/pango3001/AI/main/project/adult1.csv'

dataset = pd.read_csv(url, sep=',')

Making sure our data has the right height and width

In [None]:
# Report the table dimensions so they can be checked against expectations.
n_rows, n_cols = dataset.shape
print("Rows: %s" % n_rows)
print("Columns: %s" % n_cols)

# Preview the first 10 rows.
dataset.head(10)

In [None]:
# Default figure size for this and every later figure that does not set its own.
plt.rcParams["figure.figsize"] = (13, 13)
# One histogram per column of the raw dataset.
dataset.hist()
plt.show()

In [None]:
# Summary statistics for the columns of the raw dataset.
dataset.describe()

In [None]:
# Columns to plot and the colour used for each one (paired by position).
columns = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'country', 'earnings']
c_length  = len(columns)
c_colors  = ["blue", "magenta", "yellow", "red", "cyan", "orange", "black", "blue", "magenta", "yellow", "red", "cyan", "orange", "black"]

import warnings
# NOTE: this silences every warning for the rest of the session, not only the
# ones raised by the plotting calls below.
warnings.filterwarnings("ignore")

plt.figure(figsize=(10, 25))
for idx, (col, colour) in enumerate(zip(columns, c_colors)):
    # 6x3 grid; subplot positions are 1-based.
    plt.subplot(6, 3, idx + 1)
    sns.distplot(dataset[col], color = colour)
    plt.title(col) # Title of column represented
    plt.xlabel("") # clears the x axis label

    col_mean = dataset[col].mean()
    col_std = dataset[col].std()
    plt.axvline(col_mean, color = "red", linestyle="dashed", label="Mean")
    # The standard deviation is a spread, not a position on the x axis, so mark
    # one standard deviation either side of the mean instead of drawing a line
    # at x = std. Only one of the two lines carries the legend label.
    plt.axvline(col_mean - col_std, color = "black", linestyle="dotted", label="Mean +/- Std Dev.")
    plt.axvline(col_mean + col_std, color = "black", linestyle="dotted")
    plt.axvline(dataset[col].median(), color = "green", linestyle="dashdot", label="Median")
    plt.legend(loc="upper right")

# Vertical spacing applies to the whole figure, so set it once after the grid is built.
plt.subplots_adjust(hspace = 0.3)

In [None]:
# Distribution of the earnings label: number of rows per earnings value.
plt.figure(figsize=(17, 12))
# Pass the column explicitly as x=. Newer seaborn releases take `data` as the
# first positional parameter, so a positional Series is no longer read as x.
axs = sns.countplot(x=dataset['earnings'], palette='magma')
plt.title('Distribution of Earnings', fontsize=12)
plt.xlabel('Earnings', fontsize=12)
plt.ylabel('Amount', fontsize=12)

Next I will create a correlation map

Honestly I may change my dataset after looking at this, I would like to see more variation

In [None]:
# Pairwise correlation between the dataset's columns, drawn as an annotated heatmap.
corr_matrix = dataset.corr()

plt.figure(figsize=(15, 13))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="hot",
    linewidths=0.1,
    linecolor="black",
)
plt.title("Correlation Map", fontsize=20)
plt.tight_layout()
plt.show()

# Honestly I may change my dataset after looking at this, I would like to see more variation

In [None]:
from sklearn import preprocessing

# Rescale every column to the [0, 1] range with a min-max scaler.
min_max_scaler = preprocessing.MinMaxScaler()
x = dataset.values  # raw values as a numpy array
x_scaled = min_max_scaler.fit_transform(x)

# The scaler returns a bare array, so re-attach the original column names.
normalized_df = pd.DataFrame(data=x_scaled, columns=dataset.columns)

# Show the first 10 rows to confirm the values are normalized.
normalized_df.head(10)


In [None]:
# One histogram per column of the min-max normalized data.
normalized_df.hist()
plt.show()

# Illustrating the Neural Networks


In [None]:
def draw_neural_net(ax, left, right, bottom, top, layer_sizes):
    """Draw a fully connected layered network diagram on ``ax``.

    Each layer is a vertical column of circles and every node of one layer is
    joined by a straight line to every node of the next layer.

    Args:
        ax: Matplotlib axes the circles and lines are added to.
        left: x position of the first layer's column.
        right: x position of the last layer's column.
        bottom: Lower bound of the vertical extent used for spacing.
        top: Upper bound of the vertical extent used for spacing.
        layer_sizes: Number of nodes in each layer, first layer first.

    Raises:
        ValueError: If ``layer_sizes`` is empty.
    """
    #source: https://gist.github.com/craffel/2d727968c3aaebd10359
    n_layers = len(layer_sizes)
    if n_layers == 0:
        raise ValueError("layer_sizes must contain at least one layer")

    # Vertical spacing is set by the largest layer so every layer fits.
    v_spacing = (top - bottom)/float(max(layer_sizes))
    # A single layer has no gaps between columns; dividing by max(..., 1)
    # avoids a ZeroDivisionError and places that layer at `left`.
    h_spacing = (right - left)/float(max(n_layers - 1, 1))

    # y coordinate of the top node of each layer, so layers are vertically centred.
    y_mid = (top + bottom)/2.
    layer_tops = [v_spacing*(layer_size - 1)/2. + y_mid for layer_size in layer_sizes]

    # Nodes
    for n, layer_size in enumerate(layer_sizes):
        for m in range(layer_size):
            # zorder=4 keeps the white circles drawn over the edge lines.
            circle = plt.Circle((n*h_spacing + left, layer_tops[n] - m*v_spacing), v_spacing/4.,
                                color='w', ec='k', zorder=4)
            ax.add_artist(circle)
    # Edges
    for n, (layer_size_a, layer_size_b) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        layer_top_a = layer_tops[n]
        layer_top_b = layer_tops[n + 1]
        for m in range(layer_size_a):
            for o in range(layer_size_b):
                line = plt.Line2D([n*h_spacing + left, (n + 1)*h_spacing + left],
                                  [layer_top_a - m*v_spacing, layer_top_b - o*v_spacing], c='k')
                ax.add_artist(line)

In [None]:
# Diagram titled "Base Model".
# NOTE(review): the drawing wires 13 inputs straight to 1 output, while
# model_base also has a 12-unit hidden layer - confirm which is intended.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_axis_off()
ax.set_title("Base Model")
draw_neural_net(ax, left=.1, right=.9, bottom=.1, top=.9, layer_sizes=[13, 1])

In [None]:
# One diagram shared by the middle-layer, linear-last, all-linear and all-sigmoid models.
# NOTE(review): drawn as 13-6-1, while those models are built as 13-12-6-1 - confirm.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_axis_off()
ax.set_title("Middle, Linear on lastlayer, Linear on all layers, Sigmoid on all layers")
draw_neural_net(ax, left=.1, right=.9, bottom=.1, top=.9, layer_sizes=[13, 6, 1])

In [None]:
# Diagram for the deep model.
# NOTE(review): drawn as 13-10-8-6-4-1, while model_mult_layers also starts
# with a 12-unit layer (five hidden layers in total) - confirm.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_axis_off()
ax.set_title("Multiple layers (4 hidden)")
draw_neural_net(ax, left=.1, right=.9, bottom=.1, top=.9, layer_sizes=[13, 10, 8, 6, 4, 1])

In [None]:
# Diagram for the deliberately oversized model.
# NOTE(review): drawn as 130-60-1, while model_overfitted uses Dense layers of
# 120, 60 and 1 units on the same input features - confirm.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_axis_off()
ax.set_title("OverFitted Model")
draw_neural_net(ax, left=.1, right=.9, bottom=.1, top=.9, layer_sizes=[130, 60, 1])

In [None]:
# Diagram for a network with two hidden layers (6 and 4 nodes).
# NOTE(review): presumably this depicts model_4_layers, which is built as
# 13-12-6-4-1 - confirm.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_axis_off()
ax.set_title("Model with 2 hidden layers")
draw_neural_net(ax, left=.1, right=.9, bottom=.1, top=.9, layer_sizes=[13, 6, 4, 1])

# PHASE II

Splitting my data into training and validation sets

In [None]:
# Randomly reorder the rows (no seed is set, so each run gives a different
# order) and renumber the index from 0.
dataset = dataset.sample(frac=1).reset_index(drop=True)

# Numpy views of the data: the full table, the features, and the 'earnings' label.
dataset_np = dataset.to_numpy()
X = dataset.drop(columns='earnings').to_numpy()
Y = dataset['earnings'].to_numpy()

In [None]:
# Number of rows that make up the 30% validation share.
index_30percent = int(0.3 * dataset_np.shape[0])
print(index_30percent)

# The first 30% of the shuffled rows become validation data, the remaining 70% training data.
XVALID, XTRAIN = X[:index_30percent], X[index_30percent:]
YVALID, YTRAIN = Y[:index_30percent], Y[index_30percent:]

In [None]:
# Mean normalization. The statistics come from the training split only and are
# then applied unchanged to the validation split.
# NOTE: XTRAIN/XVALID are overwritten in place, so running this cell twice
# normalizes already-normalized data.
Xmin = XTRAIN.min(axis = 0)
Xmax = XTRAIN.max(axis = 0)
mean = XTRAIN.mean(axis = 0)
# A feature that is constant in the training split has Xmax == Xmin; dividing
# by that zero range would fill the column with NaN/inf, so divide by 1 there.
Xrange = np.where(Xmax == Xmin, 1, Xmax - Xmin)
XTRAIN = (XTRAIN - mean) / Xrange
XVALID = (XVALID - mean) / Xrange

# Rescaling the labels by the largest training label.
Ymax = YTRAIN.max()
YTRAIN = YTRAIN / Ymax
YVALID = YVALID / Ymax

In [None]:
# Label the normalized training features and plot one histogram per feature.
feature_names = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'country']
train_norm = pd.DataFrame(XTRAIN, columns=feature_names)
train_norm.hist(figsize=(18,16))

Next we will set up the models

In [None]:
# Setting up the models

# Every model takes one input per feature column of the training matrix.
n_features = XTRAIN.shape[1]

# Base model: one 12-unit ReLU layer feeding a single sigmoid output.
model_base = Sequential([
    Dense(units=12, input_dim=n_features, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Base model plus a second, 6-unit ReLU layer.
model_middle = Sequential([
    Dense(units=12, input_dim=n_features, activation='relu'),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Three ReLU layers (12, 6, 4) before the sigmoid output.
model_4_layers = Sequential([
    Dense(units=12, input_dim=n_features, activation='relu'),
    Dense(units=6, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Five ReLU layers (12, 10, 8, 6, 4) before the sigmoid output.
model_mult_layers = Sequential([
    Dense(units=12, input_dim=n_features, activation='relu'),
    Dense(units=10, activation='relu'),
    Dense(units=8, activation='relu'),
    Dense(units=6, activation='relu'),
    Dense(units=4, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Same shape as model_middle, but with a linear activation on the output layer.
model_linear_last = Sequential([
    Dense(units=12, input_dim=n_features, activation='relu'),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='linear'),
])

# Same shape as model_middle, with linear activations on every layer.
model_linear_all = Sequential([
    Dense(units=12, input_dim=n_features, activation='linear'),
    Dense(units=6, activation='linear'),
    Dense(units=1, activation='linear'),
])

# Same shape as model_middle, with sigmoid activations on every layer.
model_sigmoid_all = Sequential([
    Dense(units=12, input_dim=n_features, activation='sigmoid'),
    Dense(units=6, activation='sigmoid'),
    Dense(units=1, activation='sigmoid'),
])

# "Overfitted" model: much wider layers (120 and 60 units), all linear.
model_overfitted = Sequential([
    Dense(units=120, input_dim=n_features, activation='linear'),
    Dense(units=60, activation='linear'),
    Dense(units=1, activation='linear'),
])




In [None]:
# Print the layer-by-layer summary of the base model.
model_base.summary()

Now we must compile each model to prepare it for the .fit() function

In [None]:
# Compile every model with the same settings: binary cross-entropy loss,
# the Adam optimizer, and accuracy as the reported metric.
for _m in (
    model_base,
    model_middle,
    model_4_layers,
    model_mult_layers,
    model_linear_last,
    model_linear_all,
    model_sigmoid_all,
    model_overfitted,
):
    _m.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Learn the model from training set
# Every model below is trained the same way: 100 epochs, batches of 64, with the
# validation split evaluated alongside training. The returned history objects
# (history1..history8) hold the recorded training curves.
history1 = model_base.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
# Model with the extra 6-unit layer.
history2 = model_middle.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
# Model with layers 12-6-4-1.
history3 = model_4_layers.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
# Deepest model (12-10-8-6-4-1).
history4 = model_mult_layers.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
# Linear activation on the output layer only.
history5 = model_linear_last.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
# Linear activation on every layer.
history6 = model_linear_all.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
# Sigmoid activation on every layer.
history7 = model_sigmoid_all.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
# Wide 120-60-1 linear model.
history8 = model_overfitted.fit(XTRAIN, YTRAIN, epochs=100, batch_size=64, validation_data = (XVALID, YVALID),verbose=1)

In [None]:
def get_accuracy(model, x=None, y=None, label="Training set"):
    """Evaluate ``model`` and print its accuracy as a percentage.

    Args:
        model: Object with an ``evaluate(x, y)`` method whose result has the
            accuracy at index 1 (the models in this notebook are compiled with
            ``metrics=['accuracy']``, so index 0 is the loss).
        x: Features to evaluate on. Defaults to the global ``XTRAIN``.
        y: Labels to evaluate on. Defaults to the global ``YTRAIN``.
        label: Name of the data split shown in the printed line.

    Returns:
        None. The accuracy is only printed.
    """
    # Resolve the defaults at call time so the current training arrays are used;
    # `is None` is required because arrays have no single truth value.
    if x is None:
        x = XTRAIN
    if y is None:
        y = YTRAIN
    scores = model.evaluate(x, y)
    percent = (scores[1] * 100.0)
    formatted_percent = "{:.2f}".format(percent)
    print(label + " Accuracy: ", formatted_percent , "%")

In [None]:
# Print the training-set accuracy of each trained model, one cell per model.
get_accuracy(model_base)

In [None]:
get_accuracy(model_middle)

In [None]:
# Note: the deep model is reported before model_4_layers here, which is the
# reverse of the order in which the two were trained.
get_accuracy(model_mult_layers)

In [None]:
get_accuracy(model_4_layers)

In [None]:
get_accuracy(model_linear_last)

In [None]:
get_accuracy(model_linear_all)

In [None]:
get_accuracy(model_sigmoid_all)

In [None]:
get_accuracy(model_overfitted)

In [None]:
def _plot_metric(ax, history, metric, title, ylabel, legend_loc):
    """Plot one recorded metric, and its validation counterpart if present, on ``ax``."""
    series_name = ylabel.lower()
    ax.plot(history[metric], label='Training ' + series_name + ' data')
    # Validation curves exist only when the model was fitted with validation
    # data; skip them instead of failing with a KeyError.
    val_key = 'val_' + metric
    if val_key in history:
        ax.plot(history[val_key], label='Validation ' + series_name + ' data')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(loc=legend_loc)


def learning_curve(m_history):
    """Plot loss and accuracy per epoch side by side for one training run.

    Args:
        m_history: Object whose ``history`` attribute is a mapping with 'loss'
            and 'accuracy' entries and, optionally, 'val_loss' and
            'val_accuracy' entries (as returned by the ``fit`` calls in this
            notebook).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))
    _plot_metric(ax1, m_history.history, 'loss', 'Model Loss', 'Loss', 'upper right')
    _plot_metric(ax2, m_history.history, 'accuracy', 'Model Accuracy', 'Accuracy', 'lower right')

    fig.show()

In [None]:
# Loss/accuracy curves for each training run, one cell per run.
# history1: model_base
learning_curve(history1)

In [None]:
# history2: model_middle
learning_curve(history2)

In [None]:
# history3: model_4_layers
learning_curve(history3)

In [None]:
# history4: model_mult_layers
learning_curve(history4)

In [None]:
# history5: model_linear_last
learning_curve(history5)

In [None]:
# history6: model_linear_all
learning_curve(history6)

In [None]:
# history7: model_sigmoid_all
learning_curve(history7)

In [None]:
# history8: model_overfitted
learning_curve(history8)

In [None]:
# Render the base model as a layer graph, showing tensor shapes and layer names.
plot_model(model_base, show_shapes=True, show_layer_names=True)

In [None]:
# Untitled 4-7-2 network diagram.
# NOTE(review): these layer sizes match none of the models defined in this
# notebook; presumably a generic illustration - confirm.
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_axis_off()
draw_neural_net(ax, left=.1, right=.9, bottom=.1, top=.9, layer_sizes=[4, 7, 2])
