Zachary OBrien - CSCE 5310 - Gender Classifier

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV

%matplotlib inline
import pickle

Setup data

In [2]:
data_folder = 'E://eyetracker/metadata/'
datafile = 'replaced_with_type0_v2with_sex.csv'
data = pd.read_csv(data_folder + datafile, dtype={'Sample': object})
print(data)

     Sample  Mean_vel  Min_vel  Max_vel  Screen_time_off_percent  Sex
0     02765    0.0084      0.0   0.0714                  55.3277    0
1     02772    0.0288      0.0   0.1447                  49.0018    0
2     02773    0.0091      0.0   0.0591                  55.6081    0
3     02774    0.0072      0.0   0.1003                  47.8017    0
4     02776    0.0063      0.0   0.0515                  66.3137    0
...     ...       ...      ...      ...                      ...  ...
1469  03495    0.0217      0.0   0.0840                  61.0847    1
1470  03498    0.0166      0.0   0.0920                  94.1992    0
1471  03501    0.0089      0.0   0.0614                  58.3656    1
1472  03502    0.0665      0.0   0.3422                  82.7419    0
1473  03523    0.0039      0.0   0.0339                  64.1591    0

[1474 rows x 6 columns]


In [3]:
n_samples = len(data)
train_data = data.iloc[:,1:-1]
train_data = train_data.to_numpy()

train_results = data.iloc[:,-1:].to_numpy().reshape(n_samples,)
print(train_data)
print(train_results)

[[8.40000e-03 0.00000e+00 7.14000e-02 5.53277e+01]
 [2.88000e-02 0.00000e+00 1.44700e-01 4.90018e+01]
 [9.10000e-03 0.00000e+00 5.91000e-02 5.56081e+01]
 ...
 [8.90000e-03 0.00000e+00 6.14000e-02 5.83656e+01]
 [6.65000e-02 0.00000e+00 3.42200e-01 8.27419e+01]
 [3.90000e-03 0.00000e+00 3.39000e-02 6.41591e+01]]
[0 0 0 ... 1 0 0]


In [4]:
# X_train, X_test, y_train, y_test = train_test_split(
#     data, digits.target, stratify=True, test_size=0.20)
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_results, test_size=0.20)

outcomes = {}
train_scores = {}

In [5]:
print(X_train)
print(y_train)

[[4.20000e-03 0.00000e+00 4.39000e-02 4.90060e+01]
 [1.54000e-02 0.00000e+00 1.09400e-01 7.96832e+01]
 [8.00000e-04 0.00000e+00 1.23700e-01 4.50000e-01]
 ...
 [4.60000e-02 0.00000e+00 5.57500e-01 1.42415e+01]
 [3.20000e-03 0.00000e+00 5.33000e-02 6.75130e+01]
 [7.00000e-03 0.00000e+00 4.55000e-02 7.23724e+01]]
[0 1 1 ... 0 0 0]


Default SVC - Scores at bottom

In [6]:
svc_clf = svm.SVC()
svc_clf.fit(X_train, y_train)
outcomes["default_svc"] = svc_clf.predict(X_test)

Default KNN - Scores at bottom

In [7]:
k_clf = KNeighborsClassifier()
k_clf.fit(X_train, y_train)
outcomes["default_k"] = k_clf.predict(X_test)

Default Decision Tree - Scores at bottom

In [8]:
dtree_clf = DecisionTreeClassifier()
dtree_clf.fit(X_train , y_train)
outcomes["default_dtree"] = dtree_clf.predict(X_test)

Default Logistic Regression - Scores at bottom

In [9]:
log_clf = LogisticRegression(max_iter=10000, solver='liblinear')
log_clf.fit(X_train, y_train)
outcomes["default_log"] = log_clf.predict(X_test)

Default for GridSearchCV is 5 fold cross-validation

SVC w/ Cross Validation

In [10]:
c_range = [10**x for x in range(-7, 7)]
gamma_range = [10**x for x in range(-7, 7)]
svc_parameters = {'C': c_range, 'gamma': gamma_range, 'kernel': ['rbf']}
svc_cross = GridSearchCV(svc_clf, svc_parameters)
svc_cross.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ",svc_cross.best_params_)
print("accuracy :",svc_cross.best_score_)
train_scores['svc_search'] = svc_cross.best_score_
outcomes['svc_tuned'] = svc_cross.predict(X_test)

tuned hpyerparameters :(best parameters)  {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
accuracy : 0.5385647313379012


In [23]:
with open("svc_cross.pkl", "wb") as f:
    pickle.dump(svc_cross, f)

KNN w/ Cross Validation

In [11]:
k_parameters = {'n_neighbors': [1,3,5,7,9,11,13]}
k_cross = GridSearchCV(k_clf, k_parameters)
k_cross.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ",k_cross.best_params_)
print("accuracy :",k_cross.best_score_)
train_scores['k_search'] = k_cross.best_score_
outcomes['k_tuned'] = k_cross.predict(X_test)

tuned hpyerparameters :(best parameters)  {'n_neighbors': 9}
accuracy : 0.5479192210602235


In [22]:
with open("k_cross.pkl", "wb") as f:
    pickle.dump(k_cross, f)

Decision Tree w/ Cross Validation

In [12]:
dtree_parameters = {'min_samples_split': [1,3,5,7,9,11,13]}
dtree_cross = GridSearchCV(dtree_clf, dtree_parameters)
dtree_cross.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ",dtree_cross.best_params_)
print("accuracy :",dtree_cross.best_score_)
train_scores['dtree_search'] = dtree_cross.best_score_
outcomes['dtree_tuned'] = dtree_cross.predict(X_test)

tuned hpyerparameters :(best parameters)  {'min_samples_split': 11}
accuracy : 0.5267435989902631


5 fits failed out of a total of 35.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\zacha\PycharmProjects\CSCE5310_gaze_capture\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\zacha\PycharmProjects\CSCE5310_gaze_capture\venv\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\zacha\PycharmProjects\CSCE5310_gaze_capture\venv\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; 

In [21]:
with open("d_tree.pkl", "wb") as f:
    pickle.dump(dtree_cross, f)

Linear Regression w/ Cross Validation

In [13]:
c_range = [10**x for x in range(-7, 7)]
log_parameters = {"C":c_range, "penalty":["l1"]}
log_cross = GridSearchCV(log_clf, log_parameters)
log_cross.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ",log_cross.best_params_)
print("accuracy :",log_cross.best_score_)
train_scores['log_search'] = log_cross.best_score_
outcomes['log_tuned'] = log_cross.predict(X_test)

tuned hpyerparameters :(best parameters)  {'C': 10, 'penalty': 'l1'}
accuracy : 0.524186801298233


In [20]:
with open("logistic_regression.pkl", "wb") as f:
    pickle.dump(log_cross, f)

Function to more easily plot confusion matricies

In [14]:
# from https://www.kaggle.com/grfiv4/plot-a-confusion-matrix
def plot_confusion_matrix(y_true, y_pred, cm, classes=None,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Only use the labels that appear in the data
    if classes:
        classes = classes[unique_labels(y_true, y_pred)]
    else:
        classes = unique_labels(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    #     print("Normalized confusion matrix")
    # else:
    #     print('Confusion matrix, without normalization')
    print(title, "Confusion Matrix")
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


Fetch the scores

In [15]:
for key in train_scores.keys():
    print("Score for", key, "is", round(train_scores[key]*100, 3), "%")
for key in outcomes.keys():
    cm = confusion_matrix(y_test, outcomes[key])
    #plot_confusion_matrix(y_test, outcomes[key], cm=cm, title="{}".format(key))
    correct = 0
    incorrect = 0
    for i in range(len(cm)):
        for j in range(len(cm[i])):
            if i == j:
                correct += cm[i][j]
            else:
                incorrect += cm[i][j]
    print("score for {}".format(key), round((correct / (correct + incorrect)) * 100, 3), "% accurate")

Score for svc_search is 53.856 %
Score for k_search is 54.792 %
Score for dtree_search is 52.674 %
Score for log_search is 52.419 %
score for default_svc 50.169 % accurate
score for default_k 52.542 % accurate
score for default_dtree 53.22 % accurate
score for default_log 54.576 % accurate
score for svc_tuned 50.847 % accurate
score for k_tuned 49.492 % accurate
score for dtree_tuned 50.847 % accurate
score for log_tuned 53.559 % accurate


In [16]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(10, activation='relu', input_shape=(4, )),
  #tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10)
])
#X_train, X_test, y_train, y_test
predictions = model(X_train[:1]).numpy()
tf.nn.softmax(predictions).numpy()
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_fn(y_train[:1], predictions).numpy()

model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=500)

TensorFlow version: 2.8.0
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76

<keras.callbacks.History at 0x230cced1af0>

In [17]:
model.evaluate(X_test,  y_test, verbose=2)

10/10 - 0s - loss: 0.6921 - accuracy: 0.5186 - 329ms/epoch - 33ms/step


[0.6920514702796936, 0.5186440944671631]

In [18]:
model.save("./models/dense_nn")

INFO:tensorflow:Assets written to: ./models/dense_nn\assets
