In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import tensorflow as tf

In [2]:
# Load the pre-tokenized two-class dataset and discard the 'Unnamed: 0' column
# that the CSV carries alongside the feature columns and 'target'.
data = (
    pd.read_csv('../00_DATA_preprocessed/data_tokenized_2classes.csv')
    .drop(columns=['Unnamed: 0'])
)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,72497,72498,72499,72500,72501,72502,72503,72504,72505,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
# 80/20 train/test split with a fixed seed; every column except 'target' is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(7915, 72506) (1979, 72506) (7915,) (1979,)


In [4]:
# Hold out the first 1000 rows (by position) of the training split for validation.
# NOTE: this cell rebinds X_train / y_train to slices of themselves, so running it
# a second time removes another 1000 rows -- re-run the train_test_split cell first.
X_val, X_train = X_train.iloc[:1000], X_train.iloc[1000:]
y_val, y_train = y_train.iloc[:1000], y_train.iloc[1000:]

In [5]:
# Sanity check: display (rows, columns) of the validation features.
X_val.shape

(1000, 72506)

In [6]:
# Number of feature columns; used below as the Input shape of every model.
input_dim = X_train.shape[1]

# Smoke test: single sigmoid-unit baseline

In [21]:
# Smoke test: a single sigmoid unit directly on the input features, trained for
# three epochs, to confirm that training, validation and evaluation all run.

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=3,
)
test_loss, test_acc = model.evaluate(X_test, y_test)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Loss: 0.6800416111946106
Test Accuracy: 0.6568974256515503


In [10]:
import os

# Create the output folders next to the notebook. The training functions save to
# the *relative* paths "models/..." and "graphs/...", so the folders must be
# relative too (the previous "/models" pointed at the filesystem root).
# exist_ok=True keeps this cell safe to re-run.
for out_dir in ("models", "graphs"):
    os.makedirs(out_dir, exist_ok=True)

# Sequential models with randomly sampled dense layers

In [29]:
def sequential_model(return_model = False, epochs = 10):
    """Train, evaluate and save one randomly configured dense classifier.

    The architecture is sampled with the global NumPy RNG: 1-4 hidden Dense
    layers, each with 128, 64 or 32 ReLU units, followed by one sigmoid output
    unit. The model is fitted on the notebook-level ``X_train``/``y_train``
    with ``X_val``/``y_val`` as validation data and evaluated on
    ``X_test``/``y_test``. Test loss and accuracy are printed, the model is
    saved under ``models/`` and the accuracy curves under ``graphs/``.

    Args:
        return_model: If true, return ``(model, history)``; otherwise the
            function returns ``None``.
        epochs: Number of training epochs (default 10, the original setting).

    Returns:
        ``(model, history)`` when ``return_model`` is true, else ``None``.

    Note:
        The file name encodes only the layer count and unit sizes, so drawing
        the same architecture twice overwrites the earlier model and graph.
    """
    import os  # local import: do not rely on another cell having imported os

    num_dense_layers = np.random.randint(1,5)
    # int() keeps NumPy scalar reprs out of the printed list.
    units = [int(np.random.choice([128, 64, 32], replace = True)) for _ in range(num_dense_layers)]
    print('Num Dense Layers:', num_dense_layers)
    print('units:', units)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    for n_units in units:
        model.add(tf.keras.layers.Dense(n_units, activation = "relu"))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=["accuracy"])
    history = model.fit(X_train, y_train, epochs=epochs, validation_data = (X_val, y_val))
    test_loss, test_acc = model.evaluate(X_test, y_test)

    print('Test Loss:', test_loss)
    print('Test Accuracy:', test_acc)

    # One shared stem so the model and its graph always carry the same name.
    run_name = f"sequentialmodel_denselayers_{num_dense_layers}_units_{'_'.join([str(x) for x in units])}"

    # Make sure both output folders exist before writing into them.
    os.makedirs("models", exist_ok=True)
    os.makedirs("graphs", exist_ok=True)

    # NOTE(review): an extension-less path is presumably saved as a TF SavedModel
    # directory; newer Keras releases may require a ".keras"/".h5" suffix -- confirm
    # against the installed version before changing the path.
    model.save(f"models/{run_name}")

    train_acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    # Plot against 1-based epoch numbers so the tick labels line up with the
    # points (plotting the bare lists would place them at x = 0..n-1).
    epoch_axis = list(range(1, len(train_acc) + 1))

    fig, ax = plt.subplots()
    ax.plot(epoch_axis, train_acc, label='train')
    ax.plot(epoch_axis, val_acc, label='validation')
    ax.set_xticks(epoch_axis)
    ax.set_xlabel('epoch')
    ax.set_ylabel('accuracy')
    ax.legend()
    fig.savefig(f"graphs/{run_name}.jpg")
    # Close instead of clearing so repeated calls do not accumulate open figures.
    plt.close(fig)

    if return_model:
        return model, history

In [30]:
# Single trial run of the random-architecture search before looping over it.
sequential_model()

In [32]:
# Train ten randomly sampled architectures; each call samples a new layer
# configuration and saves its own model and accuracy graph.
for i in range(10):
    sequential_model()

# Sequential models with dropout after each dense layer

In [34]:
def sequential_model_with_dropout(return_model = False, epochs = 10):
    """Train, evaluate and save one randomly configured dense classifier with dropout.

    The architecture is sampled with the global NumPy RNG: 1-4 hidden Dense
    layers, each with 128, 64 or 32 ReLU units and each followed by a Dropout
    layer whose rate is drawn from 0.3, 0.2 or 0.1; a single sigmoid unit is
    the output. The model is fitted on the notebook-level ``X_train``/``y_train``
    with ``X_val``/``y_val`` as validation data and evaluated on
    ``X_test``/``y_test``. Test loss and accuracy are printed, the model is
    saved under ``models/`` and the accuracy curves under ``graphs/``.

    Args:
        return_model: If true, return ``(model, history)``; otherwise the
            function returns ``None``.
        epochs: Number of training epochs (default 10, the original setting).

    Returns:
        ``(model, history)`` when ``return_model`` is true, else ``None``.

    Note:
        The file name encodes only the layer count and unit sizes (not the
        dropout rates), so runs that draw the same units overwrite each
        other's model and graph.
    """
    import os  # local import: do not rely on another cell having imported os

    num_dense_layers = np.random.randint(1,5)
    # int()/float() keep NumPy scalar reprs out of the printed lists.
    units = [int(np.random.choice([128, 64, 32], replace = True)) for _ in range(num_dense_layers)]
    # Rates are drawn after all unit sizes, one per hidden layer, matching the
    # order in which the original loop consumed the RNG.
    dropout_rates = [float(np.random.choice([0.3, 0.2, 0.1])) for _ in range(num_dense_layers)]
    print('Num Dense Layers:', num_dense_layers)
    print('units:', units)
    # The rates are not part of the file name, so report them here.
    print('dropout rates:', dropout_rates)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    for n_units, rate in zip(units, dropout_rates):
        model.add(tf.keras.layers.Dense(n_units, activation = "relu"))
        model.add(tf.keras.layers.Dropout(rate))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=["accuracy"])
    history = model.fit(X_train, y_train, epochs=epochs, validation_data = (X_val, y_val))
    test_loss, test_acc = model.evaluate(X_test, y_test)

    print('Test Loss:', test_loss)
    print('Test Accuracy:', test_acc)

    # One shared stem so the model and its graph always carry the same name.
    run_name = f"sequentialmodeldropout_denselayers_{num_dense_layers}_units_{'_'.join([str(x) for x in units])}"

    # Make sure both output folders exist before writing into them.
    os.makedirs("models", exist_ok=True)
    os.makedirs("graphs", exist_ok=True)

    # NOTE(review): an extension-less path is presumably saved as a TF SavedModel
    # directory; newer Keras releases may require a ".keras"/".h5" suffix -- confirm
    # against the installed version before changing the path.
    model.save(f"models/{run_name}")

    train_acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    # Plot against 1-based epoch numbers so the tick labels line up with the
    # points (plotting the bare lists would place them at x = 0..n-1).
    epoch_axis = list(range(1, len(train_acc) + 1))

    fig, ax = plt.subplots()
    ax.plot(epoch_axis, train_acc, label='train')
    ax.plot(epoch_axis, val_acc, label='validation')
    ax.set_xticks(epoch_axis)
    ax.set_xlabel('epoch')
    ax.set_ylabel('accuracy')
    ax.legend()
    fig.savefig(f"graphs/{run_name}.jpg")
    # Close instead of clearing so repeated calls do not accumulate open figures.
    plt.close(fig)

    if return_model:
        return model, history

In [35]:
# Train fifteen randomly sampled dropout architectures; each call samples a new
# configuration and saves its own model and accuracy graph.
for i in range(15):
    sequential_model_with_dropout()