In [1]:
import os
import cv2
import pandas as pd
import math
import numpy as np
import warnings
warnings.filterwarnings("ignore")

train_file = "train_data.csv"
test_file = "test_data.csv"

TRAIN_MODEL = True
MODEL_NAME = "trained_model_mlp.hdf5"

def load_data(file, direc="", sep=",", header=True):
    csv_path = os.path.join(direc, file)
    if header:
        return pd.read_csv(csv_path, sep=sep, index_col=False)
    else:
        return pd.read_csv(csv_path, sep=sep, index_col=False, header=None)
    

In [2]:
train_data = load_data(train_file)

In [3]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,216,217,218,219,220,221,222,223,224,Sentiment
0,-0.293233,0.075336,-0.265466,-0.356092,0.06643,0.761425,-0.001033,-0.340051,-0.140389,-0.384717,...,-0.304356,0.110543,-0.257856,0.248262,0.304541,-0.454722,0.313093,0.096131,0.022209,1.0
1,-0.281317,-0.052475,-0.227652,-0.145945,0.129469,0.254899,0.192881,-0.210649,-0.311281,0.397568,...,0.147703,-0.144117,0.197492,0.157018,0.054766,-0.073626,0.093853,0.080051,-0.250832,0.0
2,0.065311,0.078494,0.018382,-0.024447,-0.185517,-0.06263,0.390812,0.222266,0.223175,-0.43704,...,0.391639,-0.040531,0.092544,-0.397565,0.517327,-0.164458,0.110916,0.456602,-0.073503,0.0
3,0.123459,0.029362,0.05882,-0.235124,0.156227,0.575867,0.26641,-0.644778,-0.04655,-0.047256,...,0.125449,-0.121503,-0.549419,0.024966,0.150538,-0.30221,0.330751,-0.321547,-0.021141,1.0
4,-0.417293,0.00713,0.499212,-0.30981,-0.079425,-0.137011,0.22843,-0.066997,0.284044,0.465361,...,0.143851,-0.092237,-0.244069,0.22202,-0.360323,-0.032631,-0.143966,-0.087031,-0.474025,1.0


In [4]:
test_data = load_data(test_file)

In [5]:
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,216,217,218,219,220,221,222,223,224,Sentiment
0,0.109306,0.192315,-0.032794,0.152092,0.108466,0.081162,-0.143557,0.087043,0.225055,-0.240789,...,-0.189616,0.036832,0.027846,-0.331823,-0.037055,0.222175,-0.41043,-0.210313,0.229882,1.0
1,-0.12446,0.135917,0.431114,0.262919,0.010424,0.204219,-0.222301,0.120114,-0.051119,-0.118828,...,-0.020186,0.1851,0.328118,0.036923,-0.085293,0.012201,-0.198617,0.033985,0.00332,0.0
2,0.211367,0.194651,-0.231363,-0.311015,0.092923,0.164169,0.003516,-0.143032,0.253581,-0.160928,...,-0.193897,0.304636,-0.146392,-0.102875,-0.131144,-0.182048,-0.040771,-0.037335,0.028673,0.0
3,0.082641,-0.154188,-0.111012,-0.165968,-0.01769,-0.15164,0.027495,-0.121806,0.32214,0.095643,...,-0.106269,-0.169688,0.20156,0.021454,0.247993,0.13258,0.07167,-0.093814,-0.1905,0.0
4,-0.022724,0.149019,0.319221,-0.033267,0.012244,-0.029595,0.236312,0.370445,-0.128876,0.155701,...,0.138224,0.084814,0.122257,0.099517,0.144641,0.234967,-0.030566,-0.088829,-0.206466,1.0


In [6]:
train_labels = np.int16(train_data["Sentiment"].copy().values)
train_features = train_data.drop("Sentiment", axis=1)

test_labels = np.int16(test_data["Sentiment"].copy().values)
test_features = test_data.drop("Sentiment", axis=1)

In [7]:
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()
scalar.fit(train_features)

train_features = scalar.transform(train_features)
test_features = scalar.transform(test_features)

In [8]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
def plot_roc_curve(clf_sets):
    for clf_set in clf_sets:
        y = clf_set[0]
        y_pred = clf_set[1]
        label = clf_set[2]
        fpr, tpr, thresholds = roc_curve(y, y_pred)
        plt.plot(fpr, tpr, linewidth=1, label=label)
    
    plt.plot([0,1],[0,1],'k--')
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="bottom right")
    plt.show()

In [9]:
X = train_features.copy()
Y = train_labels.copy()
X_test = test_features.copy()
Y_test = test_labels.copy()

In [10]:

# CNN Classifier
import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model

batch_size = 32
epochs = 25

size = np.int16(X.shape[1])

train_x = X.copy()
test_x = X_test.copy()

train_y = to_categorical(Y)
test_y = to_categorical(Y_test)

num_classes = train_y.shape[1]
droprate = 0.8

try:
    model = load_model(MODEL_NAME)
except:
    model = None
    
ACT = 'tanh'    
    
if model is None:
    model = Sequential()

    model.add(Dense(1024, activation=ACT, input_shape=(size,)))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))

    model.add(Dense(512, activation=ACT))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))
    
    model.add(Dense(512, activation=ACT))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))

    model.add(Dense(256, activation=ACT))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))
    
    model.add(Dense(128, activation=ACT))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))
    
    model.add(Dense(64, activation=ACT))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))
    
    model.add(Dense(32, activation=ACT))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))
    
    model.add(Dense(16, activation=ACT))
    model.add(BatchNormalization())
    model.add(Dropout(droprate))

    model.add(Dense(num_classes, activation='softmax'))

    adam = Adam()
    
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
else:
    print(MODEL_NAME, " is restored.")

model.summary()

callbacks = [ModelCheckpoint(MODEL_NAME, monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=False, mode='max', period=1)]

if TRAIN_MODEL:
    history = model.fit(train_x, train_y,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=0,
                        validation_data=(test_x, test_y),
                        callbacks=callbacks)
else:
    print("Opted not to train the model as TRAIN_MODEL is set to False. May be because model is already trained and is now being used for validation")
    


Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              16384     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
batch_normalization_2 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
__________

ValueError: Error when checking input: expected dense_1_input to have shape (15,) but got array with shape (225,)

In [None]:
saved_model = load_model(MODEL_NAME)
score = saved_model.evaluate(test_x, test_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
test_x.shape