# Obfuscated Text Classification

## Summary of Approach
As this is an NLP problem, the classification can be done by
1. Regular NeuralNetwork
2. Recurrent Neural Network
3. Convolutional Neural Network

As the training set is obfuscated, the limitation of using RNN is I cannot use word embeddings and thus I can only build a character level model.

First, a simple neural network is trained but the model performed poorly with an accuracy of less than 15%

Then a convolution model is built based on paper by See Zhang and LeCun, 2015 for character level convolutional neural networks.

I slightly fine tuned the model by modifying the convolutional layers outputs to 512 instead of 256 and 1024 as described in the paper.

The model was performing well for training set but not so well for validation set as the data has huge imbalance

## Handling data imbalance

To balance the dataset, oversampling of the classes which has less number of samples is donbe by duplicating the examples.

Rather than removing the samples for classes which have excess, Oversampling is done. This is because the training set is limited and I don't want to loose more information.

*Another approach I tried to employ is building LSTM for each class and then using generative models to oversample the data instead of duplicating. *
*As LSTM was character based, it was taking longer time, I didn't have resources to train such a network and hence didn't use that technique. (For training a single class, for one epoch LSTM ETA was almost 10 hours.)*


The submission also contains the weight file 'weight_cnn_oversample_4.h5' which can be used for the model 

char_cnn2(n_vocab, max_len, n_classes, weights_path=None)

The predictions for test data are in 'ytest.txt'

In [2]:
import pandas as pd
from keras.models import Model
from keras.layers import Dense, Input, Dropout, MaxPooling1D, Conv1D, MaxPooling1D
from keras.layers import LSTM, Lambda, Bidirectional, BatchNormalization
from keras.layers import TimeDistributed
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Conv2D,ZeroPadding2D,Input,Activation
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers import Input, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D
from keras.layers import Embedding,ThresholdedReLU
from keras import optimizers
from keras.utils.np_utils import to_categorical
import random
import keras.backend as K
import numpy as np
import re
import keras.callbacks
import sys
import os

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [234]:
from keras.layers import TimeDistributed, Bidirectional

In [3]:
from collections import Counter
import math

In [4]:
# For Reproducing consistent results
random.seed(1)

In [5]:
# Reading train data from txt
train_df = pd.read_table('xtrain_obfuscated.txt',header=None,delim_whitespace=True)

In [6]:
# Reading train labels from txt
train_labels = pd.read_table('ytrain.txt',header=None,delim_whitespace=True)

In [7]:
train_df[1] = train_labels
train_df.columns = ['text','label']

In [39]:
def prepare_data(xtrain, ytrain, xtest, max_len=None):
    """
    Preprocess and featurize the data
    """

    xtrain = [line.lower() for line in xtrain]
    xtest = [line.lower() for line in xtest]
    ytrain = [int(line) for line in ytrain]
    
    def chars(dataset):
        return reduce(
            lambda x, y: x.union(y),
            (set(line) for line in dataset))
    
    def onehot(dataset, max_len, vocab_size):
        hot = np.zeros((len(dataset), max_len, vocab_size), dtype=np.bool)
        i = 0
        for line in dataset:
            j = 0
            for char in line:
                if char != 0:
                    hot[i, j, char] = 1.

                j += 1
            i += 1

        return hot
  
    # get all chars used in train as well as test
    letters = chars(xtrain).union(chars(xtest))

    # determine the maximum text length. in this regime, we are not truncating
    # texts at all. in the paper texts are truncated.
    max_text_length = np.max([np.max(list(map(len, ls))) for ls in [xtrain, xtest]])
    max_len = max_len or max_text_length

    # distinct letters and classes in the dataaset
    vocab = sorted(list(letters))
    classes = sorted(list(set(ytrain)))
    # lookup tables for letters and classes. prepends padding char
    idx_letters = dict(((c, i) for c, i in zip(vocab, range(len(vocab)))))
    idx_classes = dict(((c, i) for c, i in zip(classes, range(len(classes)))))

    # dense integral indices
    xtrain = [[idx_letters[char] for char in list(line)] for line in xtrain]
    xtest = [[idx_letters[char] for char in list(line)] for line in xtest]
    ytrain = [idx_classes[line] for line in ytrain]

    # pad to fixed lengths
    xtrain = sequence.pad_sequences(xtrain, max_len)
    xtest = sequence.pad_sequences(xtest, max_len)

    xtrain = onehot(xtrain, max_len, len(idx_letters))
    ytrain = to_categorical(ytrain, nb_classes=len(classes))
    xtest = onehot(xtest, max_len, len(idx_letters))

    return (
        xtrain,
        ytrain,
        xtest,
        vocab,
        max_len,
        len(classes))

In [154]:
def lines(filename):
        with open(filename) as f:
            return f.read().splitlines()

In [217]:
def fit(model, xtrain, ytrain, batch=128, epochs=5, split=0.1,class_weights=None,validation_data=None):
    "fit the model"

    return model.fit(xtrain,
                     ytrain,
                     batch_size=batch,
                     nb_epoch=epochs,
                     validation_split=split,
                    class_weight = class_weights,
                    validation_data = validation_data)

In [11]:
def compiled(model):
    "compile with chosen config"

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [12]:
def predict(model, X):
    "predict probability, class for each instance"

    # predict probability of each class for each instance
    all_preds = model.predict(X)

    # for each instance get the index of the class with max probability
    idxs = np.argmax(all_preds, axis=1)

    # get the values of the highest probability for each instance
    preds = [all_preds[i, idxs[i]] for i in range(len(idxs))]

    return np.array(preds), idxs


In [13]:
# Method to set class weights for imbalanced data
def set_class_weights(labels):
    class_counts = Counter(labels)
    class_weights = {}
    label_len = len(labels)
    max_index = max(class_counts, key=class_counts.get)
    for item,i in enumerate(class_counts):
        class_weights[i] = round(class_counts[max_index]/class_counts[i])
    max_index = max(class_weights, key=class_weights.get)
    #Setting weights proportional to inversely proportional to max_value
    #for item,i in enumerate(class_weights):
        #class_weights[i] = class_weights[max_index]/(class_weights[i]+0.1) #Adding 0.1 to tackle if value 0
    return class_weights

In [105]:
# Resampling the input data based on class imbalance
def build_resample0(labels,label_counts,df):
    max_count = np.max(label_counts)
    new_df = df.copy()
    for label in labels:
        label_df = df.loc[df['label']==label].copy(deep=True)
        diff = max_count-label_counts.iloc[label]
        to_add = float(diff)/float(label_counts.loc[label])
        fraction_samples = to_add if to_add<1 else to_add-round(to_add)
        for i in range(0,int(round(to_add))):
            new_df = new_df.append(label_df)
        if fraction_samples >0:
            samples = int(round(label_df.shape[0]*fraction_samples))
            frac_df = label_df.iloc[:samples,:]
            new_df = new_df.append(frac_df)
    return new_df

# Simple Neural Network
Trying a simple neural network and checking if how the model preforms

In [15]:
simple_model = Sequential()
simple_model.add(Dense(300, input_dim=452, init="uniform",activation="relu"))
simple_model.add(Dense(200, init="uniform", activation="relu"))
simple_model.add(Dense(12))
simple_model.add(Activation("softmax"))

In [16]:
sgd = optimizers.SGD(lr=0.8, decay=1e-6, momentum=0.9, nesterov=True)

In [17]:
simple_model.compile(sgd, 'categorical_crossentropy', metrics=['accuracy'])

In [18]:
def get_unique_chars(df):
    charset={''}
    for item in df:
        charset.add(''.join(set(item)))
    charset = set(''.join(charset))
    return charset

In [19]:
def text_to_numeric(text,char_set):
    numeric = []
    i=0
    for char in text:
        numeric.append(str(char_set.index(char)+1))
    return numeric

In [20]:
train_features = train_df['text']
train_labels = train_df['label']

In [21]:
charset = list(get_unique_chars(train_features))

In [22]:
train_numeric = []
for item in train_features:
    train_numeric.append(text_to_numeric(item,charset))

In [23]:
padded_train_features = sequence.pad_sequences(train_numeric, maxlen=452, dtype='int32',
    padding='post', truncating='post', value=0.)

In [24]:
padded_train_features.shape

(32513, 452)

In [25]:
simple_model.fit(padded_train_features, keras.utils.np_utils.to_categorical(train_labels), nb_epoch=4, batch_size=50,validation_split=0.2)

Train on 26010 samples, validate on 6503 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f47c4c7ded0>

# Simple Model is Poor
Simple model performs poorly. Eventhogh there is less overfitting, the accuracy levels are very less and not improving in every epoch.
Try to define a more complex model

In [26]:
simple_model = Sequential()
simple_model.add(Dense(10000, input_dim=452, init="uniform",activation="relu"))
simple_model.add(Dense(8000,activation="relu"))
simple_model.add(Dense(6000, activation="relu"))
simple_model.add(Dense(4000,activation="relu"))
simple_model.add(Dense(2000,activation="relu"))
simple_model.add(Dense(1000,activation="relu"))
simple_model.add(Dense(500,activation="relu"))
simple_model.add(Dense(12))
simple_model.add(Activation("softmax"))

In [27]:
simple_model.compile(sgd, 'categorical_crossentropy', metrics=['accuracy'])

In [28]:
simple_model.fit(padded_train_features, keras.utils.np_utils.to_categorical(train_labels), nb_epoch=2, batch_size=50,validation_split=0.2)

Train on 26010 samples, validate on 6503 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f47c3a78d10>

# Still Poor
Incresing model complexity doesn't increse the performance

Trying to onehot encode train data and fit the model

In [40]:
xsimpletrain,ysimpletrain,testsimpletrain,vocab, max_len, n_classes = prepare_data(
        lines('xtrain_obfuscated.txt'),
        lines('ytrain.txt'),
        lines('xtest_obfuscated.txt'))
train_x_flatten = xsimpletrain.reshape(xsimpletrain.shape[0], -1) 
train_x_flatten.T.shape

(11752, 32513)

In [31]:
simple_model = Sequential()
simple_model.add(Dense(10000, input_dim=train_x_flatten.shape[1], activation="relu"))
simple_model.add(Dense(8000,activation="relu"))
simple_model.add(Dense(6000, activation="relu"))
simple_model.add(Dense(4000,activation="relu"))
simple_model.add(Dense(2000,activation="relu"))
simple_model.add(Dense(1000,activation="relu"))
simple_model.add(Dense(500,activation="relu"))
simple_model.add(Dense(12))
simple_model.add(Activation("softmax"))

In [32]:
simple_model.compile(sgd, 'categorical_crossentropy', metrics=['accuracy'])

In [33]:
simple_model.fit(train_x_flatten, keras.utils.np_utils.to_categorical(train_labels), nb_epoch=2, batch_size=50,validation_split=0.2)

Train on 26010 samples, validate on 6503 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f47c2985b90>

Model still doesn't improve for each epoch. Trying different architecture

# Changing the model architecture

As the train data is obfuscated, character level models can be fitted

In [34]:
# Model is inspired from paper "See Zhang and LeCun, 2015" for character level CNN
def char_cnn1(n_vocab, max_len, n_classes, weights_path=None):
    model = Sequential()
    model.add(Conv1D(256, 7, activation='relu', input_shape=(max_len, n_vocab)))
    model.add(MaxPooling1D(3))

    model.add(Conv1D(256, 7, activation='relu'))
    model.add(MaxPooling1D(3))

    model.add(Conv1D(256, 3, activation='relu'))
    model.add(Conv1D(256, 3, activation='relu'))
    model.add(Conv1D(256, 3, activation='relu'))
    model.add(Conv1D(256, 3, activation='relu'))
    model.add(MaxPooling1D(3))

    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model

In [41]:
xcnn1,ycnn1,testcnn1,vocab, max_len, n_classes = prepare_data(
        lines('xtrain_obfuscated.txt'),
        lines('ytrain.txt'),
        lines('xtest_obfuscated.txt'))

In [159]:
model1_cnn1 = compiled(char_cnn1(len(vocab), max_len, n_classes))

In [38]:
char_cnn1_fit1 = fit(model1_cnn1, xcnn1, ycnn1, epochs=20)

Train on 29261 samples, validate on 3252 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [43]:
model1_cnn1.save_weights('weights_cnn1_fit1.h5')

# CNN1 Intuition

As the model is ran for 20 epochs, the good thing is the train accuracy of the model is increasing after each epoch.

However, theere is a gap between train and validation accuracy. The nmodel seem to be overfitting.

Inorder to reduce overfitting, first chack if the dataset has imbalance

In [44]:
imbalance = Counter(train_labels)

In [45]:
imbalance

Counter({0: 543,
         1: 3459,
         2: 1471,
         3: 4023,
         4: 2337,
         5: 2283,
         6: 4226,
         7: 5097,
         8: 3634,
         9: 980,
         10: 3052,
         11: 1408})

It is clear that, the dataset is highly skewed. 

Inorder to tackle this, set class weights proportional to the level of imbalance.
For example: If label 7 is 15% of dataset, and label 1 is 2%, then setting weight of 1 to label 7 and 13 to label 1

In [46]:
class_weights = set_class_weights(train_labels)

In [47]:
class_weights

{0: 9.0,
 1: 1.0,
 2: 3.0,
 3: 1.0,
 4: 2.0,
 5: 2.0,
 6: 1.0,
 7: 1.0,
 8: 1.0,
 9: 5.0,
 10: 1.0,
 11: 3.0}

# Fitting the model with class weights

In [48]:
char_cnn1_fit1 = fit(model1_cnn1, xcnn1, ycnn1, epochs=5, class_weights=class_weights)

Train on 29261 samples, validate on 3252 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [49]:
char_cnn1_fit1 = fit(model1_cnn1, xcnn1, ycnn1, epochs=5, class_weights=class_weights)

Train on 29261 samples, validate on 3252 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Intuition2

Balancing the class weights doesn't effect the model performance. There is no improvement from the previous model.

So, now the train data need to be balanced by under sampling or oversampling.

Undersampling can be reducing the number of examples of certain labels and making the training set balanced. The problem with this approach is, we will end up loosing lot of information for training and might lead to increrase in bias.

Oversampling is reverse process to undersampling. Whereas here, we duplicate examples for labels which are less in number so that the training set is balanced. 

One important point while oversampling is, seperate the validation set before doing oversmpling so that the validation set doesn't have duplicates and hence will give the exact performance of the model



In [157]:
#Shuffling the data once again to make sure data is not arranged in any order
train_df = train_df.iloc[np.random.permutation(len(train_df))]

In [57]:
# splitting train and val samples
val_sample_cnn1 = train_df.iloc[:int(round(train_df.shape[0]*0.1)),:]

In [58]:
trn_sample_cnn1 = train_df.iloc[int(round(train_df.shape[0]*0.1)):,:]

In [59]:
trn_sample_cnn1.columns= ['text','label']

In [61]:
label_counts = trn_sample_cnn1.label.value_counts()

In [63]:
sample_unique_labels = trn_sample_cnn1.label.unique()

In [114]:
sample_unique_labels
label_counts.loc[7]

4619

In [183]:
# Balancing the sample data by duplicating data for labels which are less in number
resampled_trn_cnn1 = build_resample(sample_unique_labels,label_counts,trn_sample_cnn1)

In [174]:
def build_resample(labels,label_counts,df):
    max_count = np.max(label_counts)
    new_df = df.copy(deep=True)
    samples=0
    for label in labels:
        label_df = df.loc[df['label']==label]
        diff = max_count-label_counts.loc[label]
        to_add = float(diff)/float(label_counts.loc[label])
        fraction_samples = to_add if to_add<1 else to_add-round(to_add)
        for i in range(0,int(round(to_add))):
            adding = int(round(to_add))
            new_df = new_df.append(label_df)
        if fraction_samples >0:
            samples = int(round(label_df.shape[0]*fraction_samples))
            frac_df = label_df.iloc[:samples,:].copy(deep=True)
            new_df = new_df.append(frac_df)
    return new_df

In [184]:
resampled_trn_cnn1 = resampled_trn_cnn1.iloc[np.random.permutation(len(resampled_trn_cnn1))]

In [185]:
xcnn1_sampled, ycnn1_sampled, testcnn1_sampled, vocab, max_len, n_classes = prepare_data(
        resampled_trn_cnn1['text'],
        resampled_trn_cnn1['label'],
        lines('xtest_obfuscated.txt'))

In [200]:
val_features, val_label, testcnn1_sampled1, vocab, max_len, n_classes = prepare_data(
        val_sample_cnn1['text'],
        val_sample_cnn1['label'],
        lines('xtest_obfuscated.txt'),
        max_len=452)

In [282]:
# Further changing few things in model. (Changing outputs to 512 instead of 256)
def char_cnn2(n_vocab, max_len, n_classes, weights_path=None):
    model = Sequential()
    model.add(Conv1D(512, 7, activation='relu', input_shape=(max_len, n_vocab)))
    model.add(MaxPooling1D(3))

    model.add(Conv1D(512, 7, activation='relu'))
    model.add(MaxPooling1D(3))

    model.add(Conv1D(512, 3, activation='relu'))
    model.add(Conv1D(512, 3, activation='relu'))
    model.add(Conv1D(512, 3, activation='relu'))
    model.add(Conv1D(512, 3, activation='relu'))
    model.add(MaxPooling1D(3))

    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model

In [215]:
model2_cnn1 = compiled(char_cnn2(len(vocab), max_len, n_classes))

In [216]:
char_cnn1_fit3 = fit(model2_cnn1, xcnn1_sampled, ycnn1_sampled, epochs=5)

Train on 53381 samples, validate on 5932 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [219]:
char_cnn1_fit4 = fit(model2_cnn1, xcnn1_sampled, ycnn1_sampled, epochs=5)

Train on 53381 samples, validate on 5932 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [220]:
char_cnn1_fit5 = fit(model2_cnn1, xcnn1_sampled, ycnn1_sampled, epochs=5)

Train on 53381 samples, validate on 5932 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [221]:
model2_cnn1.save_weights('weight_cnn_oversample_1.h5')

# Intuition
Now the model seem to be less overfitting.

As the validation data is set aside, now training on the complete dataset and feeding the validation set in validation_data

In [223]:
char_cnn1_fit5 = fit(model2_cnn1, xcnn1_sampled, ycnn1_sampled, epochs=5,split=0,validation_data=(val_features,val_label))

Train on 59313 samples, validate on 3251 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [224]:
model2_cnn1.save_weights('weight_cnn_oversample_2.h5')

In [225]:
char_cnn1_fit6 = fit(model2_cnn1, xcnn1_sampled, ycnn1_sampled, epochs=5,split=0,validation_data=(val_features,val_label))

Train on 59313 samples, validate on 3251 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [226]:
model2_cnn1.save_weights('weight_cnn_oversample_3.h5')

In [245]:
char_cnn1_fit6 = fit(model2_cnn1, xcnn1_sampled, ycnn1_sampled, epochs=5,split=0,validation_data=(val_features,val_label))

Train on 59313 samples, validate on 3251 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [261]:
model2_cnn1.save_weights('weight_cnn_oversample_4.h5')

In [262]:
preds,idx = predict(model2_cnn1,testcnn1_sampled)

INFO (theano.gof.compilelock): Waiting for existing lock by process '2453' (I am process '2165')
INFO (theano.gof.compilelock): To manually release the lock, delete /home/ubuntu/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-2.7.12-64/lock_dir


In [265]:
len(idx)

3000

In [277]:
ytext_file = open('ytext.txt','w') 

In [278]:
for i in idx:
    ytext_file.write(str(i)+'\n')
ytext_file.close()

# Intuition

As the train accuracy seems good, validation acccuracy is less and the model seem to be still overfitting.

### From Above cells it is clear that labels with 4,5,2,11,9,0 have have very few examples.
So, building an LTSM model for them and generate the text later using generative model to oversample the data instead of duplicating the data as it was done earlier

For other labels, data is balanced. So for these labels, I will oversample using duplication method as done previously

# LSTM Model

As the validation accuracy doesn't increase using CNN, trying LSTM Model

In [250]:
def prepare_ltsm_data(raw_text):
    chars = sorted(list(set(raw_text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    # summarize the loaded data
    n_chars = len(raw_text)
    n_vocab = len(chars)
    print "Total Characters: ", n_chars
    print "Total Vocab: ", n_vocab
    # prepare the dataset of input to output pairs encoded as integers
    seq_length = 452
    dataX = []
    dataY = []
    for i in range(0, n_chars - seq_length, 1):
        seq_in = raw_text[i:i + seq_length]
        seq_out = raw_text[i + seq_length]
        dataX.append([char_to_int[char] for char in seq_in])
        dataY.append(char_to_int[seq_out])
    n_patterns = len(dataX)
    X = np.reshape(dataX, (n_patterns, seq_length, 1))
    # normalize
    X = X / float(n_vocab)
    # one hot encode the output variable
    y = to_categorical(dataY)
    return X,y

In [251]:
def lstm_model(X):
    model = Sequential()
    model.add(LSTM(452, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    return model

In [259]:
def compile_and_fit(model,label,X,y,epochs,batch_size=128,verbose=1):
    print('label: '+str(label))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    filepath="weights-ltsm"+str(label)+"-improvement-{epoch:02d}-{loss:.4f}.hdf5"
    model.fit(X,
              y,
              nb_epoch=epochs,
              batch_size=batch_size,
              callbacks=callbacks_list,
              verbose=verbose)
    