In [82]:
import argparse
import pickle
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from collections import Counter
import os
from sklearn import metrics
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
from scipy import stats
import tflearn
from sklearn.manifold import TSNE
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
import tensorflow as tf
os.environ['KERAS_BACKEND']='theano'
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model,Sequential
from keras import initializers, optimizers

In [83]:
from AttentionLayer import AttLayer

#Here we add layers to our model    
def get_blstm_atten(inp_dim, vocab_size, num_classes, learn_rate):
    """Build and compile the bidirectional-LSTM + attention classifier.

    Layer stack: Embedding -> Dropout(0.25) -> Bidirectional(LSTM) ->
    AttLayer -> Dropout(0.50) -> Dense(softmax).

    Args:
        inp_dim: Length of each (padded) input sequence of word ids.
        vocab_size: Number of rows of the embedding table.
        num_classes: Number of units of the final softmax layer.
        learn_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model (its summary is printed as a
        side effect).

    Reads the module-level ``EMBED_SIZE`` for both the embedding width and the
    number of LSTM units per direction.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, EMBED_SIZE, input_length=inp_dim))
    model.add(Dropout(0.25))
    model.add(Bidirectional(LSTM(EMBED_SIZE, return_sequences=True)))
    # AttLayer comes from the project-local AttentionLayer module; it consumes
    # the per-timestep LSTM outputs (hence return_sequences=True above).
    model.add(AttLayer())
    model.add(Dropout(0.50))
    model.add(Dense(num_classes, activation='softmax'))
    adam = optimizers.Adam(lr=learn_rate, beta_1=0.9, beta_2=0.999)
    # Pass the configured optimizer instance. The previous code passed the
    # string 'adam', which makes Keras build a default Adam and silently
    # ignores `learn_rate`.
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    model.summary()
    return model

In [84]:
# Loads the already cleaned and prepared dataset from the files
def load_cleaned_data(filename):
    """Load a pickled dataset and split it into texts and labels.

    The pickle is expected to hold an integer-indexable collection of records,
    each exposing a ``'text'`` and a ``'label'`` entry.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        A ``(x_text, labels)`` pair of lists, in record order.
    """
    print("Loading data from file: " + filename)
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # dataset files that come from a trusted source.
    # The context manager guarantees the file handle is closed (the previous
    # bare open() leaked it).
    with open(filename, 'rb') as handle:
        data = pickle.load(handle)
    # Index-based access is kept on purpose so any container that supports
    # len() and integer indexing keeps working.
    positions = range(len(data))
    x_text = [data[i]['text'] for i in positions]
    labels = [data[i]['label'] for i in positions]
    return x_text, labels

def load_csv_data(filename):
    """Read a CSV dataset and return its texts and labels.

    The first CSV column is used as the index. Every value of the ``'text'``
    column is converted with ``str``; the ``'label'`` column is returned
    unchanged as it comes out of pandas.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(x_text, labels)`` pair: a list of strings and the label column.
    """
    print("Loading data from file: " + filename)
    frame = pd.read_csv(filename, index_col=0)
    texts = list(map(str, frame['text']))
    return texts, frame['label']

# Gets the filenames for a particular kind of dataset
def get_filename(dataset):
    """Return the data file path for ``dataset`` and set ``NUM_CLASSES``.

    Side effect: the module-level ``NUM_CLASSES`` is overwritten with the
    number of classes of the selected dataset.

    Args:
        dataset: One of "twitter", "formspring", "wiki" or "sentiment140".

    Returns:
        Relative path of the dataset file.

    Raises:
        ValueError: If ``dataset`` is not a known name. Previously this fell
            through to ``return filename`` and surfaced as an opaque
            UnboundLocalError.
    """
    global NUM_CLASSES
    # dataset name -> (number of classes, file path)
    known = {
        "twitter": (3, "data/twitter_data.pkl"),
        "formspring": (2, "data/formspring_data.pkl"),
        "wiki": (2, "data/wiki_data.pkl"),
        "sentiment140": (2, "data/sentiment140.csv"),
    }
    if dataset not in known:
        raise ValueError(
            "Unknown dataset %r; expected one of %s" % (dataset, sorted(known)))
    NUM_CLASSES, filename = known[dataset]
    return filename

In [85]:
def evaluate_trained_model(model, testX, testY):
    """Score ``model`` on the test set and print per-class metrics.

    Predicted and true classes are obtained with an argmax over axis 1 of the
    model output and of ``testY`` respectively. Precision, recall and F1 are
    computed with ``average=None`` (one value per class).

    Args:
        model: Object exposing ``predict(testX)``.
        testX: Test inputs passed to ``model.predict``.
        testY: Test targets; argmax over axis 1 gives the true class.

    Returns:
        Tuple ``(precision, recall, f1)`` as returned by sklearn.
    """
    predicted_classes = np.argmax(model.predict(testX), 1)
    true_classes = np.argmax(testY, 1)

    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    scores = [scorer(true_classes, predicted_classes, average=None)
              for scorer in scorers]

    captions = ("Precision: ", "Recall: ", "f1_score: ")
    for caption, value in zip(captions, scores):
        print(caption + str(value) + "\n")

    return tuple(scores)

In [86]:
def get_train_test_split(data, x_text, labels):
    """Split, index, pad and one-hot encode a text dataset.

    Steps:
      1. 90/10 train/test split with a fixed ``random_state`` of 42.
      2. Pick the document length: the longest post for "twitter", otherwise
         the 95th percentile of the space-separated token counts.
      3. Fit a ``VocabularyProcessor`` and map documents to word-id vectors.
      4. Pad the id vectors and one-hot encode the labels.

    Args:
        data: Dataset name; only used to pick the length rule and stored in
            the result.
        x_text: List of documents (strings).
        labels: Class ids aligned with ``x_text``.

    Returns:
        Dict with keys "data", "trainX", "trainY", "testX", "testY" and
        "vocab_processor".

    Reads the module-level ``MAX_FEATURES`` and ``NUM_CLASSES``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        x_text, labels, random_state=42, test_size=0.10)

    token_counts = np.array([len(doc.split(" ")) for doc in x_text])
    if data == "twitter":
        max_document_length = max(token_counts)
    else:
        max_document_length = int(np.percentile(token_counts, 95))
    print("Document length : " + str(max_document_length))

    # NOTE(review): MAX_FEATURES is passed as the second positional argument;
    # presumably that slot is `min_frequency`, not a vocabulary cap -- confirm.
    # NOTE(review): the vocabulary is fitted on all of x_text, i.e. including
    # the documents that end up in the test split.
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length, MAX_FEATURES).fit(x_text)

    def to_padded_ids(documents):
        # Word ids per document, forced to max_document_length columns.
        ids = np.array(list(vocab_processor.transform(documents)))
        return pad_sequences(ids, maxlen=max_document_length, value=0.)

    def to_one_hot(class_ids):
        # Class ids -> binary indicator matrix with NUM_CLASSES columns.
        return to_categorical(np.asarray(class_ids), nb_classes=NUM_CLASSES)

    return {
        "data": data,
        "trainX": to_padded_ids(X_train),
        "trainY": to_one_hot(Y_train),
        "testX": to_padded_ids(X_test),
        "testY": to_one_hot(Y_test),
        "vocab_processor": vocab_processor,
    }

In [87]:
def return_data(data_dict):
    """Unpack a dataset dict into a fixed-order tuple.

    Returns:
        ``(data, trainX, trainY, testX, testY, vocab_processor)`` read from the
        same-named keys of ``data_dict``.

    Raises:
        KeyError: If any of those keys is missing.
    """
    ordered_keys = ("data", "trainX", "trainY", "testX", "testY", "vocab_processor")
    return tuple(data_dict[key] for key in ordered_keys)

In [88]:
def shuffle_weights(model, weights=None):
    """Randomly permute the entries of each weight array and load them.

    Every array keeps its shape and its multiset of values; only the positions
    of the values inside the array are shuffled (via ``np.random``).

    Args:
        model: Object exposing ``get_weights()`` and ``set_weights(list)``.
        weights: Optional list of arrays to shuffle; when ``None`` the model's
            current weights are used.
    """
    source = model.get_weights() if weights is None else weights
    shuffled = []
    for tensor in source:
        flat_permuted = np.random.permutation(tensor.flat)
        shuffled.append(flat_permuted.reshape(tensor.shape))
    model.set_weights(shuffled)

In [89]:
def train_evaluate(data_dict, dump_embeddings=False):
    """Train the BLSTM-attention model and evaluate it on the test split.

    Args:
        data_dict: Dict as produced by ``get_train_test_split``.
        dump_embeddings: When true, the learned embedding matrix, the
            vocabulary processor and the trained model are written under
            ``output_folder_name`` as ``<data>.pkl``, ``<data>_vbp.pkl`` and
            ``<data>.model``.

    Returns:
        The ``(precision, recall, f1)`` tuple from ``evaluate_trained_model``.

    Reads the module-level ``NUM_CLASSES``, ``LEARN_RATE``, ``EPOCHS``,
    ``BATCH_SIZE`` and ``output_folder_name``.
    """
    data, trainX, trainY, testX, testY, vocab_processor = return_data(data_dict)

    vocab_size = len(vocab_processor.vocabulary_)
    print("Vocabulary Size: {:d}".format(vocab_size))

    if dump_embeddings:
        # Create the output directory up front: otherwise a missing folder is
        # only discovered after the whole training run, when the dumps fail.
        out_dir = os.path.dirname(output_folder_name)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    print("Running Model...")
    model = get_blstm_atten(trainX.shape[1], vocab_size, NUM_CLASSES, LEARN_RATE)

    # Start from a random permutation of the freshly initialised weights.
    initial_weights = model.get_weights()
    shuffle_weights(model, initial_weights)

    model.fit(trainX, trainY, epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE, verbose=1)

    if dump_embeddings:
        # Layer 0 is the Embedding layer; its first weight array is the
        # embedding matrix.
        embed = model.layers[0].get_weights()[0]

        embed_filename = output_folder_name + data + ".pkl"
        embed.dump(embed_filename)

        vbp_filename = output_folder_name + data + "_vbp.pkl"
        vocab_processor.save(vbp_filename)

        model_filename = output_folder_name + data + ".model"
        model.save(model_filename)

    return evaluate_trained_model(model, testX, testY)

In [90]:
from collections import Counter
# Loads the selected dataset and oversamples its minority classes (no punctuation removal is done here)
def get_data(data):
    """Load a dataset and oversample its minority classes.

    * "twitter": string labels are mapped to ids (none=0, sexism=1, racism=2)
      and the racism and sexism posts are oversampled.
    * any other name: posts with label 1 are oversampled.
    * "sentiment140": the "wiki" dataset is loaded and oversampled as above,
      then the sentiment140 CSV rows are appended as-is.

    Oversampling appends ``OVERSAMPLING_RATE - 1`` extra copies of every
    minority post, so each ends up present ``OVERSAMPLING_RATE`` times.

    Args:
        data: Dataset name understood by ``get_filename``.

    Returns:
        ``(x_text, labels)`` lists of equal length. The label distribution is
        printed as a side effect.

    Note: the module-level ``NUM_CLASSES`` is set by ``get_filename``; the
    former ``NUM_CLASSES = ...`` assignments in this function only created an
    unused local and have been removed.
    """
    if data == "sentiment140":
        x_text2, labels2 = load_csv_data(get_filename(data))
        x_text, labels = load_cleaned_data(get_filename("wiki"))
    else:
        x_text, labels = load_cleaned_data(get_filename(data))

    if data == "twitter":
        label_ids = {'racism': 2, 'sexism': 1, 'none': 0}
        labels = [label_ids[name] for name in labels]
        minority_classes = (2, 1)  # racism first, then sexism
    else:
        labels = list(labels)
        minority_classes = (1,)

    extra_copies = OVERSAMPLING_RATE - 1
    extra_text, extra_labels = [], []
    for class_id in minority_classes:
        # Membership is decided on the original data only, so copies added
        # for one class never influence another.
        members = [text for text, label in zip(x_text, labels) if label == class_id]
        extra_text += members * extra_copies
        extra_labels += [class_id] * (len(members) * extra_copies)
    x_text = x_text + extra_text
    labels = labels + extra_labels

    if data == "sentiment140":
        x_text.extend(x_text2)
        labels.extend(labels2)

    print("Counter after oversampling")
    print(Counter(labels))

    return x_text, labels

In [91]:
def train_model(data):
    """Run the full pipeline for one dataset: load, split, train, evaluate.

    Embeddings, the vocabulary processor and the model are always dumped
    (``train_evaluate`` is called with dumping enabled). Nothing is returned.

    Args:
        data: Dataset name understood by ``get_data``.
    """
    texts, targets = get_data(data)
    prepared = get_train_test_split(data, texts, targets)
    train_evaluate(prepared, dump_embeddings=True)


In [92]:
# Training hyper-parameters passed to model.fit in train_evaluate
EPOCHS = 10
BATCH_SIZE = 128
# Passed as the second positional argument of VocabularyProcessor in
# get_train_test_split -- NOTE(review): presumably a minimum word frequency
# rather than a feature cap; confirm.
MAX_FEATURES = 2
# Placeholder; overwritten by get_filename() once a dataset is selected
NUM_CLASSES = None
# NOTE(review): not referenced by any function in the visible cells (the model
# hard-codes its dropout rates)
DROPOUT = 0.25
# Learning rate handed to get_blstm_atten
LEARN_RATE = 0.01
# Embedding width and number of LSTM units per direction
EMBED_SIZE = 50
# OVERSAMPLING_RATE is the multiple by which bullying or sexist/racist data is increased
OVERSAMPLING_RATE = 3
# Path prefix for the dumped embeddings, vocabulary processor and model
output_folder_name = "results/"
# Selects the dataset to train on
# Can select from [formspring, wiki, twitter, sentiment140]
DATA = "formspring"

# Kicks off loading, training and evaluation for the selected dataset
train_model(DATA)

Loading data from file: data/formspring_data.pkl
Counter after oversampling
Counter({0: 11997, 1: 2328})
Document length : 62
Vocabulary Size: 7190
Running Model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 62, 50)            359500    
_________________________________________________________________
dropout_11 (Dropout)         (None, 62, 50)            0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 62, 100)           40400     
_________________________________________________________________
att_layer_6 (AttLayer)       (None, 100)               100       
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)     

KeyboardInterrupt: 