# Functions

### Imports

In [2]:
# Necessary Imports
import csv
import os
import re
import itertools
import pandas as pd
import numpy as np
np.random.seed(1234)
from itertools import groupby
import contractions
import spacy
from collections import Counter
import chardet 
from os.path import join, exists, split
from gensim.models import word2vec
from gensim.models import Word2Vec,KeyedVectors
from copy import deepcopy
from multiprocessing import cpu_count

nlp = spacy.load('en')

### Text pre-processing

In [3]:
# Functions for text pre-processing

def unescapematch(matchobj):
    r"""
    Turn a matched ``\uXXXX`` escape sequence into the character it encodes.

    Meant to be used as the replacement callback of ``re.sub``: characters
    2..5 of the matched text (the four characters after the leading ``\u``)
    are parsed as a hexadecimal code point and the corresponding character
    is returned.
    """
    hex_digits = matchobj.group(0)[2:6]
    return chr(int(hex_digits, 16))

# The patterns are written as raw strings so that regex escapes such as
# \d, \s and \. are not parsed as (invalid) string escapes, and they are
# compiled once here instead of on every call.
_EMAIL_RE = re.compile(
    r"([A-Za-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*"
    r"(@)(?:[A-Za-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|\sdot\s))+"
    r"[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_PHONE_RE = re.compile(
    r"([+]\d{12}|[+]?\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}"
    r"|\(\d{3}\)[-\.\s]??\d{3}[\-\.\s]??\d{4}"
    r"|[+]\d{1,2}[-\.\s]??\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4})"
)


def replace_url_phone(text):
    """
    Replace contact-like substrings of ``text`` with fixed placeholder words.

    The substitutions are applied in this order:
    1) email addresses -> ``emailid``
    2) http/https urls -> ``url``
    3) phone numbers   -> ``phonenumber``

    Emails are handled first so that the later patterns only see the
    placeholder, not the address itself.

    Args:
        text: the string to process.

    Returns:
        The string with every match replaced by its placeholder.
    """
    text = _EMAIL_RE.sub('emailid', text)
    text = _URL_RE.sub('url', text)
    text = _PHONE_RE.sub('phonenumber', text)
    return text
 
def clean_string(string):
    r"""
    Normalise a raw sentence before tokenisation.

    Steps, in order:
    1) literal ``\uXXXX`` escape sequences are decoded to their character
    2) every character outside the ASCII range is dropped
    3) contractions are expanded, e.g. I'm -> I am
    4) email ids, urls and phone numbers are replaced by placeholders
    5) a space is placed between runs of word characters and punctuation
    6) the symbols $, %, & and " are replaced by the words dollar,
       percent, and, quote; backslashes are removed
    7) runs of the same non-word character are collapsed to a single one
    8) repeated whitespace is squeezed and the ends are stripped

    Args:
        string: the text to clean.

    Returns:
        The cleaned string.
    """
    string = re.sub(r'(\\u[0-9A-Fa-f]{4})', unescapematch, string)
    # drop anything that is still non-ASCII after decoding the escapes
    string = re.sub(r'[^\x00-\x7f]', r'', string)
    string = contractions.fix(string)
    string = replace_url_phone(string)
    # add space between punctuation and text
    string = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", string)
    # plain substring replacement: no regex needed for single symbols
    string = string.replace('$', " dollar ")
    string = string.replace('%', " percent ")
    string = string.replace('&', " and ")
    string = string.replace('"', " quote ")
    string = string.replace("\\", "")
    # remove multiple instances of punctuation; the result has to be
    # assigned back, otherwise this step has no effect
    string = re.sub(r'(\W)(?=\1)', '', string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def check_encoding(filepath):
    """
    Detect the character encoding of a file.

    The whole file is read as bytes and handed to ``chardet.detect``.

    Args:
        filepath: path of the file to inspect.

    Returns:
        The value stored under ``'encoding'`` in chardet's result.
    """
    # the context manager closes the handle even if reading fails
    with open(filepath, 'rb') as raw_file:
        rawdata = raw_file.read()
    result = chardet.detect(rawdata)
    return result['encoding']

# Entity labels that are replaced by a "<LABEL>_ent" placeholder token.
_REPLACED_ENTITY_TYPES = frozenset([
    'ORG', 'GPE', 'LOC', 'NORP', 'CARDINAL', 'FACILITY', 'MONEY', 'PERSON',
    'DATE', 'TIME', 'PERCENT', 'QUANTITY', 'ORDINAL',
])


def replace_entity(sentence):
    """
    Replaces specific entities in text
    Ex: Harry lives in Pennsylvania -> PERSON_ent lives in GPE_ent

    The sentence is parsed with the module-level ``nlp`` pipeline, each
    detected entity span is merged into one token, and tokens whose entity
    type is in the replaced set are swapped for ``<type>_ent``. All other
    tokens keep their original text.

    Consecutive identical items are then collapsed into one, so compound
    entities such as State-College (ORG ORG ORG) become a single ORG_ent.
    Note that this also collapses ordinary words repeated back to back.

    Args:
        sentence: the text to process.

    Returns:
        A list of token strings; an empty list when the parsed document
        has no tokens.
    """
    doc = nlp(sentence)
    for ent in doc.ents:
        ent.merge()

    replaced = [
        token.ent_type_ + '_ent'
        if token.ent_type_ in _REPLACED_ENTITY_TYPES
        else token.orth_
        for token in doc
    ]
    # collapse runs once, after all tokens are collected; doing it here
    # (not inside the token loop) also defines the result for an empty doc
    return [item for item, _ in groupby(replaced)]

### Load data and prepare for embedding

In [4]:
# Functions to load data and produce embeddings
def load_data_and_labels(folderpath,filename):
    """
    Read a tab-separated dataset and turn it into tokenised sentences and labels.

    The first row is treated as the header; it is printed and used to find
    the 'SENTENCES' and 'binary_labels' columns. For every following row both
    cells are evaluated as Python expressions, and each sentence is cleaned,
    entity-replaced and lower-cased. The label kept for sentence ``i`` is the
    first element of ``labels[i]``.

    Returns a pair ``(x_text, y)``: the list of token lists and the list of
    labels, in file order.
    """
    x_text = []
    y = []
    with open(os.path.join(folderpath, filename), 'r',encoding="utf-8") as datafile:
        reader = csv.reader(datafile,delimiter='\t')

        # The header row tells us where the two columns of interest live
        header = next(reader, None)
        if header is not None:
            print('The columns in the file are:\n ',header)
            text_index = header.index('SENTENCES')
            class_index = header.index('binary_labels')

        for row in reader:
            # NOTE(review): eval executes whatever expression the file
            # contains; only use this loader on trusted data files.
            comment = eval(row[text_index])
            labels = eval(row[class_index])
            for position, raw_sentence in enumerate(comment):
                tokens = replace_entity(clean_string(raw_sentence))
                lowered = [word.lower() for word in tokens]
                label = labels[position][0]
                x_text.append(lowered)
                y.append(label)
    return x_text,y
         
        
def pad_sequences(sentences, pad_len=None, padding_word="<PAD/>"):
    """
    Bring every sentence to the same number of tokens.

    Sentences shorter than the target length are extended on the right with
    ``padding_word``; longer ones are cut off after the target length. The
    input sentences are not modified.

    Args:
        sentences: sequence of token lists.
        pad_len: target length. When None, the length of the longest
            sentence is used (0 for an empty input).
        padding_word: token used to fill short sentences.

    Returns:
        A new list of token lists, each of the target length.
    """
    if pad_len is not None:
        sequence_length = pad_len
    else:
        # default=0 keeps an empty corpus from raising in max()
        sequence_length = max((len(sentence) for sentence in sentences), default=0)

    padded_sentences = []
    for sentence in sentences:
        new_sentence = list(sentence[:sequence_length])
        # a non-positive count multiplies to an empty list, so sentences
        # that were cut (or already fit) get no padding
        new_sentence.extend([padding_word] * (sequence_length - len(new_sentence)))
        padded_sentences.append(new_sentence)
    return padded_sentences

def build_vocab(sentences):
    """
    Derive vocabulary lookups from tokenised sentences.

    Returns a triple:
    - a Counter with the number of occurrences of every token,
    - a dict mapping each token to its index,
    - a list mapping each index back to its token,
    where indices follow the Counter's ``most_common()`` order.
    """
    # Token frequencies over the whole corpus
    word_counts = Counter(itertools.chain.from_iterable(sentences))

    # index -> word
    vocabulary_inv = [word for word, _count in word_counts.most_common()]

    # word -> index
    vocabulary = {}
    for index, word in enumerate(vocabulary_inv):
        vocabulary[word] = index

    return word_counts, vocabulary, vocabulary_inv

def load_data(folderpath, filename, pad_len=30):
    """
    Load the dataset and convert it to index arrays.

    The sentences and labels are read with ``load_data_and_labels``, every
    sentence is padded or truncated to ``pad_len`` tokens, a vocabulary is
    built over the padded sentences, and the tokens are mapped to their
    vocabulary indices.

    Args:
        folderpath: directory that contains the data file.
        filename: name of the data file.
        pad_len: number of tokens per sentence after padding (default 30).

    Returns:
        A list ``[x, y, vocabulary, vocabulary_inv, word_counts]`` where
        ``x`` and ``y`` are the arrays from ``build_input_data``,
        ``vocabulary`` maps word -> index, ``vocabulary_inv`` is the
        index -> word list and ``word_counts`` is the Counter of token
        frequencies returned by ``build_vocab`` (not an integer size).
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels(folderpath, filename)
    sentences_padded = pad_sequences(sentences, padding_word="<PAD/>", pad_len=pad_len)
    word_counts, vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, word_counts]

def build_input_data(sentences, labels, vocabulary):
    """
    Convert tokenised sentences and their labels into numpy arrays.

    Every token is looked up in ``vocabulary`` (word -> index); the result is
    returned as the two-element list ``[x, y]`` holding the index array and
    the label array.
    """
    index_rows = []
    for sentence in sentences:
        index_rows.append([vocabulary[token] for token in sentence])
    x = np.array(index_rows)
    y = np.array(labels)
    return [x, y]

def load_trainable_dataset(folderpath,filename):
    """
    Load the dataset, shuffle it and split it into train and test parts.

    The data comes from ``load_data``. Rows are shuffled with a permutation
    drawn from numpy's global random state; the first 80% of the shuffled
    rows form the training set and the remainder the test set.

    Returns ``(x_train, y_train, x_test, y_test, vocabulary, vocabulary_inv)``
    where ``vocabulary_inv`` is a dict mapping index -> word.
    """
    features, targets, vocabulary, inv_words, _counts = load_data(folderpath,filename)

    # index -> word as a dict rather than a list
    vocabulary_inv = dict(enumerate(inv_words))

    # Same random order for inputs and labels
    order = np.random.permutation(np.arange(len(targets)))
    features = features[order]
    targets = targets[order]

    split_at = int(len(features) * 0.8)
    x_train, x_test = features[:split_at], features[split_at:]
    y_train, y_test = targets[:split_at], targets[split_at:]

    return x_train, y_train, x_test, y_test, vocabulary, vocabulary_inv


### Custom Keras F1, precision and recall metric with callback

In [None]:
# Custom Keras metric
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from keras.callbacks import Callback
class Metrics(Callback):
    """
    Keras callback that computes F1, precision and recall on the validation
    data at the end of every epoch.

    The per-epoch values are appended to ``val_f1s``, ``val_precisions`` and
    ``val_recalls`` and printed.
    """

    def on_train_begin(self, logs=None):
        # logs defaults to None rather than a shared mutable dict
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        # NOTE(review): relies on self.validation_data being populated by
        # Keras as (inputs, targets, ...) - confirm for the Keras version in use.
        # Predictions are rounded to the nearest integer before scoring.
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print('— val_f1: %f — val_precision: %f — val_recall %f' %(_val_f1, _val_precision, _val_recall))
        return


# Shared instance to pass in a model's callbacks list
metrics = Metrics()

# Creating embeddings: Google news trained and trained from scratch

In [126]:
# Create embeddings

# Location of the pretrained GoogleNews vectors; used both to load the
# KeyedVectors and to intersect them into the corpus-trained model.
GOOGLE_NEWS_VECTORS_PATH = '/Users/pxu3/Desktop/Deep Learning/1Bword2vec/GoogleNews-vectors-negative300.bin'

print('Loading data and pretrained word2vec model....')
sentences,labels = load_data_and_labels('/Users/pxu3/Desktop/Spring 2019/Research/Data/','Binarized_Dataset.txt')
google_wv = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_VECTORS_PATH, binary=True)
google_wv.save("/Users/pxu3/Desktop/Spring 2019/Research/Code/pretrained_googlewv")

print('Padding sentences..............')
padded_sentences = pad_sequences(sentences, padding_word="<PAD/>", pad_len=30)

print('Online training the google pretrained word2vec model')
model = Word2Vec(size=300, min_count=1, iter=10)
model.build_vocab(padded_sentences)
# The vocabulary update below overwrites corpus_count (it is fed a single
# "sentence"), so the real number of training sentences is saved first.
training_examples_count = model.corpus_count
model.build_vocab([list(google_wv.vocab.keys())], update=True)
# Initialise the vectors of words shared with GoogleNews from the pretrained
# file; lockf=1.0 leaves them trainable.
model.intersect_word2vec_format(GOOGLE_NEWS_VECTORS_PATH, binary=True, lockf=1.0)
model.train(padded_sentences,total_examples=training_examples_count, epochs=model.epochs)
model.save("/Users/pxu3/Desktop/Spring 2019/Research/Code/updated_pretrained_googlewv.bin")

print('Re-trained word2vec model')
# Second model: same settings, trained on the corpus only. It uses its own
# corpus_count/epochs so it does not depend on the state of the first model.
model1 = Word2Vec(size=300, min_count=1, iter=10)
model1.build_vocab(padded_sentences)
model1.train(padded_sentences,total_examples=model1.corpus_count, epochs=model1.epochs)
model1.save("/Users/pxu3/Desktop/Spring 2019/Research/Code/trained_wv.bin")

Loading data and pretrained word2vec model....
The columns in the file are:
  ['CommentID', 'ArticleId', 'NewsSource', 'NewsTitle', 'ParentId', 'CommentID', 'CommentText', 'TimeStampId', 'Commentor', 'CommentorInfo', 'SENTENCES', 'SVO', 'FACILITY', 'LOC', 'GPE', 'NORP', 'ORG', 'Category', 'Category_sentence', 'binary_labels']
Padding sentences..............
Online training the google pretrained word2vec model
Re-trained word2vec model


In [128]:
# Nearest neighbours of the location placeholder token in both models:
# first the GoogleNews-initialised model, then the corpus-only one.
updated_neighbours = model.wv.most_similar('loc_ent')
scratch_neighbours = model1.wv.most_similar('loc_ent')
print(updated_neighbours, scratch_neighbours)

[('gpe_ent', 0.9062396883964539), ('isis', 0.833358883857727), ('land', 0.8105503916740417), ('org_ent', 0.7978490591049194), ('military', 0.7964844703674316), ('water', 0.7870767116546631), ('countries', 0.7859431505203247), ('west', 0.7794488668441772), ('country', 0.7779181003570557), ('world', 0.7766268253326416)] [('gpe_ent', 0.8147958517074585), ('region', 0.7576183080673218), ('west', 0.752234935760498), ('south', 0.7443078756332397), ('border', 0.7019321322441101), ('waters', 0.7007555961608887), ('reefs', 0.6848381757736206), ('north', 0.6750556826591492), ('land', 0.6741229891777039), ('drought', 0.6426279544830322)]


# Load data

In [5]:
# Shuffled train/test arrays plus the word<->index vocabulary mappings
(x_train, y_train,
 x_test, y_test,
 vocabulary, vocabulary_inv) = load_trainable_dataset(
    '/Users/pxu3/Desktop/Spring 2019/Research/Data/',
    'Binarized_Dataset.txt',
)

The columns in the file are:
  ['CommentID', 'ArticleId', 'NewsSource', 'NewsTitle', 'ParentId', 'CommentID', 'CommentText', 'TimeStampId', 'Commentor', 'CommentorInfo', 'SENTENCES', 'SVO', 'FACILITY', 'LOC', 'GPE', 'NORP', 'ORG', 'Category', 'Category_sentence', 'binary_labels']


# Models

In [10]:
import tensorflow as tf
import numpy as np

# Parameters
# ==================================================

# Data loading params
#tf.app.flags.DEFINE_string('f', '', 'kernel')
#tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")

# Model Hyperparameters
#tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of character embedding (default: 300)")
#tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
#tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
#tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
#tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
#tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
#tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
#tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
#tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
#tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

# Misc Parameters
#tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
#tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

#FLAGS = tf.flags.FLAGS


class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding lookup followed by a convolutional, max-pooling and
    softmax layer.

    Constructor arguments:
        model_path: path of a saved gensim Word2Vec model; it determines the
            vocabulary size and the embedding dimensionality.
        sequence_length: number of token ids per input sentence.
        num_classes: number of output classes.
        embedding_size: overwritten by the vector size of the loaded
            word-vector model, so the passed value is not used.
        filter_sizes: iterable of convolution filter heights (in tokens).
        num_filters: number of filters per filter size.
        l2_reg_lambda: weight of the L2 penalty on the output layer.

    Graph nodes exposed as attributes: ``input_x``, ``input_y``,
    ``dropout_keep_prob`` (placeholders), ``W`` (embedding variable),
    ``scores``, ``predictions``, ``loss`` and ``accuracy``.
    """
    def __init__(
      self, model_path, sequence_length, num_classes,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.model_path = model_path

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Load the word-vector model; its vocabulary and vector size define
        # the shape of the embedding layer
        model = Word2Vec.load(self.model_path)
        wvecs = model.wv
        embedding_size = wvecs.vector_size
        vocab_size = len(wvecs.vocab)

        # Create the embedding matrix where words are indexed alphabetically.
        # The matrix must be floating point: an integer dtype would truncate
        # the word vectors on assignment.
        # NOTE(review): this matrix is currently not used to initialise
        # self.W (see the disabled initializer below), and its alphabetical
        # row order would have to match the ids fed through input_x.
        embedding_mat = np.zeros(shape=(vocab_size, embedding_size), dtype='float32')
        for idx, word in enumerate(sorted(wvecs.vocab)):
            embedding_mat[idx] = wvecs.get_vector(word)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            #self.W = tf.get_variable("embedding", [vocab_size, embedding_size],\
            #                               initializer=tf.constant_initializer(embedding_mat),trainable=False)
            self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            # conv2d expects a trailing channel axis
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over all positions of the convolution output
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")


In [11]:
# Hyperparameters. The tf.flags definitions (and FLAGS itself) are commented
# out in this file, so the values are set here directly, using the defaults
# those definitions declared.
EMBEDDING_DIM = 300
FILTER_SIZES = "3,4,5"
NUM_FILTERS = 128
L2_REG_LAMBDA = 0.0
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=ALLOW_SOFT_PLACEMENT,
                                  log_device_placement=LOG_DEVICE_PLACEMENT)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Build the model graph; the input width is the padded sentence length
        cnn = TextCNN(sequence_length=x_train.shape[1],
                num_classes=2,
                model_path = "/Users/pxu3/Desktop/Spring 2019/Research/Code/updated_pretrained_googlewv.bin",
                embedding_size=EMBEDDING_DIM,
                filter_sizes=[int(size) for size in FILTER_SIZES.split(",")],
                num_filters=NUM_FILTERS,
                l2_reg_lambda=L2_REG_LAMBDA)

KeyboardInterrupt: 