In [1]:
#import 
from keras import backend as K
from keras.utils import generic_utils

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.layers import Input,Dropout ,Dense, Conv1D, MaxPooling1D, Flatten
from keras.models import Model
from keras.utils import to_categorical
from keras.layers import Embedding
from keras.utils import plot_model

import pandas as pd
import numpy as np
#For plotting the evaluations
import matplotlib.pyplot as plt
#Pickle
import pickle

Using TensorFlow backend.


In [2]:
import re
###
# normalize contractions
# @param string string to normalize
# @return normalized string
###
def normalize_contractions(string):
    contractions = {
        "i'm": "i am",
        "you're": "you are",
        "it's": "it is",
        "we're": "we are",
        "we'll": "we will",
        "w/": "with",
        " w ": "with",
        ":\)": "",
        ":\(": ""
    }
    for k, v in contractions.items():
        string = string.replace(k, v)
    return string

###
# remove non ascii chars (especially to remove emojis)
# @param string string to perform removal on
# @return string without ascii chars
###
def remove_non_ascii(string): # especially emojis, attn with foreign languages (but they are not considered here)
    return re.sub(r'[^\x00-\x7F]+', '', string)

###
# remove rt (retweet mentions)
# @param string string to perform removal on
# @return string without rt
###
def remove_rt(string): # remove rt incl. user mention (RT to colon)
    return re.sub('RT \@+\w+:\s', '', string)

###
# remove user mentions except at @...
# @param string string to perform removal on
# @return string without user mention
###
def remove_usermentions(string): # WITHOUT "at @..." because it may refer to company name!!!
    return re.sub('/(?<!at )\@+\w+/g', '', string)

###
# remove url
# @param string string to perform removal on
# @return string without url
###
def remove_url(string):
    return re.sub('http\S+', '', string)   

###
# remove hashtags
# @param string string to perform removal on
# @return string without hashtags
###
def remove_hashtags(string):
    return re.sub('\#\S+ *', '', string)

# perform preprocessing steps by running related methods
# @return preprocessed dataframe
###
def preprocess_dataset(df):
    df_to_return = df
    for i in df_to_return.index:
        df_to_return.at[i, 1] = remove_hashtags(df_to_return.at[i, 1])
        df_to_return.at[i, 1] = normalize_contractions(df_to_return.at[i, 1])
        df_to_return.at[i, 1] = remove_non_ascii(df_to_return.at[i, 1])
        df_to_return.at[i, 1] = remove_rt(df_to_return.at[i, 1])
        df_to_return.at[i, 1] = remove_usermentions(df_to_return.at[i, 1])
        df_to_return.at[i, 1] = remove_url(df_to_return.at[i, 1])
    return df_to_return

In [3]:
# code to compute fmeasure by keras team github repository, cf. https://github.com/keras-team/keras/blob/1c630c3e3c8969b40a47d07b9f2edda50ec69720/keras/metrics.py

def binary_accuracy(y_true, y_pred):
    return K.mean(K.equal(y_true, K.round(y_pred)))


def categorical_accuracy(y_true, y_pred):
    return K.mean(K.equal(K.argmax(y_true, axis=-1),
                          K.argmax(y_pred, axis=-1)))


def sparse_categorical_accuracy(y_true, y_pred):
    return K.mean(K.equal(K.max(y_true, axis=-1),
                          K.cast(K.argmax(y_pred, axis=-1), K.floatx())))


def top_k_categorical_accuracy(y_true, y_pred, k=5):
    return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k))


def mean_squared_error(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true))


def mean_absolute_error(y_true, y_pred):
    return K.mean(K.abs(y_pred - y_true))


def mean_absolute_percentage_error(y_true, y_pred):
    diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true),
                                            K.epsilon(),
                                            None))
    return 100. * K.mean(diff)


def mean_squared_logarithmic_error(y_true, y_pred):
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.mean(K.square(first_log - second_log))


def hinge(y_true, y_pred):
    return K.mean(K.maximum(1. - y_true * y_pred, 0.))


def squared_hinge(y_true, y_pred):
    return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)))


def categorical_crossentropy(y_true, y_pred):
    return K.mean(K.categorical_crossentropy(y_pred, y_true))


def sparse_categorical_crossentropy(y_true, y_pred):
    return K.mean(K.sparse_categorical_crossentropy(y_pred, y_true))


def binary_crossentropy(y_true, y_pred):
    return K.mean(K.binary_crossentropy(y_pred, y_true))


def kullback_leibler_divergence(y_true, y_pred):
    y_true = K.clip(y_true, K.epsilon(), 1)
    y_pred = K.clip(y_pred, K.epsilon(), 1)
    return K.mean(K.sum(y_true * K.log(y_true / y_pred), axis=-1))


def poisson(y_true, y_pred):
    return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()))


def cosine_proximity(y_true, y_pred):
    y_true = K.l2_normalize(y_true, axis=-1)
    y_pred = K.l2_normalize(y_pred, axis=-1)
    return -K.mean(y_true * y_pred)


def matthews_correlation(y_true, y_pred):
    """Matthews correlation metric.

    It is only computed as a batch-wise average, not globally.

    Computes the Matthews correlation coefficient measure for quality
    of binary classification problems.
    """
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def fbeta_score(y_true, y_pred, beta=1):
    """Computes the F score.

    The F score is the weighted harmonic mean of precision and recall.
    Here it is only computed as a batch-wise average, not globally.

    This is useful for multi-label classification, where input samples can be
    classified as sets of labels. By only using accuracy (precision) a model
    would achieve a perfect score by simply assigning every class to every
    input. In order to avoid this, a metric should penalize incorrect class
    assignments as well (recall). The F-beta score (ranged from 0.0 to 1.0)
    computes this, as a weighted mean of the proportion of correct class
    assignments vs. the proportion of incorrect class assignments.

    With beta = 1, this is equivalent to a F-measure. With beta < 1, assigning
    correct classes becomes more important, and with beta > 1 the metric is
    instead weighted towards penalizing incorrect class assignments.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def fmeasure(y_true, y_pred):
    """Computes the f-measure, the harmonic mean of precision and recall.

    Here it is only computed as a batch-wise average, not globally.
    """
    return fbeta_score(y_true, y_pred, beta=1)


# aliases
mse = MSE = mean_squared_error
mae = MAE = mean_absolute_error
mape = MAPE = mean_absolute_percentage_error
msle = MSLE = mean_squared_logarithmic_error
cosine = cosine_proximity
fscore = f1score = fmeasure


def get(identifier):
    return get_from_module(identifier, globals(), 'metric')    

In [None]:
#Create Dataframe
df = pd.read_csv("extended_training_set.csv", sep='\t', header=None,error_bad_lines=False)
df = preprocess_dataset(df)
# split test set
texts = df[1]
labels = df[0]

MAX_NB_WORDS = 5000 # consider to 5,000 most occuring words in dataset
MAX_SEQUENCE_LENGTH = 1000 # truncate sequences to a maximum length of 1000 words
VALIDATION_SPLIT = 0.2

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

labels = to_categorical(labels)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

embeddings_index = {}

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
#Set embeding layer to trainable
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# 3 hidden layers with 128 neurons each
x = Conv1D(64, 5, activation='relu')(embedded_sequences)
x = Dropout(0.5)(x) #add Regularization
x = MaxPooling1D(5)(x)
x = Conv1D(64, 5, activation='relu')(x)
#x = Dropout(0.3)(x)
x = MaxPooling1D(5)(x)
x = Conv1D(64, 5, activation='relu')(x)
#x = Dropout(0.2)(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x) # 2 ... binary
#Instantiate model
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics= ['acc'])

# 20 epochs fit the data on the model
model_hist = model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=20, batch_size=128)

In [None]:
# plot evaluation metrics and epochs Accuracy
acc = model_hist.history['acc']
val_acc = model_hist.history['val_acc']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'ro', label='Training Accuracy.')
plt.plot(epochs, val_acc, 'r', label='Validation Accuracy.')



plt.title('Training and validation accuracy.')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# plot evaluation metrics and epochs Loss
loss = model_hist.history['loss']
val_loss = model_hist.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss') # bo ... blue dot
plt.plot(epochs, val_loss, 'b', label='Validation loss')



plt.title('Training and validation loss.')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [20]:
# perform prediction on test set
df = pd.read_csv("expl_18k.csv", sep='\t', header=None)

texts = df[1]
labels = df[0]

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

sequences= tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(labels)
prediction = model.predict(data, batch_size=10)[0,1]
prediction1 = model.evaluate(data, labels, verbose=1)
#print(prediction)
print(prediction1) # loss, f1

Found 13074 unique tokens.
[0.9827842817531198, 0.48201209806110606]


In [7]:
#import Twitter
from twython import Twython
from twython import TwythonStreamer

from twitter import *
APP_KEY="Your keys"
APP_SECRET=""
OAUTH_TOKEN=""
OAUTH_TOKEN_SECRET=""

In [28]:
class MyStreamer(TwythonStreamer): 
    def on_success(self, data): 
        if 'text' in data:
            sequences= tokenizer.texts_to_sequences([data['text']])
            dat = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
            prediction = model.predict(dat, batch_size=64)[0,1]

            if prediction >0.8:
                print(data['text'])
                print(prediction)

def on_error(self, status_code, data):
    print(status_code)

In [None]:
stream = MyStreamer(APP_KEY, APP_SECRET,
                    OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
stream.statuses.filter(track='Hirring , Job , CareerArc ',language='en')

In [124]:
# save the model to disk
filename = 'x1.sav'
pickle.dump(model, open(filename, 'wb'))

In [4]:
# load the model from disk
filename = 'x1.sav'
loaded_model = pickle.load(open(filename, 'rb'))

import tensorflow as tf
global graph
graph = tf.get_default_graph()