# init

In [1]:
# Mount Google Drive at /content/drive; later cells read and write files under it.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Install the third-party packages imported by later cells.
# NOTE(review): versions are not pinned. The recorded output of this cell shows
# contractions 0.0.21 and textsearch 0.0.17 being installed - consider pinning
# them so a re-run resolves the same releases.
!pip install nltk
!pip install contractions
!pip install textsearch
!pip install tensorflow-hub

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/f5/2a/ba0a3812e2a1de2cc4ee0ded0bdb750a7cef1631c13c78a4fc4ab042adec/contractions-0.0.21-py2.py3-none-any.whl
Installing collected packages: contractions
Successfully installed contractions-0.0.21
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick (from textsearch)
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 4.0MB/s 
[?25hCollecting Unidecode (from textsearch)
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 47.1MB/s 
[?25h

# data.py

In [0]:
#for data cleanup /load
import re
import contractions
import pandas as pd
import unicodedata
from joblib import dump, load
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the Keras Tokenizer (its num_words argument).
num_words = 8000 
# Length every token sequence is padded/truncated to.
max_tokens = 30
# Used for both `padding` and `truncating` in pad_sequences (operate at the end).
pad = 'post'
# Directory on the mounted Drive where the CSV is read and artifacts are written.
path = '/content/drive/My Drive/data/'


def remove_white_space(text):
    """Return ``text`` without leading or trailing whitespace.

    ``str.strip()`` with no argument already removes tabs and newlines, so a
    second ``.strip('\\t\\n')`` pass adds nothing and has been dropped.
    Whitespace inside the string is left untouched.
    """
    return text.strip()

def remove_special_character(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    # Raw string: '\s' inside a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'[^A-Za-z0-9\s]+', '', text)

def data_clean(train_data, filename):
  """Normalise the question text, drop empty questions and persist the result.

  The 'question_text' column of ``train_data`` is overwritten in place with the
  cleaned text and a 'word_count' column is added. Rows whose cleaned text has
  no words are removed, the index is reset (the previous index is kept as an
  'index' column) and the resulting frame is dumped to ``path + filename``.

  Returns the cleaned, re-indexed DataFrame.
  """
  # Cleaning pipeline: NFKD-normalise, expand contractions, trim, lowercase,
  # then strip everything except letters, digits and whitespace.
  questions = train_data['question_text'].str.normalize('NFKD')
  questions = questions.apply(contractions.fix)
  questions = questions.apply(remove_white_space)
  questions = questions.str.lower()
  questions = questions.apply(remove_special_character)
  train_data['question_text'] = questions

  def count_words(value):
    return len(str(value).split())

  train_data['word_count'] = train_data['question_text'].apply(count_words)

  # Keep only questions that still contain at least one word.
  non_empty = train_data[train_data['word_count'] > 0]
  cleaned = non_empty.reset_index()
  dump(cleaned, path + filename)
  return cleaned

def str_clean(question):
  """Apply the same cleaning steps as data_clean to one question string."""
  text = unicodedata.normalize('NFKD', question)
  text = contractions.fix(text)
  text = remove_white_space(text)
  text = text.lower()
  return remove_special_character(text)

def threeway_split(X, y):
    """Split X/y into train (80%), dev (10%) and test (10%) parts.

    Both splits are stratified on the labels and use random_state 42.
    Returns ``X_train, X_dev, X_test, y_train, y_dev, y_test``.
    """
    seed = 42
    # First cut: 80% train, 20% held out.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=0.8, test_size=0.2, random_state=seed, stratify=y)
    # Second cut: halve the held-out part into dev and test.
    X_dev, X_test, y_dev, y_test = train_test_split(
        X_rest, y_rest, train_size=0.5, test_size=0.5, random_state=seed, stratify=y_rest)
    return X_train, X_dev, X_test, y_train, y_dev, y_test

def generate(filename):
  """Read the raw CSV, clean it and write every dataset artifact to ``path``.

  Artifacts (all named ``<name>_ref.pkl``):
    * model 1: X/y train, dev and test splits of the full cleaned data;
    * model 2: the fitted Tokenizer plus padded token sequences of those splits;
    * model 3: X/y splits of a 170000-row subsample (90000 rows with target 0,
      80000 with target 1, only questions longer than 10 characters).

  Args:
    filename: name of the CSV file inside ``path``; it must provide the
      'question_text' and 'target' columns.
  """
  train_data = pd.read_csv(path+filename)
  train_data = data_clean(train_data, 'train_ref.pkl')

  # Subsample for model 3; the length filter is shared by both classes.
  long_enough = train_data['question_text'].str.len() > 10
  sample_target0 = train_data.loc[(train_data['target'] == 0) & long_enough].sample(n=90000, random_state=42)
  sample_target1 = train_data.loc[(train_data['target'] == 1) & long_enough].sample(n=80000, random_state=42)
  train_data_s = pd.concat([sample_target0, sample_target1], ignore_index=True)
  # Seed the shuffle as well: the seeded split below depends on row order, so
  # an unseeded shuffle made the small split differ on every run.
  train_data_s = train_data_s.sample(frac=1, random_state=42).reset_index(drop=True)

  X_train, X_dev, X_test, y_train, y_dev, y_test = threeway_split(train_data['question_text'], train_data['target'])
  X_train_s, X_dev_s, X_test_s, y_train_s, y_dev_s, y_test_s = threeway_split(train_data_s['question_text'], train_data_s['target'])
  print('Training data set (regular): ' + str(len(train_data)))
  print('Training data set (small): ' + str(len(train_data_s)))
  print(train_data_s.head())

  def save(name, obj):
    dump(obj, path + name + '_ref.pkl')

  #for model 1
  for name, obj in [('X_train', X_train), ('y_train', y_train),
                    ('X_dev', X_dev), ('y_dev', y_dev),
                    ('X_test', X_test), ('y_test', y_test)]:
    save(name, obj)

  #for model 2
  tokenizer = Tokenizer(num_words=num_words, lower=False, char_level=False)
  # Fit the vocabulary on the training split only, so dev/test text does not
  # leak into the preprocessing used for evaluation.
  tokenizer.fit_on_texts(X_train)

  def to_padded_tokens(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_tokens, padding=pad, truncating=pad).tolist()

  # need tokenizer and padding for predict
  save('tokenizer', tokenizer)
  save('X_train_token', to_padded_tokens(X_train))
  save('X_dev_token', to_padded_tokens(X_dev))
  save('X_test_token', to_padded_tokens(X_test))

  #for model 3
  for name, obj in [('X_train_s', X_train_s), ('y_train_s', y_train_s),
                    ('X_dev_s', X_dev_s), ('y_dev_s', y_dev_s),
                    ('X_test_s', X_test_s), ('y_test_s', y_test_s)]:
    save(name, obj)
  print("generate complete")

def test():
  """Sanity-check the artifacts written by generate().

  Checks, in order, that every X file has as many rows as its y file (regular,
  tokenized and small splits), that the regular training split has at least
  1000000 rows and that the small one has at least 100000 rows.

  Returns False on the first failed check; otherwise prints "test complete"
  and returns True. A missing artifact file raises from ``load``.
  """
  sizes = {}

  def size_of(name):
    # Each pickle is loaded at most once (and only when a check needs it);
    # the original re-read the large training files several times.
    if name not in sizes:
      sizes[name] = len(load(path + name + '_ref.pkl'))
    return sizes[name]

  pairs = [
    ('X_train', 'y_train'), ('X_dev', 'y_dev'), ('X_test', 'y_test'),
    ('X_train_token', 'y_train'), ('X_dev_token', 'y_dev'), ('X_test_token', 'y_test'),
    ('X_train_s', 'y_train_s'), ('X_dev_s', 'y_dev_s'), ('X_test_s', 'y_test_s'),
  ]
  for x_name, y_name in pairs:
    if size_of(x_name) != size_of(y_name):
      return False

  if size_of('X_train') < 1000000:
    return False
  if size_of('X_train_s') < 100000:
    return False
  print("test complete")
  return True

# Build every artifact from train.csv, then verify the saved files.
# test() is the last expression, so its boolean result is shown as the cell output.
generate('train.csv')
test()

Training data set (regular): 1306116
Training data set (small): 170000
     index                   qid  ... target  word_count
0  1145688  e07b0fb24d12b851f1c2  ...      1           9
1   318173  3e5b754e8fec9128cb0e  ...      0           8
2  1235644  f2269597fb05a3682dab  ...      0          13
3   617819  78fd3a6e150598a4d63b  ...      0           8
4  1221373  ef61c75299dab48f2b93  ...      0           7

[5 rows x 5 columns]
generate complete
test complete


True

# lg train & predict.py

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import numpy as np
from joblib import dump, load

# Upper bound on the vocabulary kept by the CountVectorizer (max_features).
num_words = 8000 
# Directory on the mounted Drive holding the pickled splits and the saved model.
path = '/content/drive/My Drive/data/'

def lg_train(X_train, y_train):
  """Fit the bag-of-words logistic-regression pipeline and save it.

  The pipeline is CountVectorizer -> TfidfTransformer -> LogisticRegressionCV
  (balanced class weights, 5-fold CV scored by ROC AUC). The fitted pipeline
  is dumped to ``path + 'logreg_ref.pkl'`` and returned.
  """
  vectorizer = CountVectorizer(max_features=num_words, min_df=2, lowercase=False)
  classifier = LogisticRegressionCV(class_weight='balanced', cv=5, scoring='roc_auc',
                                    max_iter=1000, n_jobs=-1)
  steps = [
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', classifier),
  ]
  logreg = Pipeline(steps)
  logreg.fit(X_train, y_train)
  dump(logreg, path + 'logreg_ref.pkl')
  print('complete the training')
  return logreg

def lg_predict(X_predict):
  """Load the saved logistic-regression pipeline and return its class predictions."""
  pipeline = load(path + 'logreg_ref.pkl')
  predictions = pipeline.predict(X_predict)
  print('return prediction')
  return predictions
  
def lg_test(X_dev, y_dev):
  """Return whether the saved pipeline's weighted F1 on (X_dev, y_dev) exceeds 0.90."""
  pipeline = load(path + 'logreg_ref.pkl')
  weighted_f1 = f1_score(y_dev, pipeline.predict(X_dev), average='weighted')
  return weighted_f1 > 0.90


# Train on the regular training split, check the dev-set F1 threshold, then
# predict the regular test split and show the predicted class counts.
lg_train(load(path +'X_train_ref.pkl'),load(path +'y_train_ref.pkl'))
# Dedicated names: binding the result to `test` would replace the test()
# artifact check defined in the data cell with a boolean.
lg_dev_passed = lg_test(load(path +'X_dev_ref.pkl'),load(path +'y_dev_ref.pkl'))
lg_test_pred = lg_predict(load(path +'X_test_ref.pkl'))
print(lg_dev_passed)
print(np.unique(lg_test_pred, return_counts=True))

#print(lg_predict([str_clean("Why did ancient Roman toilets have large openings in the front")]))

# rnn train & predict.py

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.models import Sequential, save_model, load_model
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from sklearn.metrics import f1_score
from joblib import dump, load
import numpy as np


# Vocabulary size: input_dim of the Embedding layer and row cap of the embedding matrix.
num_words = 8000 
# Embedding vector length (output_dim of the Embedding layer).
embedding_size = 300
# Length of every padded token sequence (input_length of the Embedding layer).
max_tokens = 30
# Used for both `padding` and `truncating` in pad_sequences.
pad = 'post'
# Directory on the mounted Drive holding the artifacts, embeddings and saved model.
path = '/content/drive/My Drive/data/'

def load_para(word_index):
    """Build an embedding matrix for ``word_index`` from the Paragram vectors.

    Args:
        word_index: mapping of word -> integer index (e.g. ``Tokenizer.word_index``).

    Returns:
        Array of shape ``(nb_words, embed_size)`` with
        ``nb_words = min(num_words, len(word_index))``. Rows of words found in
        the embedding file hold the pretrained vector; all other rows keep
        their random normal initialisation.

    Raises:
        ValueError: if no vector could be read from the embedding file.
    """
    EMBEDDING_FILE = path+'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Context manager so the file handle is closed once the dict is built.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
        # Lines of 100 characters or fewer are skipped (len(o) > 100 filter).
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100)

    if not embeddings_index:
        raise ValueError('no embedding vectors found in ' + EMBEDDING_FILE)

    emb_mean,emb_std = -0.0053247833,0.49346462
    # Only the vector length is needed; stacking every vector just to read
    # shape[1] doubled memory use, and np.stack rejects dict views on current NumPy.
    embed_size = len(next(iter(embeddings_index.values())))

    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard with the actual row count: with a vocabulary smaller than
        # num_words an index equal to nb_words would fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def rnn_train():
    """Train the stacked-GRU classifier on the tokenized splits and save it.

    Reads the tokenizer, the padded token sequences and the labels written by
    generate() (``*_ref.pkl``), builds a frozen Paragram embedding layer
    followed by four GRU layers and a sigmoid output, trains for 4 epochs and
    saves the model to ``path + 'rnn_model.h5'``.

    Returns:
        The Keras ``History`` object produced by ``model.fit``.
    """
    tokenizer= load(path+'tokenizer_ref.pkl')
    # Use the artifacts generate() writes; the former '*_project.sav' names are
    # not produced anywhere in this notebook and did not match the files that
    # rnn_test / rnn_predict are evaluated on.
    X_train_token = load(path +'X_train_token_ref.pkl')
    X_dev_token = load(path +'X_dev_token_ref.pkl')
    y_train = load(path +'y_train_ref.pkl')
    y_dev = load(path +'y_dev_ref.pkl')
    paragram_embeddings = load_para(tokenizer.word_index)
    
    model = Sequential()
    optimizer = Adam(lr=1e-3)
    # Pretrained embeddings are kept frozen (trainable=False).
    model.add(Embedding(weights=[paragram_embeddings], trainable=False, input_dim=num_words, output_dim=embedding_size, input_length=max_tokens))
    model.add(GRU(units=32, return_sequences=True))
    model.add(GRU(units=16, dropout=0.5, return_sequences=True))
    model.add(GRU(units=8, return_sequences=True))
    model.add(GRU(units=4))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['AUC', 'accuracy'])
    model.summary()
    history = model.fit(np.array(X_train_token), y_train, validation_data=(np.array(X_dev_token),y_dev), epochs=4, batch_size=500)
    save_model(model,path+'rnn_model.h5')
    print('train complete')
    return history

def rnn_predict(X_token):
    """Load the saved RNN and return 0.0/1.0 labels for the padded sequences.

    A sigmoid output strictly greater than 0.5 becomes 1.0, anything else 0.0.
    """
    network = load_model(path+'rnn_model.h5')
    # First row of the transposed output, i.e. the single output column.
    scores = network.predict(np.array(X_token)).T[0]
    labels = np.where(scores > 0.5, 1.0, 0.0)
    print('return prediction')
    return labels

def rnn_test(X_token, y_value):
    """Return whether the saved RNN's weighted F1 on (X_token, y_value) exceeds 0.90."""
    network = load_model(path+'rnn_model.h5')
    scores = network.predict(np.array(X_token)).T[0]
    # Threshold the sigmoid outputs at 0.5 to obtain hard labels.
    hard_labels = np.where(scores > 0.5, 1.0, 0.0)
    weighted_f1 = f1_score(y_value, hard_labels, average='weighted')
    return weighted_f1 > 0.90

# Train, check the dev-set F1 threshold, then predict the regular test split.
rnn_train()
# Dedicated names: binding the result to `test` would replace the test()
# artifact check defined in the data cell with a boolean.
rnn_dev_passed = rnn_test(load(path +'X_dev_token_ref.pkl'),load(path +'y_dev_ref.pkl'))
rnn_test_pred = rnn_predict(load(path+'X_test_token_ref.pkl'))
print(rnn_dev_passed)
print(np.unique(rnn_test_pred, return_counts=True))

# Single-question smoke test. The question goes through str_clean (defined in
# the data cell) first: the tokenizer was built with lower=False on cleaned,
# lower-cased text, so raw tokens such as "Why" or "Roman" would not match
# the vocabulary.
tokenizer= load(path+'tokenizer_ref.pkl')
X_token = tokenizer.texts_to_sequences([str_clean("Why did ancient Roman toilets have large openings in the front")])
X_token = pad_sequences(X_token, maxlen=max_tokens, padding=pad, truncating=pad).tolist()
print(rnn_predict(X_token))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
True
return prediction
[0.]


# lm train & predict.py

In [0]:
import tensorflow_hub as hub
import tensorflow as tf
from joblib import dump, load

# Training-step budget used to build the step range in
# train_and_evaluate_with_sentence_encoder.
TOTAL_STEPS = 4000
# Steps run by each dnn.train call, i.e. between two evaluation rounds.
STEP_SIZE = 500
# Directory on the mounted Drive holding the pickled splits and model storage.
path = '/content/drive/My Drive/data/'


#def lm_train():
#def lm_predict(X_token):
#def lm_test(X_token, y_value):
  
# Small (subsampled) splits written by generate().
# NOTE(review): X_test_s / y_test_s are loaded but not referenced by the code
# visible in this cell.
X_train_s = load(path +'X_train_s_ref.pkl')
X_test_s = load(path +'X_test_s_ref.pkl')
X_dev_s = load(path +'X_dev_s_ref.pkl')
y_train_s = load(path +'y_train_s_ref.pkl')
y_test_s = load(path +'y_test_s_ref.pkl')
y_dev_s = load(path +'y_dev_s_ref.pkl')

# Dev split of the regular (full) data set.
X_dev = load(path +'X_dev_ref.pkl')
y_dev = load(path +'y_dev_ref.pkl')


# Retain the 2 most recent checkpoints.
my_checkpointing_config = tf.estimator.RunConfig(
    keep_checkpoint_max = 2, 
)
# Training input on the small training split, shuffled, with no limit on training epochs.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': X_train_s.values}, y_train_s.values, 
    batch_size=256, num_epochs=None, shuffle=True)
# Prediction/evaluation input on the small training split (unshuffled).
predict_train_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': X_train_s.values}, y_train_s.values, shuffle=False)
# Prediction/evaluation input on the small dev split (unshuffled).
predict_val_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': X_dev_s.values},  y_dev_s.values, shuffle=False)
# Prediction input on the regular dev split (X_dev / y_dev) - despite the
# variable name this is not one of the test splits.
predict_test_input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': X_dev.values}, y_dev.values, shuffle=False)


def get_predictions(estimator, input_fn):
    """Return the first ``class_ids`` entry of every prediction yielded by the estimator."""
    class_ids = []
    for prediction in estimator.predict(input_fn=input_fn):
        class_ids.append(prediction["class_ids"][0])
    return class_ids

def train_and_evaluate_with_sentence_encoder(hub_module, train_module=False, path=''):
    """Train a DNN classifier on top of a TF-Hub sentence-embedding column.

    Uses the module-level input functions (train_input_fn,
    predict_train_input_fn, predict_val_input_fn, predict_test_input_fn) and
    the checkpoint config defined earlier in the cell.

    Args:
        hub_module: TF-Hub module handle used for the 'sentence' feature.
        train_module: whether the hub module weights are fine-tuned.
        path: model directory for checkpoints (this parameter shadows the
            module-level ``path``).

    Returns:
        ``(predictions_train, predictions_dev, dnn)``: class ids predicted for
        predict_train_input_fn and for predict_test_input_fn, and the trained
        estimator.
    """
    embedding_feature = hub.text_embedding_column(
        key='sentence', module_spec=hub_module, trainable=train_module)
  
    print('Training with', hub_module)
    print('Trainable is:', train_module)
  
    dnn = tf.estimator.DNNClassifier(
        hidden_units=[512, 128],
        feature_columns=[embedding_feature],
        n_classes=2,
        activation_fn=tf.nn.relu,
        dropout=0.1,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.005),
        model_dir=path,
        config=my_checkpointing_config)

    # TOTAL_STEPS // STEP_SIZE rounds of STEP_SIZE steps each. The previous
    # range(0, TOTAL_STEPS+1, STEP_SIZE) ran one extra round, i.e.
    # TOTAL_STEPS + STEP_SIZE training steps in total.
    for step in range(0, TOTAL_STEPS, STEP_SIZE):
        print('Training for step =', step)
        dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
        print('Eval Metrics (Train):', dnn.evaluate(input_fn=predict_train_input_fn))
        print('Eval Metrics (Validation):', dnn.evaluate(input_fn=predict_val_input_fn))
        print('\n')
    
    predictions_train = get_predictions(estimator=dnn, input_fn=predict_train_input_fn)
    predictions_dev = get_predictions(estimator=dnn, input_fn=predict_test_input_fn)
    return predictions_train, predictions_dev, dnn
    
  
# Only show TensorFlow errors while training.
tf.logging.set_verbosity(tf.logging.ERROR)

# The first returned element holds the predictions on the training input, so it
# is bound to predictions_train (it was previously mislabelled predictions_test,
# while the commented-out calls below already expect predictions_train).
predictions_train, predictions_dev, dnn = train_and_evaluate_with_sentence_encoder(
    "https://tfhub.dev/google/universal-sentence-encoder/2", train_module=True, path=path+'storage/models/refact/')

#report(y_dev.values, predictions_dev)
#plot_roc(y_dev.values, predictions_dev)
#store_matrix("use-512-with-training (dev)", y_dev.values, predictions_dev)
#store_matrix("use-512-with-training (train)", y_train_s.values, predictions_train)



# Smoke test: predict the class id of two hand-written sentences.
import numpy 
x = ['documents required at the time of interview in sbi']
input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': numpy.array(x)},shuffle=False)
print(get_predictions(estimator=dnn, input_fn=input_fn))
x = ['white felame is the best human']
input_fn = tf.estimator.inputs.numpy_input_fn(
    {'sentence': numpy.array(x)},shuffle=False)
print(get_predictions(estimator=dnn, input_fn=input_fn))


Training with https://tfhub.dev/google/universal-sentence-encoder/2
Trainable is: True
Training for step = 0
Eval Metrics (Train): {'accuracy': 0.954625, 'accuracy_baseline': 0.5294118, 'auc': 0.98519826, 'auc_precision_recall': 0.97951037, 'average_loss': 0.14811938, 'label/mean': 0.47058824, 'loss': 18.950361, 'precision': 0.94910145, 'prediction/mean': 0.4588833, 'recall': 0.95478123, 'global_step': 500}
Eval Metrics (Validation): {'accuracy': 0.90982354, 'accuracy_baseline': 0.5294118, 'auc': 0.9648245, 'auc_precision_recall': 0.9545516, 'average_loss': 0.23610932, 'label/mean': 0.47058824, 'loss': 30.179386, 'precision': 0.903935, 'prediction/mean': 0.45778364, 'recall': 0.9045, 'global_step': 500}


Training for step = 500
Eval Metrics (Train): {'accuracy': 0.9828456, 'accuracy_baseline': 0.5294118, 'auc': 0.99576074, 'auc_precision_recall': 0.99394524, 'average_loss': 0.06350087, 'label/mean': 0.47058824, 'loss': 8.124288, 'precision': 0.97501194, 'prediction/mean': 0.4706808, '

In [0]:
# save model 
feature_spec = {
    'sentence': tf.io.FixedLenFeature([], tf.string)
}
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
export_dir = dnn.export_savedmodel(path+'storage/models/export', serving_input_receiver_fn)
export_dir

b'/content/drive/My Drive/data/storage/models/export/1569699519'

In [5]:
import tensorflow_hub as hub
import tensorflow as tf
from joblib import dump, load
import pandas as pd

# Export directory written by the export cell (see its recorded output).
export_dir = b'/content/drive/My Drive/data/storage/models/export/1569699519'
# Retrieve the exported model as a predictor callable.
predict_fn = tf.contrib.predictor.from_saved_model(export_dir)

inputs = pd.DataFrame({
    'sentence': ['documents required at the time of interview in sbi','white felame is the best human'],
})

# Serialize every row as a tf.train.Example with one bytes feature per column.
examples = []
for index, row in inputs.iterrows():
    feature = {}
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for col, value in row.items():
        value = str.encode(value)
        feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
    example = tf.train.Example(
        features=tf.train.Features(
            feature=feature
        )
    )
    examples.append(example.SerializeToString())

predictions = predict_fn({'inputs': examples})
# scores[0] is compared against scores[1]; the larger one decides the label.
for score in predictions['scores']:
    if score[0] > score[1]:
        print('sincere')
    else :
        print('insincere')


INFO:tensorflow:Restoring parameters from /content/drive/My Drive/data/storage/models/export/1569699519/variables/variables
sincere
insincere


# utils.py

In [0]:
import nltk
import re
import contractions

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize 
from nltk.corpus import  wordnet, stopwords 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load

import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, auc 
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score, zero_one_loss

import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.models import Sequential, model_from_json
from tensorflow.python.keras.layers import Dense, GRU, Embedding, Dropout, Activation
from tensorflow.python.keras import metrics
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


def remove_white_space(text):
    """Return ``text`` without leading or trailing whitespace.

    ``str.strip()`` with no argument already removes tabs and newlines, so a
    second ``.strip('\\t\\n')`` pass adds nothing and has been dropped.
    """
    return text.strip()

def remove_special_character(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    return re.sub(r'[^A-Za-z0-9\s]+', '', text)

def threeway_split(X, y):
    """Split X/y into train (80%), dev (10%) and test (10%) parts.

    Both splits are stratified on the labels and use random_state 42. The
    sizes of the three parts are printed.
    Returns ``X_train, X_dev, X_test, y_train, y_dev, y_test``.
    """
    X_train, X_hold, y_train, y_hold  = train_test_split(X, y, 
                                                     train_size = 0.8, test_size = 0.2, 
                                                     random_state = 42, stratify = y)
    X_dev, X_test, y_dev, y_test  = train_test_split(X_hold, y_hold, 
                                                     train_size = 0.5, test_size = 0.5,  
                                                     random_state = 42, stratify = y_hold)

    print(len(X_train),len(X_dev), len(X_test))
    return X_train, X_dev, X_test, y_train, y_dev, y_test

# The second, identical definitions of remove_white_space and
# remove_special_character that followed here were removed: a later duplicate
# silently shadows the earlier one.


def report(y, predicted):
    """Print a classification report and show the confusion matrix as a heatmap.

    Returns ``(report_text, confusion_matrix)``.
    """
    class_labels = ['Sincere', 'Insincere']

    # Text summary of per-class metrics.
    text_summary = classification_report(y, predicted, target_names = class_labels)
    print(text_summary)

    # Confusion matrix drawn on a fresh 5x5 figure.
    cm = confusion_matrix(y, predicted)
    plt.subplots(figsize = (5,5))
    sns.heatmap(cm, annot = True, fmt = 'd')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    return text_summary, cm
    
def plot_history(history):
    """Plot training/validation accuracy and loss curves side by side.

    Args:
        history: object exposing a ``history`` dict (as returned by Keras
            ``model.fit``) with loss, val_loss and accuracy entries.
    """
    logs = history.history
    # Older Keras versions log accuracy as 'acc'/'val_acc', newer ones as
    # 'accuracy'/'val_accuracy'; accept either instead of raising KeyError.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]
    loss = logs['loss']
    val_loss = logs['val_loss']
    x = range(1, len(acc) + 1)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()
    
    
def plot_roc(y, predicted):
    """Plot the ROC curve for the positive class (label 1) and return its AUC.

    Args:
        y: true labels.
        predicted: scores or predicted labels compared against ``y``.
    """
    #roc curve
    fpr, tpr, thresholds = roc_curve(y, predicted, pos_label = 1)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    # pos_label=1 is the 'Insincere' class (see target_names in report()), so
    # the legend names that class; it previously said "sincere".
    plt.plot(fpr, tpr, color ='blue', lw = 1, label = 'ROC curve for insincere (area = %0.2f)' % roc_auc)
    # Diagonal reference line of a random classifier.
    plt.plot([0, 1], [0, 1], color ='black', lw = 1, linestyle = '--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc = "lower right")
    plt.show()
    return roc_auc 
  
def store_matrix(name, y, pred):
    """Record the evaluation metrics of one model run in the shared metrics file.

    Loads the mapping stored at ``path + 'matrix_project.sav'``, sets (or
    replaces) the entry ``name`` with the metrics computed from ``y`` and
    ``pred`` and writes the mapping back to the same file.
    """
    store_file = path + 'matrix_project.sav'
    matrix_s = load(store_file)

    entry = {}
    entry['Accuracy'] = accuracy_score(y, pred)
    entry['AUC'] = roc_auc_score(y, pred)
    entry['Precision (macro)'] = precision_score(y, pred, average='macro')
    entry['Recall (macro)'] = recall_score(y, pred, average='macro')
    entry['f1 (macro)'] = f1_score(y, pred, average='macro')
    entry['misclassifications'] = zero_one_loss(y, pred)

    matrix_s[name] = entry
    dump(matrix_s, store_file)
    
def word_averaging(wv, words):
    """Return the unit-normalised mean vector of the given words.

    Args:
        wv: word-vector store exposing ``vocab``, ``syn0norm`` and ``vector_size``.
        words: iterable of tokens; entries that are already numpy arrays are
            used as vectors directly, tokens missing from ``wv.vocab`` are skipped.

    Returns:
        A float32 unit vector, or a zero vector of length ``wv.vector_size``
        when no input entry could be mapped to a vector.
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        # Every token was out of vocabulary: nothing to average.
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, docs):
    """Stack the averaged word vector of every document into one 2-D array."""
    doc_vectors = []
    for doc in docs:
        doc_vectors.append(word_averaging(wv, doc))
    return np.vstack(doc_vectors)
  
def word_doc2vec_list(model, docs):
    """Infer a vector for every whitespace-tokenised document and stack them row-wise."""
    inferred = []
    for doc in docs:
        tokens = doc.split()
        inferred.append(model.infer_vector(tokens))
    return np.vstack(inferred)
    
## FUNCTIONS TAKEN FROM https://www.kaggle.com/gmhost/gru-capsule

def load_glove(word_index):
    """Build an embedding matrix for ``word_index`` from the GloVe 840B vectors.

    Args:
        word_index: mapping of word -> integer index (e.g. ``Tokenizer.word_index``).

    Returns:
        Array of shape ``(nb_words, embed_size)`` with
        ``nb_words = min(num_words, len(word_index))``. Rows of words found in
        the embedding file (as-is or capitalized) hold the pretrained vector;
        all other rows keep their random normal initialisation.

    Raises:
        ValueError: if no vector could be read from the embedding file.
    """
    EMBEDDING_FILE = path+"embeddings/glove.840B.300d/glove.840B.300d.txt"
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    # Context manager closes the handle; explicit encoding avoids depending on
    # the platform default.
    with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

    if not embeddings_index:
        raise ValueError('no embedding vectors found in ' + EMBEDDING_FILE)

    emb_mean,emb_std = -0.005838499,0.48782197
    # Only the vector length is needed; stacking every vector just to read
    # shape[1] doubled memory use, and np.stack rejects dict views on current NumPy.
    embed_size = len(next(iter(embeddings_index.values())))

    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard with the actual row count: with a vocabulary smaller than
        # num_words an index equal to nb_words would fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Fall back to the capitalized spelling of the word.
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix 
    
            
def load_fasttext(word_index):    
    """Build an embedding matrix for ``word_index`` from the fastText wiki-news vectors.

    Args:
        word_index: mapping of word -> integer index (e.g. ``Tokenizer.word_index``).

    Returns:
        Array of shape ``(nb_words, embed_size)`` with
        ``nb_words = min(num_words, len(word_index))``. Rows of words found in
        the embedding file hold the pretrained vector; all other rows are drawn
        from a normal distribution with the mean/std of the pretrained vectors.
    """
    EMBEDDING_FILE = path+"embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec"
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Context manager closes the handle; explicit encoding avoids depending on
    # the platform default.
    with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
        # Lines of 100 characters or fewer are skipped (len(o) > 100 filter).
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100)

    # np.stack needs a real sequence; current NumPy rejects a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard with the actual row count: with a vocabulary smaller than
        # num_words an index equal to nb_words would fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_para(word_index):
    """Build an embedding matrix for ``word_index`` from the Paragram vectors.

    Args:
        word_index: mapping of word -> integer index (e.g. ``Tokenizer.word_index``).

    Returns:
        Array of shape ``(nb_words, embed_size)`` with
        ``nb_words = min(num_words, len(word_index))``. Rows of words found in
        the embedding file hold the pretrained vector; all other rows keep
        their random normal initialisation.

    Raises:
        ValueError: if no vector could be read from the embedding file.
    """
    EMBEDDING_FILE = path+'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Context manager so the file handle is closed once the dict is built.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
        # Lines of 100 characters or fewer are skipped (len(o) > 100 filter).
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100)

    if not embeddings_index:
        raise ValueError('no embedding vectors found in ' + EMBEDDING_FILE)

    emb_mean,emb_std = -0.0053247833,0.49346462
    # Only the vector length is needed; stacking every vector just to read
    # shape[1] doubled memory use, and np.stack rejects dict views on current NumPy.
    embed_size = len(next(iter(embeddings_index.values())))

    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard with the actual row count: with a vocabulary smaller than
        # num_words an index equal to nb_words would fall outside the matrix.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix
  
# Vocabulary cap: the embedding loaders above skip word indices at or beyond it.
num_words = 8000 
# Embedding vector length (feature size).
embedding_size = 300
# Token-sequence length; original note: "mean + 2 std"
# (presumably of the question token counts - TODO confirm).
max_tokens = 30
# Padding/truncation side for token sequences.
pad = 'post'
# Directory on the mounted Drive holding data, embeddings and saved artifacts.
path = '/content/drive/My Drive/data/'
