## CNN training

This script for CNN experiments on rescue detection

## Imports

In [80]:
### imports (1) ##
import pandas as pd
import numpy as pn
from numpy import mean

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

import string

import collections
from collections import Counter

### imports (2) ##
from string import punctuation
from os import listdir
from numpy import array

from pickle import load
from numpy import array

### imports (3) ##
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

### imports (4) ##
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

import pickle

%matplotlib inline

## Utils

In [2]:
##################################################
#### functions to convert labels to numerical ####
##################################################
def class2Index(classList,class2index):
    return [class2index[c] for c in classList]


def train_classes(classes):
    class2index = {}
    index2class = {}
    classCount = 0
    for cl in np.unique(classes):
        if cl not in class2index:
            class2index[cl] = classCount
            index2class[classCount] = cl
            classCount += 1
            
    return class2index,index2class

In [3]:
##################################################
#### functions to clean the corpus ###############
##################################################
def remove_punc(text): 
    text = "".join([char for char in text if char not in string.punctuation ])
    #text = re.sub('[0-9]+', '', text)
    return text

def tokenization(text):
    text = re.split('\W+',text)
    return text

def remove_url(text):
    text = re.sub(r'http\S+', '', text)
    return(text)

def lower_case(text):
    text = text.lower()
    return(text)

def remove_stopwords(text):
    text = [word for word in text if word not in STOPWORDS]
    return text


def clean_text(text): 
    # lower case
    text_lower = lower_case(text)
    
    # remove puntuation
    text_punc = remove_punc(text_lower) 
    
    #remove URLS
    text_url = remove_url(text_punc)
    
    # tokenization
    text_tokens = tokenization(text_url)
    
    # remove stop words
    no_stop_tokens = remove_stopwords(text_tokens)
    
    return no_stop_tokens

##### test the cleaning function on a test sentence ###############################################
sentence = 'THE weather 99 Is good https://www.google.com/ I make a sandwitch :D'
print('Original sentence -',sentence) 
sent = clean_text(sentence)
print('Cleaned...')
print('New sentence - ',' '.join(sent))
###################################################################################################

Original sentence - THE weather 99 Is good https://www.google.com/ I make a sandwitch :D
Cleaned...
New sentence -  weather 99 good make sandwitch


In [4]:
########################################################################################################
#########  function to clean and tokenize a corpus (to create a vocab given a corpus)  #################
###        input: corpus         output: clean tokens ##################################################
########################################################################################################
def clean_corpus(corpus):
    # convert all to lower case 
    corpus_lower = lower_case(corpus)
    
    # remove punctuation
    corpus_punc = remove_punc(corpus_lower) 
    # remove punctuation from each token
    #table = str.maketrans('', '', string.punctuation)
    #tokens = [w.translate(table) for w in tokens]
    
    #remove URLS
    corpus_url = remove_url(corpus_punc)
    
    # tokenization
    corpus_tokens = tokenization(corpus_url)
    # split into tokens by white space
    #tokens = corpus.split()
    
    # remove stop words
    no_stop_tokens = remove_stopwords(corpus_tokens)
    # filter out stop words
    # stop_words = set(stopwords.words('english'))
    # tokens = [w for w in tokens if not w in stop_words]
    
    # remove remaining tokens that are not alphabetic
    #tokens = [word for word in tokens if word.isalpha()]
    
    # filter out short tokens
    final_tokens = [word for word in no_stop_tokens if len(word) > 1]

    return final_tokens

########################################################################################################
#########  function to clean and tokenize a document based on a given vocabulary   #####################
###        input: document         output: clean document's tokens #####################################
########################################################################################################
def clean_document_vocab(doc, vocab):
    # convert to lower case 
    doc_lower = lower_case(doc)
    
    # remove punctuation
    doc_punc = remove_punc(doc_lower) 

    #remove URLS
    doc_url = remove_url(doc_punc)
    
    # tokenization
    doc_tokens = tokenization(doc_url)
    
    # remove stop words
    doc_no_stop_tokens = remove_stopwords(doc_tokens)
    
    # filter out tokens not in vocab
    tokens = [w for w in doc_no_stop_tokens if w in vocab]
    
    #tokens = ' '.join(tokens)
    return tokens

##### test on a corpus #####
ls = ['I love going to the beach at 999','Tunisia is the most beautiful country!!','here we are..']
corpus = " ".join(ls)
print('Corpus:',corpus)
print('Clean corpus ..')
c_corpus = clean_corpus(corpus)
print('Cleaned corpus --',c_corpus)

Corpus: I love going to the beach at 999 Tunisia is the most beautiful country!! here we are..
Clean corpus ..
Cleaned corpus -- ['love', 'going', 'beach', '999', 'tunisia', 'beautiful', 'country']


## Load Harvey Data set

In [5]:
labeledDF=pd.read_csv("/home/wkhal001/Desktop/data_rescue_mining/labeled_ds_Corrected_csv_updated_22_09_13.csv") 
del labeledDF['Unnamed: 0']

In [6]:
labeledDF

Unnamed: 0,id,status_id,created_at,text,address,loc,situ,save,sos,sos.pred,sos.correct
0,1,9.020000e+17,8/28/2017 11:19,#Harvey floods TV station #KHOU in #Houston. h...,0,,0,0,0,0,0
1,2,9.020000e+17,8/28/2017 16:58,"@RandiRhodes RR call him out for visiting SA, ...",0,,0,0,0,0,0
2,3,9.020000e+17,8/27/2017 15:35,Wow a tv station is flooding in Houston! So sc...,0,,1,0,0,0,0
3,4,9.020000e+17,8/29/2017 3:36,"My son, dil &amp; 2 grandkids in grand lakes, ...",0,,0,1,1,1,0
4,5,9.020000e+17,8/28/2017 12:09,is the beltway still flooded? ya boy need to g...,0,,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5787,5788,9.020000e+17,8/29/2017 4:04,"Around 10,000,000,000,000 gallons of water fro...",0,,0,0,0,0,0
5788,5789,9.020000e+17,8/29/2017 4:09,The road to my residence is flooded. Thank God...,0,,0,0,0,0,0
5789,5790,9.020000e+17,8/29/2017 4:09,Texas road closures and flooding kept up to da...,0,,0,0,0,0,0
5790,5791,9.020000e+17,8/29/2017 4:22,"@HellerWeather Tim, any maps to show where flo...",1,,0,0,0,0,0


In [7]:
### clean our data set
labeledDF['cleaned_tweet'] = labeledDF['text'].apply(lambda x: " ".join(clean_text(x)))

In [8]:
## Extract useful columns 
df_training = labeledDF[['id','text','cleaned_tweet','sos.correct']]
df_training.columns= ['id','non_cleaned_text','text','label']

In [9]:
df_training

Unnamed: 0,id,non_cleaned_text,text,label
0,1,#Harvey floods TV station #KHOU in #Houston. h...,harvey floods tv station khou houston,0
1,2,"@RandiRhodes RR call him out for visiting SA, ...",randirhodes rr call visiting sa flooding mayor...,0
2,3,Wow a tv station is flooding in Houston! So sc...,wow tv station flooding houston scary sad rain...,0
3,4,"My son, dil &amp; 2 grandkids in grand lakes, ...",son dil amp 2 grandkids grand lakes katy tx wo...,0
4,5,is the beltway still flooded? ya boy need to g...,beltway still flooded ya boy need go pay bills,0
...,...,...,...,...
5787,5788,"Around 10,000,000,000,000 gallons of water fro...",around 10000000000000 gallons water harvey ins...,0
5788,5789,The road to my residence is flooded. Thank God...,road residence flooded thank god left safe sta...,0
5789,5790,Texas road closures and flooding kept up to da...,texas road closures flooding kept date,0
5790,5791,"@HellerWeather Tim, any maps to show where flo...",hellerweather tim maps show flooding would mas...,0


In [10]:
df_training['label'].value_counts()

0    5520
1     272
Name: label, dtype: int64

## Load Ian/Ida data set

In [5]:
####################################################
#### read mixed data - verified ####################
####################################################
path_to_mixed_verif = '/home/wkhal001/Desktop/Mixed_data_verified/Mixed_data__ian_ida_verified.csv'

mixed_ida_ian_verified=pd.read_csv(path_to_mixed_verif) 
del mixed_ida_ian_verified['Unnamed: 0']

In [6]:
mixed_ida_ian_verified['label'].value_counts()

0    4935
1     225
Name: label, dtype: int64

In [7]:
Data_mixed = mixed_ida_ian_verified[['text','label']]

In [9]:
Data_mixed['cleaned_tweet'] = Data_mixed['text'].apply(lambda x: " ".join(clean_text(x)))

In [12]:
## Extract useful columns 
df_training = Data_mixed[['text','cleaned_tweet','label']]

In [13]:
df_training.columns= ['non_cleaned_text','text','label']

## Vocabulary and tokenization

In [15]:
##################################################################################
# Create a vocabulary   (specify the corpus data frame (e.g., labeledDF) #########
##################################################################################
corpus = ''
for i in range(len(df_training)) :
    st = df_training.iloc[i]["non_cleaned_text"]
    corpus = corpus + " " + st
    

#corpus
T = clean_corpus(corpus)

### count vocabulary with collection counter ####
vocab = Counter()
vocab.update(T)

print('Vocabulary created...',len(vocab),' tokens')

# with open('vocab_crisis_bench.pickle', 'wb') as outputfile:
#     pickle.dump(vocab, outputfile)

Vocabulary created... 11722  tokens


## Upload Pretrained word embedding models

In [16]:
##################################################################################
########      Load pretrained embedding    #######################################
##################################################################################
path_to_glove = '/home/wkhal001/Desktop/gensim-data/glove-twitter-200/glove-twitter-200.txt' 
    
# Load the file content in a dictionary
embeddings_index = {}
with open(path_to_glove) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))
#embeddings_index['hurricane']

Found 1193515 word vectors.


In [17]:
#######################################################
###### create Keras tokenizer and fit text ############
#######################################################
tokenizer = Tokenizer(num_words=None,oov_token='OOV')
tokenizer.fit_on_texts(df_training['text'])
vocab_size = len(tokenizer.word_index) + 1
print('Tokenizer vocabulary size ...',vocab_size)

Tokenizer vocabulary size ... 11749


In [18]:
##################################################################################
########      Initialize the embedding layer    ##################################
##################################################################################
EMBEDDING_DIM=200
vocabulary_size=len(tokenizer.word_index)+1
print('Vocaublary size:',vocabulary_size)
print('Embedding dimension:',EMBEDDING_DIM)
print('Create embedding matrix in progress....')
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Embedding matrix created')
print('Embedding matrix size',len(embedding_matrix))
print('\n')

Vocaublary size: 11749
Embedding dimension: 200
Create embedding matrix in progress....
Embedding matrix created
Embedding matrix size 11749




## Define CNN architecture

In [19]:
##########################################################################
# Define encoder architecture Kim model  #################################
##########################################################################
def create_kim_encoder(length, vocab_size=100, embedding=False, embed_params={}):
    ## inputs #####
    inputs1 = Input(shape=(length,))
    
    ## Create the embedding layer  ######################################
    if embedding == True:   
        embedding_layer = Embedding(embed_params['Tokenizer_size'] + 1,
                            embed_params['EMBEDDING_DIM'],
                            weights=[embed_params['weights']],
                            input_length=embed_params['input_length'],
                            trainable=False)
        embedding1 = embedding_layer(inputs1)
    else: 
        embedding1 = Embedding(vocab_size, 100)(inputs1)
    

    # channel 1
    #embedding1 = Embedding(vocab_size, 100)(inputs1)
    embedding1 = embedding_layer(inputs1)
    conv1 = Conv1D(filters=1024, kernel_size=2, activation='relu')(embedding1)
    #drop1 = Dropout(0.3)(conv1)
    pool1 = MaxPooling1D(pool_size=8)(conv1)
    flat1 = Flatten()(pool1)

    # channel 2
    #inputs2 = Input(shape=(length,))
    #embedding2 = Embedding(vocab_size, 100)(inputs1)
    embedding2 = embedding_layer(inputs1)
    conv2 = Conv1D(filters=128, kernel_size=16, activation='relu')(embedding2)
    #drop2 = Dropout(0.3)(conv2)
    pool2 = MaxPooling1D(pool_size=6)(conv2)
    flat2 = Flatten()(pool2)

    # channel 3
    #inputs3 = Input(shape=(length,))
    #embedding3 = Embedding(vocab_size, 100)(inputs1)
    embedding3 = embedding_layer(inputs1)
    conv3 = Conv1D(filters=16, kernel_size=14, activation='relu')(embedding3)
    #drop3 = Dropout(0.3)(conv3)
    pool3 = MaxPooling1D(pool_size=6)(conv3)
    flat3 = Flatten()(pool3)
    
    # channel 4
    #inputs3 = Input(shape=(length,))
    #embedding4 = Embedding(vocab_size, 100)(inputs1)
    # embedding4 = embedding_layer(inputs1)
    # conv4 = Conv1D(filters=700, kernel_size=2, activation='relu')(embedding3)
    # #drop4 = Dropout(0.5)(conv4)
    # pool4 = MaxPooling1D(pool_size=2)(conv4)
    # flat4 = Flatten()(pool4)

    # merge
    union = concatenate([flat1, flat2, flat3])
    #union = union.reshape(union.size(0), -1)
    
    # interpretation
    #dense1 = Dense(300, activation='relu')(merged)
    #outputs = Dense(1, activation='sigmoid')(dense1)
    model = keras.Model(inputs=inputs1, outputs=union) #[inputs1, inputs2, inputs3]

    # summarize
    #print(model.summary())
    #plot_model(model, show_shapes=True, to_file='MMD_kim_encoder_kernel_3_4_8.png')
  
    return model

##########################################################################
# Define encoder architecture single channel  ############################
##########################################################################
def create_single_channel_cnn_encoder(length, vocab_size=100):
    # channel 1
    inputs1 = Input(shape=(length,))

    # channel 1
    #embedding1 = embedding_layer(vocab_size, 100)(inputs1)
    embedding1 = embedding_layer(inputs1)
    conv1 = Conv1D(filters=128, kernel_size=8, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    
    flat1 = Flatten()(pool1)
    model = keras.Model(inputs=inputs1, outputs=flat1) #[inputs1, inputs2, inputs3] outputs=union

    # summarize
    print(model.summary())
    plot_model(model, show_shapes=True, to_file='MMD_single_channel_encoder_kernel_8.png')
  
    return model

In [20]:
##########################################################################
###  Create a classifier on top of the CNN architectures #################
##########################################################################
def create_classifier(encoder, trainable=True):
    num_classes=2
    input_shape = (128,)
    hidden_units = 64
    
    for layer in encoder.layers:
        layer.trainable = trainable
        
    
    inputs = keras.Input(shape=input_shape)
    features = encoder(inputs)
    features = layers.Dropout(0.3)(features)
    features = layers.Dense(64, activation="relu")(features)
    #features = layers.Dropout(dropout_rate)(features)
    #features = layers.Dense(50, activation="relu")(features)
    #features = layers.Dropout(dropout_rate)(features)
    outputs = layers.Dense(num_classes, activation="softmax")(features)
    
    # inputs = keras.Input(shape=input_shape)
    # features = encoder(inputs)
    # features = layers.Dropout(dropout_rate)(features)
    # features = layers.Dense(hidden_units, activation="relu")(features)
    # features = layers.Dropout(dropout_rate)(features)
    # outputs = layers.Dense(num_classes, activation="softmax")(features)

    model = keras.Model(inputs=inputs, outputs=outputs, name="cnn-glove-classifier-harvey")
    return model


## 10-fold cross validation

In [21]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from transformers import AdamW
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from Baseline_Models import Display_metrics,Display_classification_report,Confusion_matrix

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score

# specify GPU
device = torch.device("cuda")
device

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
%matplotlib inline

import numpy as np
from sklearn.metrics import average_precision_score

device(type='cuda')

In [22]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
%matplotlib inline

import numpy as np
from sklearn.metrics import average_precision_score


In [23]:
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=2018)

X = df_training['text']
TX = np.array(X.tolist())

Y = df_training['label']
TY= np.array(Y.tolist())

print(skf)

for train_index, test_index in skf.split(TX,TY):
    print('--------- Fold ',str(rep_fold),'-------------------------')
    print('Length train index....',len(train_index))
    print('Length test index....',len(test_index))
    
    #### training/testing data ########################################
    X_train, X_test = TX[train_index], TX[test_index]
    y_train, y_test = TY[train_index], TY[test_index]

StratifiedKFold(n_splits=10, random_state=2018, shuffle=True)


In [26]:
rep_fold = 1
padd_len = 128
ls_save_results = []

pos_proba_ls = []
y_tests_ls = []
preds_ls = []

################################################
######### Classifier parameters ################
################################################
learning_rate = 0.0005
batch_size = 128
#hidden_units = 256
num_epochs = 50
dropout_rate = 0.3
num_classes = 2
################################################
################################################
################################################

for train_index, test_index in skf.split(TX,TY):
    print('--------- Fold ',str(rep_fold),'-------------------------')
    print('Length train index....',len(train_index))
    print('Length test index....',len(test_index))
    
    #### training/testing data ########################################
    X_train, X_test = TX[train_index], TX[test_index]
    y_train, y_test = TY[train_index], TY[test_index]
    
    Train_X_cv, val_X, Train_Y_cv, val_Y = train_test_split(X_train,y_train, 
                                                            random_state=2018, 
                                                            test_size=0.1,
                                                            stratify=y_train)
        
    ####### check distribution of positive samples on each fold ######
    count_test = (y_test == 1).sum()
    count_train = (Train_Y_cv == 1).sum()
    count_val = (val_Y == 1).sum()
    print('postive samples in train......',count_train)
    print('postive samples in validation......',count_val)
    print('postive samples in test......',count_test)
    print('------------------------------------------')
    ###################################################################
    
    train_documents = list()
    for row in Train_X_cv: 
        tweet =  row   #row["text"]
        tokens = clean_document_vocab(tweet, vocab)
        train_documents.append(tokens)
    print('cleaned train docs (in tokens) loaded....')
    max_length1 = max([len(s) for s in train_documents])
    print('train max....',max_length1)
    
    validation_documents = list()
    for row in val_X: 
        tweet =  row   #row["text"]
        tokens = clean_document_vocab(tweet, vocab)
        validation_documents.append(tokens)
    print('cleaned validation docs (in tokens) loaded....')
    max_length1 = max([len(s) for s in validation_documents])
    print('train max....',max_length1)
    
    test_documents = list()
    for row in X_test: 
        tweet =  row   #row["text"]
        tokens = clean_document_vocab(tweet, vocab)
        test_documents.append(tokens)
    print('cleaned test docs (in tokens) loaded....')
    max_length1 = max([len(s) for s in test_documents])
    print('test max....',max_length1)
    
    # covert input text to sequence of indices
    train_encoded_docs = tokenizer.texts_to_sequences(train_documents)
    val_encoded_docs = tokenizer.texts_to_sequences(validation_documents)
    test_encoded_docs = tokenizer.texts_to_sequences(test_documents)

    Xtrain = pad_sequences(train_encoded_docs, maxlen=padd_len, padding='post')
    Xval = pad_sequences(val_encoded_docs, maxlen=padd_len, padding='post')
    Xtest = pad_sequences(test_encoded_docs, maxlen=padd_len, padding='post')

    train_labels = Train_Y_cv.tolist()
    dev_labels = val_Y.tolist()
    test_labels = y_test.tolist()

    ###################################################################
    ############## Create CNN model in Keras ##########################
    ###################################################################
    embed_params={'EMBEDDING_DIM':EMBEDDING_DIM,'Tokenizer_size':len(tokenizer.word_index),'weights':embedding_matrix,'input_length':128}
    encoder = create_kim_encoder(128, vocab_size,True,embed_params)
    classifier = create_classifier(encoder)
    classifier.summary()
    
    ###################################################################
    ############## Train CNN model  ###################################
    ###################################################################
    training_padded = np.array(Xtrain)
    training_labels = np.array(train_labels)

    validation_padded = np.array(Xval)
    validation_labels = np.array(dev_labels)
    
    testing_padded = np.array(Xtest)
    testing_labels = np.array(test_labels)

    # compile classifier
    classifier.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    # train classifier
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=15,restore_best_weights=True)
    history = classifier.fit(x=training_padded,validation_data= (validation_padded,validation_labels), y=training_labels, batch_size=batch_size, epochs=num_epochs,callbacks=[callback])
    
    ###### Save model ####################################
    model_name = 'cnn_ian_ida_fold'+ str(rep_fold) + '.h5'
    classifier.save(model_name)
    ######################################################
    
    ###################################################################
    ############## Predict using CNN model  ###########################
    ###################################################################
    print('calculate preds....')
    predictions = classifier.predict(testing_padded)
    preds = np.argmax(predictions, axis=1)
    
    dict_r = classification_report(y_test.tolist(), preds, output_dict = True)

    ### calculate probs for precision-recall curve calculation ######
    pos_probs = predictions[:, 1]
    pos_proba_ls.append(pos_probs)
    y_tests_ls.append(y_test.tolist())
    preds_ls.append(preds)
    #################################################################
        
    
    ###### calculate precision-recall curve and metric #########
    _fold_AP = average_precision_score(y_test.tolist(),pos_probs)
    print("AP score for class 1 --->",_fold_AP)
    ############################################################
    
    
    ### calculate results ####################################
    _f1 = dict_r['1']['f1-score']
    _recall = dict_r['1']['recall']
    _precision = dict_r['1']['precision']
    ##########################################################

    print('f1-score......',_f1)
    print('recall......',_recall)
    print('recall......',_precision)
    
    fold_results = {'report':dict_r,'f1':_f1,'recall':_recall,'precision':_precision,'AP':_fold_AP,'AP_list':pos_probs,'ytest':y_test.tolist()}
    ls_save_results.append(fold_results)


    rep_fold = rep_fold + 1
    print('------------------------------------------------------')
    

--------- Fold  1 -------------------------
Length train index.... 4644
Length test index.... 516
postive samples in train...... 182
postive samples in validation...... 20
postive samples in test...... 23
------------------------------------------
cleaned train docs (in tokens) loaded....
train max.... 77
cleaned validation docs (in tokens) loaded....
train max.... 44
cleaned test docs (in tokens) loaded....
test max.... 33
Model: "cnn-glove-classifier-harvey"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 128)]             0         
                                                                 
 model_4 (Functional)        (None, 17968)             3214968   
                                                                 
 dropout_4 (Dropout)         (None, 17968)             0         
                                                                 
 dense_

In [27]:
f1_scores =[]
recall_scores = []
precision_scores = []
AP_scores =[]

for i in ls_save_results:
    AP_scores.append(i['AP'])
    f1_scores.append(i['report']['1']['f1-score'])
    recall_scores.append(i['report']['1']['recall'])
    precision_scores.append(i['report']['1']['precision'])
