#### Loading input data
Load documents from the files, placed under given root folders (for training and testing separately).        
Files wich belong to some specific category are placed into the same subfolder of the root. 
Name of such subfolder is treated as a name of category.    
File which belongs to few categories should be copied into all correspondig subfolders (but will be loaded only once).

In [1]:
import numpy
import gensim
from gensim.models.word2vec import Word2Vec
from keras.preprocessing import sequence
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from keras.models import load_model
import statistics
import random
import glob
import os
import datetime
from pathlib import Path

homePath = str(Path.home()) + "/MLClassificationData"
LabeledSentence = gensim.models.doc2vec.TaggedDocument
# Dimension of word vectors
n_dim = 100
# Count of categories (labels).
n_cats = 0
# Count of words, which aren't found in w2v vocabulary.
nf_words = 0
# This id is included into the names of models, created by this notebook.
modelId = 1
# Path to tokenized corpus for training
trainRoot = homePath + '/train/rtanews/target'
# Path to tokenized corpus for testing
testRoot = homePath + '/test/rtanews/target'
# Path for saving best results, achieved in the time of training
best_models_path = homePath + '/models/rtanews/tempModels/'
# Part of the training corpus, used for runtime validation
valPart = 0.15
batch_size = 128
random.seed(1)
cnt = 0
nCats = 0
categories = dict()

LabeledDocument = namedtuple('LabeledDocument', 'words labels qLabs')

def getCategories(path):
    cats = dict()
    nCats = 0
    os.chdir(path)
    for f in glob.glob("*"):
        if os.path.isdir(f):
            cats[f] = nCats
            nCats += 1
    return cats

def prepareDocsData(path, cats):
    files = dict()
    fInCats = [0] * len(cats)
    nFiles = 0
    actFiles = 0
    curCategory = 0
    docs = []
    os.chdir(path)
    rootDir = os.getcwd()
    for f in glob.glob("*"):
        curCategory = cats[f]
        catPath = path + "/" + f
        os.chdir(catPath)
        for fc in glob.glob("*"):
            actFiles += 1
            if fc not in files:
                nFiles += 1
                fPath = catPath + "/" + fc
                docCont = ''
                with open(fc, 'r', encoding='UTF-8') as tc:
                    for line in tc:
                        docCont += line.strip() + " "
                tc.close()
                words = docCont.strip().split() 
                labels = [0] * len(cats)
                labels[curCategory] = 1
                files[fc] = LabeledDocument(words, labels, [1])
            else:
                files[fc].labels[curCategory] = 1
                files[fc].qLabs[0] += 1
            fInCats[curCategory] += 1
    for k, val in files.items():
        docs.append(val)
    return docs, fInCats

def getLabelSets(docs):
    labels = [x[1] for x in docs]
    results = [labels[0]]
    qLabs = 0
    for i in range(len(labels)):        
        if i%1000 == 0:
            print (str(i), end='\r')
        qLabs += sum(labels[i])
        count = 0
        for j in range(len(results)):
            for k in range(len(categories)):
                if labels[i][k] != results[j][k]:
                    count += 1
                    break
        if count == len(results):
            results.append(labels[i])
    return len(results), qLabs
    
def showTime(ds,de):
    result = ''
    seconds = (de-ds).total_seconds()
    if seconds < 1:
        return "less than 1 sec"
    hh = int(seconds/(60*60));
    if hh > 0:
        result = "%d h:"%(hh);
    seconds -= hh*60*60
    mm = int(seconds/60);
    if mm > 0:
        result += "%d min:"%(mm)
    ss = seconds - mm*60;
    result += "%d sec"%(ss)
    return result

ds = datetime.datetime.now()
categories = getCategories(trainRoot)
#print (categories)
trainAllDocs, fInCats1 = prepareDocsData(trainRoot, categories)
trainAllDocs = random.sample(trainAllDocs, len(trainAllDocs))
trainDocs = trainAllDocs[:int(len(trainAllDocs) * (1 - valPart))]
valDocs = trainAllDocs[int(len(trainAllDocs) * (1 - valPart)):]
testDocs, fInCats2 = prepareDocsData(testRoot, categories)
testDocs = random.sample(testDocs, len(testDocs))
maxDocLen = max(len(x.words) for x in trainAllDocs)
minDocLen = min(len(x.words) for x in trainAllDocs)
avrgDocLen = round(statistics.mean(len(x.words) for x in trainAllDocs), 2)
de = datetime.datetime.now()

print ("Load input data in %s"%(showTime(ds, de))) 
print ("Dataset properties:")
dls, qLabs = getLabelSets(trainAllDocs)
print ('Loaded %d documents: %d for training, %d for validation, %d for test' % (len(trainAllDocs) + len(testDocs), len(trainDocs), len(valDocs), len(testDocs)))
print ("Tokens in documents: maximum: %d, minimum: %d, average: %d"%(maxDocLen, minDocLen, avrgDocLen))
print ("Categories: %d"%(len(categories)))
print ("Documents for training in category : maximum: %d, minimum: %d, avegare: %d"%(max(fInCats1), min(fInCats1), round(statistics.mean(fInCats1), 2)))
print ("Documents for testing  in category : maximum: %d, minimum: %d, avegare: %d"%(max(fInCats2), min(fInCats2), round(statistics.mean(fInCats2), 2)))

print ("Distinct Label Set: %d"%(dls))
print ("Proportion of Distinct Label Set: %.4f"%(dls/len(trainAllDocs)))
print ("Label Cardinality: %.4f"%(qLabs/len(trainAllDocs)))
print ("Label Density: %.4f"%(qLabs/len(trainAllDocs)/len(categories)))

del trainAllDocs

Using TensorFlow backend.


Load input data in 4 sec
Dataset properties:
Loaded 23837 documents: 12750 for training, 2251 for validation, 8836 for test
Tokens in documents: maximum: 1330, minimum: 46, average: 170
Categories: 40
Documents for training in category : maximum: 2800, minimum: 93, avegare: 415
Documents for testing  in category : maximum: 1501, minimum: 65, avegare: 276
Distinct Label Set: 290
Proportion of Distinct Label Set: 0.0193
Label Cardinality: 1.1073
Label Density: 0.0277


#### Load W2V model
Load Word2Vec model, saved previously in a text format.

In [2]:
# Name of file with word vectors
#modelName = 'model-2018-Nov-05-173949.vec'
modelName = 'wiki_ar.vec'
ds = datetime.datetime.now()
w2v = gensim.models.KeyedVectors.load_word2vec_format(homePath + '/w2v/vectors/' + modelName)
de = datetime.datetime.now()
print ("Load W2V model (%s) in %s"%(modelName, showTime(ds, de))) 
print ("Continue")

Load W2V model (wiki_ar.vec) in 1 min:14 sec
Continue


#### Prepare data for training and testing
Input data is converted into array of sequences so that documents of different length are represented by sequences of numbers of the same size.

In [3]:
nf_words = 0
sdict = dict()
nfw = []
def getDocsArray(tokens, wvModel, dataType):
    global n_dim
    global tmpCount
    global nf_words
    global sdict
    global nfw
    tmpCount = tmpCount + 1
    if tmpCount != 0 and tmpCount%1000 == 0:
        print(dataType + ": prepare ", tmpCount, end="\r")
        
    vec = numpy.zeros(n_dim).reshape((1, n_dim))
    count = 0.
    for word in tokens:
        if word not in sdict:
            sdict[word] = 1
        else:
            sdict[word] = sdict[word] + 1
    for word in tokens:
        try:
            vec += wvModel[word].reshape((1, n_dim)) 
            count += 1.
        except KeyError:
            if sdict[word] == 1:
                nf_words += 1
                if nf_words < 10:
                    nfw.append(word)
            continue
    if count != 0:
        vec /= count
    return vec
    
ds = datetime.datetime.now()
tmpCount = 0
train_arrays = numpy.concatenate([getDocsArray(x.words, w2v, 'Train') for x in trainDocs])
tmpCount = 0
val_arrays = numpy.concatenate([getDocsArray(x.words, w2v, 'Validation') for x in valDocs])
train_labels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(categories)) for x in trainDocs])
val_labels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(categories)) for x in valDocs])
tmpCount = 0
de = datetime.datetime.now()
print ("Prepare train and validation data in %s"%(showTime(ds, de)))

ds = datetime.datetime.now()
test_arrays = numpy.concatenate([getDocsArray(x.words, w2v, "Test") for x in testDocs])
test_labels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(categories)) for x in testDocs])
de = datetime.datetime.now()
print ("Prepare test data in %s"%(showTime(ds, de)))
print ("Unique words in all documents: %d"%(len(sdict)))
print ("Words not found in the w2v vocabulary: %d"%(nf_words))

Prepare train and validation data in 1 min:2 sec
Prepare test data in 38 sec
Unique words in all documents: 155089
Words not found in the w2v vocabulary: 90398


#### Create, train and test the model
Classification model is created here as an linear stack of few regular densely-connected neural network's layers.

In [4]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from keras.metrics import categorical_accuracy
import math

def getModel(n_dim):
    global batch_size
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim=n_dim))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(categories), activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model    

model = getModel(n_dim)
# Quantity of epochs (loops) in which model is trained
eps = 30
verbose = 0
checkpoint = ModelCheckpoint(best_models_path + 'curModel%d.hdf5'%(modelId), monitor='val_acc', verbose=verbose, save_best_only=True, mode='auto')
print("Start training...", end='\r')

ds = datetime.datetime.now()
model.fit(train_arrays, train_labels, epochs=eps, validation_data=(val_arrays, val_labels), batch_size=batch_size, verbose=0, callbacks=[checkpoint], shuffle=False)
de = datetime.datetime.now()
print ("Model trained in %s"%(showTime(ds, de)))

# Evaluation of the final model
scores = model.evaluate(test_arrays, test_labels, verbose=1)
print("Final model accuracy: %.2f%%" % (scores[1]*100))

# Evaluation of the best model, saved in the training time
model1 = load_model(best_models_path + 'curModel%d.hdf5'%(modelId))
scores = model1.evaluate(test_arrays, test_labels, verbose=1)
print("Last saved model accuracy: %.2f%%" % (scores[1]*100))

Model trained in 38 sec
Final model accuracy: 98.48%
Last saved model accuracy: 98.49%


#### Save the model

In [5]:
# Set 'False' if you want to use the best model, saved in the training time
useFinalModel = True

modelPath = homePath + "/models/rtanews/models/"
modelName = "model%d-%s"%(modelId, datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S"))

ds = datetime.datetime.now() 
if not useFinalModel:
    del model
    model = model1    
model.save(modelPath + modelName)
os.remove(best_models_path + 'curModel%d.hdf5'%(modelId))
de = datetime.datetime.now()
print ("Model %s saved in %s"%(modelName, showTime(ds,de)))

Model model1-2018-Nov-11-203644 saved in less than 1 sec


#### Check results visually

In [6]:
cNames = [''] * len(categories)
step = 100

def getPrediction(entry):
    return entry[1]

for k,v in categories.items():
    cNames[v] = k
for id in range(0, len(test_arrays)-1, step):
    if id > 0:
        print()
    taggedFor = "Tagged for: "
    cTags = 0
    for i in range(len(categories)):
        if test_labels[id][i] == 1:
            if cTags != 0:
                taggedFor += " , "
            cTags += 1        
            taggedFor += cNames[i]        
    res = model.predict(test_arrays[id].reshape(1, n_dim))
    list = [(0,0) for i in range(len(categories))]
    for i in range(len(categories)):
        list[i] = (i, res[0][i])
    list.sort(key=getPrediction, reverse=True)
    print ("Document %d %s"%(id+1, taggedFor))
    print ("Predictions (with probability of more than one percent):")
    for i in range(len(categories)):
        if list[i][1] >= 0.01:
            print ("\t%35s  %.2f%%" % (cNames[list[i][0]], list[i][1] * 100))

Document 1 Tagged for: 19.Olympics_Ryu_d_Janero_2016 , 23.Sports_Other
Predictions (with probability of more than one percent):
	      19.Olympics_Ryu_d_Janero_2016  89.37%
	                    23.Sports_Other  65.86%

Document 101 Tagged for: 07.Crisis_Syrian conflict , 03.Weapons_and_military_equipment
Predictions (with probability of more than one percent):
	          07.Crisis_Syrian conflict  57.67%
	                   25.Army_Aircraft  18.54%
	  03.Weapons_and_military_equipment  16.97%
	                         24.Rockets  16.18%
	      15.Opposition_Syrian conflict  5.40%
	                    22.Groups_armed  3.05%
	              37.Military_Maneuvers  2.49%
	          13.Technology_Information  1.87%

Document 201 Tagged for: 17.Diseases
Predictions (with probability of more than one percent):
	             36.General_information  12.34%
	                     05.Discoveries  8.05%
	                11.Research_Medical  7.82%
	                        17.Diseases  5.66%
	        

	                        30.Football  99.77%
	                    23.Sports_Other  1.31%

Document 7601 Tagged for: 34.Famous
Predictions (with probability of more than one percent):
	                          34.Famous  88.53%
	             36.General_information  12.81%
	                           38.Music  6.06%

Document 7701 Tagged for: 13.Technology_Information
Predictions (with probability of more than one percent):
	          13.Technology_Information  98.69%
	                     05.Discoveries  3.32%
	             36.General_information  2.96%

Document 7801 Tagged for: 05.Discoveries , 13.Technology_Information
Predictions (with probability of more than one percent):
	          13.Technology_Information  80.08%
	             36.General_information  33.58%
	                     05.Discoveries  7.61%
	                11.Research_Medical  3.00%
	                          21.Crimes  1.38%
	                    22.Groups_armed  1.20%

Document 7901 Tagged for: 04.Oil_markets
Predi

#### Calculate different metrics
All metrics are calculated using the following indicator function: some label is treated as predicted, if its probability isn't less than some threshold.    
We set 50% as a value of this threshold.

In [8]:
rankThreshold = 0.5

def rankIndicator(labels, predictions, index):
    global rankThreshold
    return (labels[index] == 1), (predictions[index] >= rankThreshold)
    
def getPrediction(entry):
    return entry[1]

ds = datetime.datetime.now()
res = model.predict(test_arrays)
de = datetime.datetime.now()
print ("Dataset containing %d documents predicted in %s\n"%(len(test_arrays), showTime(ds, de)))

#Exact Match Ratio
wrongPreds = 0
for i in range(len(test_labels)):
    for j in range(len(categories)):
        actual, predicted = rankIndicator(test_labels[i], res[i], j)
        if (actual and not predicted) or (predicted and not actual):
            wrongPreds += 1
            break;
print ("Exact Match Ratio:  %.2f%%" % ((len(test_labels) - wrongPreds)/len(test_labels) * 100))

#Accuracy
accuracy = 0.
for i in range(len(test_labels)):
    labels = sum(test_labels[i])
    tp = 0
    tfp = 0
    for j in range(len(categories)):
        actual, predicted = rankIndicator(test_labels[i], res[i], j)
        if actual and predicted:
            tp += 1
        if predicted and not actual:
            tfp += 1
    accuracy += tp / (labels + tfp)
print ("Accuracy:  %.2f%%" % (accuracy / len(test_labels) * 100))  

#Precision
precision = 0.
for i in range(len(test_labels)):
    labels = sum(test_labels[i])
    tp = 0
    tfp = 0
    for j in range(len(categories)):
        actual, predicted = rankIndicator(test_labels[i], res[i], j)
        if actual and predicted:
            tp += 1
    precision += tp / labels
print ("Precision:  %.2f%%" % (precision / len(test_labels) * 100))  

#Recall
recall = 0.
for i in range(len(test_labels)):
    labels = sum(test_labels[i])
    tp = 0
    tfp = 0
    for j in range(len(categories)):
        actual, predicted = rankIndicator(test_labels[i], res[i], j)
        if actual and predicted:
            tp += 1
        if predicted:
            tfp += 1
    if tfp > 0:
        recall += tp / tfp
print ("Recall:  %.2f%%" % (recall / len(test_labels) * 100))  

#F1-Measure
f1 = 0.
for i in range(len(test_labels)):
    labels = sum(test_labels[i])
    tp = 0
    tfp = 0
    for j in range(len(categories)):
        actual, predicted = rankIndicator(test_labels[i], res[i], j)
        if actual and predicted:
            tp += 1
        if predicted:
            tfp += 1
    f1 += 2 * tp / (tfp + labels)
print ("F1-Measure:  %.2f%%" % (f1 / len(test_labels) * 100))

#Hamming Loss
hl = 0.
for i in range(len(test_labels)):
    labels = sum(test_labels[i])
    for j in range(len(categories)):
        actual, predicted = rankIndicator(test_labels[i], res[i], j)
        if (actual and not predicted) or (predicted and not actual):
            hl += 1
print ("Hamming Loss:  %.2f%%" % (hl * 100 / (len(test_labels) * len(categories)))) 

#Macro-Averaged Precision
precision = 0
for i in range(len(categories)):
    tp = 0
    tact = 0
    for j in range(len(test_labels)):
        actual, predicted = rankIndicator(test_labels[j], res[j], i) 
        if not actual:
            continue
        tact += 1
        if predicted:
            tp += 1
    precision += tp / tact
print ("Macro-Averaged Precision:  %.2f%%" % (precision / len(categories) * 100))  

#Macro-Averaged Recall
recall = 0
for i in range(len(categories)):
    tp = 0
    tact = 0
    for j in range(len(test_labels)):
        actual, predicted = rankIndicator(test_labels[j], res[j], i)
        if predicted:
            tact += 1
            if actual:
                tp += 1
    recall += tp / tact
print ("Macro-Averaged Recall:  %.2f%%" % (recall / len(categories) * 100))  

#Macro-Averaged F1-Measure
f1 = 0
for i in range(len(categories)):
    tp = 0
    tact = 0
    labs = 0
    for j in range(len(test_labels)):
        actual, predicted = rankIndicator(test_labels[j], res[j], i)
        if actual:
            labs += 1
        if predicted:
            tact += 1
            if actual:
                tp += 1
    f1 += 2 * tp / (tact + labs)
print ("Macro-Averaged F1-Measure:  %.2f%%" % (f1 / len(categories) * 100))  

#Micro-Averaged Precision
precision = 0
tp = 0
tact = 0
for i in range(len(categories)):
    for j in range(len(test_labels)):
        actual, predicted = rankIndicator(test_labels[j], res[j], i) 
        if not actual:
            continue
        tact += 1
        if predicted:
            tp += 1
precision += tp / tact
print ("Micro-Averaged Precision:  %.2f%%" % (precision * 100))  

#Micro-Averaged Recall
recall = 0
tp = 0
tact = 0
for i in range(len(categories)):
    for j in range(len(test_labels)):
        actual, predicted = rankIndicator(test_labels[j], res[j], i)
        if predicted:
            tact += 1
            if actual:
                tp += 1
recall += tp / tact
print ("Micro-Averaged Recall:  %.2f%%" % (recall * 100))  

#Micro-Averaged F1-Measure
f1 = 0
tp = 0
tact = 0
labs = 0
for i in range(len(categories)):
    for j in range(len(test_labels)):
        actual, predicted = rankIndicator(test_labels[j], res[j], i)
        if actual:
            labs += 1
        if predicted:
            tact += 1
            if actual:
                tp += 1
f1 += 2 * tp / (tact + labs)
print ("Micro-Averaged F1-Measure:  %.2f%%" % (f1 * 100))  

#One error
o_err = 0
for i in range(len(test_labels)):
    list = [(0,0) for i in range(len(categories))]
    for j in range(len(categories)):
        list[j] = (test_labels[i][j], res[i][j])
    list.sort(key=getPrediction, reverse=True)
    if list[0][0] == 0:
        o_err += 1
print ("One Error: %.2f%%" % (o_err / len(test_labels) * 100))

#Coverage
stepsDown = 0
for i in range(len(test_labels)):
    bound = sum(test_labels[i]) - 1
    list = [(0,0,0) for i in range(len(categories))]
    for j in range(len(categories)):
        list[j] = (test_labels[i][j], res[i][j], j)
    list.sort(key=getPrediction, reverse=True)
    eSteps = 0
    for j in range(len(categories)):
        if test_labels[i][j] == 0:
            continue
        for k in range(len(list)):
            if list[k][2] == j:
                eSteps = max(eSteps, k)
    stepsDown += max(0, eSteps - bound)
print ("Coverage: %.2f" % (stepsDown / len(test_labels))) 

#Ranking Loss
rl = 0
for i in range(len(test_labels)):
    mult = sum(test_labels[i])
    list = [(0,0) for i in range(len(categories))]
    wrongOrder = 0
    for j in range(len(categories)):
        list[j] = (test_labels[i][j], res[i][j])
    list.sort(key=getPrediction, reverse=True)
    for j in range(len(list)):
        if list[j][0] == 1:
            mult -= 1
            if mult == 0:
                break
            continue
        wrongOrder += mult
    rl += wrongOrder / sum(test_labels[i])
print ("Ranking Loss: %.2f" % (rl / len(test_labels)))    

Dataset containing 8836 documents predicted in less than 1 sec

Exact Match Ratio:  60.19%
Accuracy:  69.23%
Precision:  71.37%
Recall:  76.64%
F1-Measure:  72.38%
Hamming Loss:  1.52%
Macro-Averaged Precision:  61.20%
Macro-Averaged Recall:  80.27%
Macro-Averaged F1-Measure:  67.52%
Micro-Averaged Precision:  65.52%
Micro-Averaged Recall:  82.28%
Micro-Averaged F1-Measure:  72.95%
One Error: 19.39%
Coverage: 0.76
Ranking Loss: 0.61
