#### Loading input data

In [1]:
import numpy
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec
#from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from gensim.models.fasttext import FastText
from collections import OrderedDict
import pickle as pkl
from keras.preprocessing import sequence
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from random import shuffle
from keras.models import load_model
import random
import gzip
import glob
import os
import datetime
from pathlib import Path

# Root folder for every file this notebook reads or writes.  The trailing
# slash matters: later paths are built from it by plain string concatenation.
homePath = "%s/MLClassification/example/data/" % (Path.home(),)
# Module-level alias of gensim's TaggedDocument class.
LabeledSentence = gensim.models.doc2vec.TaggedDocument
# ---- Experiment configuration ------------------------------------------
n_dim = 100                  # width of one word vector / of the pooled document vector
n_cats = 4                   # number of document categories (size of the softmax layer)
balance = 0                  # when > 0, cap on the documents read per category; 0 = no cap
modelType = 2                # network built by getModel: 0 = dense, 1 = RNN, 2 = CNN
wvModelType = 'c-fasttext'   # label printed when the word vectors are loaded
batch_size = 128             # mini-batch size passed to model.fit

# Seed the random generators.  `numpy.random.seed` must be *called*: assigning
# to it replaces the function with an int and seeds nothing.  The stdlib
# generator is seeded too because the train/validation/test shuffles are done
# with random.sample.
numpy.random.seed(7)
random.seed(7)
# Locations derived from the data root.
dataset_path = homePath + 'docs'                # one sub-folder per category
best_models_path = homePath + 'bestModels/'     # checkpoints written during training

# Pooling used by getWordVector: 2 = element-wise min and max concatenated,
# lower values = mean of the word vectors.
wordVecType = 0

# Per-category shares of documents labeled 'train' and 'tests', and the share
# of the training documents that is held out for validation.
trainPart, testPart = 0.8, 0.2
valPart = 0.15

# Bookkeeping filled in by prepareDocsData.
cnt = 0                                 # number of categories processed so far
fCats = [0 for _ in range(n_cats)]      # documents counted per category index
nCats = list()                          # category names, in processing order

# One labeled document: its tokens, its tag list, the category name and the
# numeric category index.
LabeledDocument = namedtuple(
    'LabeledDocument',
    ['words', 'tags', 'docClass', 'docCategory'],
)

def prepareDocsData(path):
    """Read every category folder under ``path`` and split its documents.

    Each entry of ``path`` whose name does not end in ``txt`` is treated as one
    category and must contain a ``tocs.txt`` file.  In that file a line that
    starts with ``#`` opens a new document; every other line is taken as the
    whitespace-separated tokens of the current document.

    Inside a category documents are assigned by position: the first
    ``trainPart`` share is tagged ``train``, the following ``testPart`` share
    is tagged ``tests`` and whatever is left is not returned but written,
    prefixed with the category index and a tab, to ``homePath + 'wolab.txt'``
    (that file is rewritten on every call).  When the global ``balance`` is
    positive, at most ``balance`` documents are used per category.

    Side effects: increments the global ``cnt`` once per category, stores the
    document count in ``fCats[cnt - 1]``, appends the category name to
    ``nCats`` and prints one summary line per category.  The current working
    directory is left untouched.

    Args:
        path: directory that holds one sub-folder per category.

    Returns:
        A tuple ``(docs, wordInDocs)``.  ``docs`` is a list of
        ``LabeledDocument(words, [tag], categoryName, categoryIndex)`` for the
        train and test documents.  ``wordInDocs`` maps a word to the number of
        categories whose training documents contain it.
    """
    global cnt
    wolabPath = homePath + 'wolab.txt'
    docs = []
    trainCount = 0
    testCount = 0
    wCount = 0
    wordInDocs = dict()
    # List the folder through an explicit pattern instead of os.chdir(), so the
    # process-wide working directory is never changed.  glob.escape keeps
    # special characters in `path` from being interpreted as wildcards.
    pattern = os.path.join(glob.escape(path), "*")
    with open(wolabPath, 'w', encoding='UTF-8') as tmpf:
        for entry in glob.glob(pattern):
            ff = os.path.basename(entry)
            if ff[-3:] == 'txt':
                continue
            cnt = cnt + 1
            tf = os.path.join(entry, "tocs.txt")

            # First pass: count the documents (header lines) of the category,
            # stopping at `balance` when a cap is configured.
            docsCount = 0
            with open(tf, 'r', encoding='UTF-8') as f:
                for line in f:
                    if line[:1] == '#':
                        if balance > 0 and docsCount >= balance:
                            break
                        docsCount = docsCount + 1
            fCats[cnt - 1] = docsCount
            nCats.append(ff)
            print (cnt, ff, docsCount)

            # Second pass: read the counted documents and assign each one to a
            # group according to its 1-based position `dCount`.
            trainLimit = int(docsCount * trainPart)
            testLimit = int(docsCount * (trainPart + testPart))
            dCount = 0
            # NOTE(review): this set is reset once per category, not once per
            # document, so wordInDocs counts categories rather than documents
            # - confirm that this is the intended statistic.
            wordInDoc = set()
            with open(tf, 'r', encoding='UTF-8') as f:
                for line in f:
                    if line[:1] == '#':
                        # Stop after exactly `docsCount` documents; the former
                        # `<=` comparison let one extra document through
                        # whenever `balance` truncated the category.
                        if dCount >= docsCount:
                            break
                        dCount = dCount + 1
                        continue
                    if dCount <= trainLimit:
                        group = 'train'
                        trainCount = trainCount + 1
                        fCount = trainCount
                    elif dCount <= testLimit:
                        group = 'tests'
                        testCount = testCount + 1
                        fCount = testCount
                    else:
                        # Left-over documents only go to the wolab file.
                        wCount = wCount + 1
                        tmpf.write(str(cnt-1) + "\t" + line.strip() + "\n")
                        continue
                    tag = '%s_%d'%(group, fCount)
                    words = line.strip().split()
                    if len(words) < 5:
                        print("Attention! Doc %d from category %s has only %d tokens."%(dCount, ff, len(words)))
                    docs.append(LabeledDocument(words, [tag], ff, cnt-1))
                    if group == 'train':
                        for w in words:
                            if w not in wordInDoc:
                                wordInDoc.add(w)
                                wordInDocs[w] = wordInDocs.get(w, 0) + 1
    return docs, wordInDocs

# Read all labeled documents and time the operation.
ds = datetime.datetime.now()
allDocs, wordInDocs = prepareDocsData(dataset_path)

# Token counts of the longest and of the shortest labeled document.
maxDocLen = max(map(lambda d: len(d.words), allDocs))
minDocLen = min(map(lambda d: len(d.words), allDocs))

# Training pool: every document tagged 'train_*', put in random order
# (sampling the full length of a list yields a shuffled copy), then cut into
# the training part and the trailing `valPart` share used for validation.
trainAllDocs = [d for d in allDocs if d.tags[0].startswith('train')]
trainAllDocs = random.sample(trainAllDocs, len(trainAllDocs))
trainDocs, valDocs = (
    trainAllDocs[:int(len(trainAllDocs) * (1 - valPart))],
    trainAllDocs[int(len(trainAllDocs) * (1 - valPart)):],
)

# Test pool: every document tagged 'tests_*', shuffled the same way.
testDocs = [d for d in allDocs if d.tags[0].startswith('tests')]
testDocs = random.sample(testDocs, len(testDocs))
de = datetime.datetime.now()

print ("Load input data in %d sec"%((de-ds).total_seconds()))
print('%d docs: %d train, %d validation, %d test' % (len(allDocs), len(trainDocs), len(valDocs), len(testDocs)))
print("Documents length: maximum: %d, minimum: %d"%(maxDocLen, minDocLen))

# The combined lists are not needed once the three splits exist.
del allDocs, trainAllDocs
print ("Continue")

Using TensorFlow backend.


1 Politics 9703
2 Sport 9866
3 Culture 8076
4 Economy 10090
Load input data in 12 sec
37735 docs: 25658 train, 4528 validation, 7549 test
Documents length: maximum: 6170, minimum: 5
Continue


#### Load W2V model

In [2]:
# Load the pretrained word vectors (word2vec text format) from the data
# folder and report how long the load took.
ds = datetime.datetime.now()
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    homePath + 'wiki_ar.vec'
)
de = datetime.datetime.now()
print("Load W2V model (%s) in %s sec" % (wvModelType, (de - ds).seconds))
print("Continue")

Load W2V model (c-fasttext) in 55 sec
Continue


#### Create classification models

In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers import Conv1D
from keras.layers import Conv2D
from keras.layers import MaxPooling1D
from keras.layers import MaxPooling2D
from keras.layers import GlobalMaxPooling1D
from keras.layers import Activation
from keras.layers import Reshape
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Bidirectional
from keras.layers import Concatenate
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras import optimizers
import math

tmpCount = 0  # documents vectorised so far; drives the progress message

def getWordVector(tokens, modelType, wordInDocs, wvModel):
    """Pool the word vectors of one tokenised document into a single row.

    The pooling is selected by the global ``wordVecType``:

    * ``2`` - element-wise minimum and element-wise maximum over the word
      vectors, concatenated into one ``(1, 2 * dim)`` array;
    * any other value - sum of the word vectors as a ``(1, n_dim)`` array,
      divided by the number of known words when ``wordVecType < 2``.

    Tokens for which ``wvModel[token]`` raises ``KeyError`` are skipped; when
    no token is known the result is all zeros.  Every call increments the
    global ``tmpCount`` and prints a progress message on each 1000th call.

    Args:
        tokens: iterable of words.
        modelType: not used; kept so existing calls keep working.
        wordInDocs: not used; kept so existing calls keep working.
        wvModel: object indexable by word that returns a 1-D numpy vector and
            raises ``KeyError`` for unknown words.

    Returns:
        numpy array of shape ``(1, n_dim)``, or ``(1, 2 * dim)`` for
        ``wordVecType == 2``.
    """
    global tmpCount
    tmpCount = tmpCount + 1
    if tmpCount % 1000 == 0:
        print("Load ", tmpCount, end="\r")

    if wordVecType == 2:
        vMin = numpy.zeros(n_dim)
        vMax = numpy.zeros(n_dim)
        firstStep = True
        for word in tokens:
            try:
                wv = wvModel[word]      # single lookup per token
            except KeyError:
                continue
            if firstStep:
                # The first known word initialises both running extremes.
                vMin = wv
                vMax = wv
                firstStep = False
            else:
                vMin = numpy.minimum(vMin, wv)
                vMax = numpy.maximum(vMax, wv)
        return numpy.concatenate((vMin, vMax)).reshape(1, len(vMin) * 2)

    vec = numpy.zeros(n_dim).reshape((1, n_dim))
    count = 0.
    for word in tokens:
        try:
            wv = wvModel[word]
        except KeyError:
            continue
        vec += wv.reshape((1, n_dim))
        count += 1.
    # Average only for the mean-pooling types and only if something was added.
    if wordVecType < 2 and count != 0:
        vec /= count
    return vec

# create the model
def getModel(modelType, n_dim):
    """Build and compile the Keras classifier selected by ``modelType``.

    Args:
        modelType: 0 - three 256-unit dense layers, each followed by dropout;
            1 - a 200-unit LSTM over a single time step, then a dense layer;
            2 - a 1-D convolution (100 filters, kernel size 1), then a dense
            layer.
        n_dim: length of one input feature vector (this parameter shadows the
            global of the same name).

    Returns:
        A ``Sequential`` model that takes ``(batch, n_dim)`` input, ends in a
        softmax over the global ``n_cats`` classes and is compiled with sparse
        categorical cross-entropy, the Adam optimizer and the accuracy metric.

    Raises:
        ValueError: if ``modelType`` is not 0, 1 or 2.
    """
    print("Model type: %d"%(modelType))
    if modelType not in (0, 1, 2):
        # Fail here instead of compiling and returning an empty network.
        raise ValueError("Unsupported modelType: %r (expected 0, 1 or 2)" % (modelType,))
    model = Sequential()
    if modelType == 0:
        print ("Dence model")
        model.add(Dense(256, activation='relu', input_dim=n_dim))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(n_cats, activation='softmax'))
    elif modelType == 1:
        print("RNN model")
        # The document vector is presented as a sequence of one time step.
        model.add(Reshape((1, n_dim), input_shape=(n_dim,)))
        # return_sequences=False: emit only the final state so the network
        # output is (batch, n_cats) like the other two model types.  With a
        # single time step this is the same state that return_sequences=True
        # produced, minus the extra length-1 axis that does not match the 1-D
        # integer labels used for training.
        model.add(LSTM(200,  input_shape=(1, n_dim),  return_sequences=False))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(n_cats, activation='softmax'))
    elif modelType == 2:
        print("CNN model")
        # The document vector is presented as a sequence of one time step.
        model.add(Reshape((1, n_dim), input_shape=(n_dim,)))
        model.add(Conv1D(input_shape=(1, n_dim),
                        filters=100,
                        kernel_size=1,
                        padding="valid",
                        activation="relu"))
        model.add(MaxPooling1D(1))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(n_cats, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    #model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

print ("Continue")

Continue


#### Prepare data for ML

In [None]:
# ---- Training and validation features ----------------------------------
# Each document becomes one pooled row; the rows are stacked along axis 0.
ds = datetime.datetime.now()
train_arrays = numpy.concatenate(
    [getWordVector(d.words, modelType, wordInDocs, w2v) for d in trainDocs],
    axis=0)
val_arrays = numpy.concatenate(
    [getWordVector(d.words, modelType, wordInDocs, w2v) for d in valDocs],
    axis=0)
# Labels are the numeric category indices, in the same order as the rows.
train_labels = numpy.array([d.docCategory for d in trainDocs])
val_labels = numpy.array([d.docCategory for d in valDocs])

#print (len(train_arrays), train_arrays.shape)
# Restart the progress counter used by getWordVector before the test pass.
tmpCount = 0

de = datetime.datetime.now()
print ("Prepare train and validation data in %d sec"%((de-ds).total_seconds()))

# ---- Test features -----------------------------------------------------
ds = datetime.datetime.now()
test_arrays = numpy.concatenate(
    [getWordVector(d.words, modelType, wordInDocs, w2v) for d in testDocs],
    axis=0)
test_labels = numpy.array([d.docCategory for d in testDocs])
de = datetime.datetime.now()
print ("Prepare test data in %d sec"%((de-ds).total_seconds()))

print ("train data size: ", len(train_arrays))
print ("test data size: ", len(test_arrays))
print ("validation data size: ", len(val_arrays))

print ("Continue")

Load  2000

#### Train and test

In [None]:
# Width of the network input: the min/max pooling (wordVecType == 2)
# concatenates two vectors and therefore doubles it.
dms = n_dim
if wordVecType == 2:
    dms *= 2
model = getModel(modelType, dms)
eps = [20, 20, 20]  # number of epochs, indexed by modelType

# Make sure the checkpoint folder exists before training tries to write the
# best model into it, and build the checkpoint path only once.
os.makedirs(best_models_path, exist_ok=True)
bestModelFile = best_models_path + 'curModel%d.hdf5'%(modelType)
# NOTE(review): 'val_acc' must match the metric name reported by the installed
# Keras version - confirm, otherwise no checkpoint is written.
checkpoint = ModelCheckpoint(bestModelFile, monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
#print (model.summary())
#print (model.metrics_names)

# shuffle=False: the rows are fed in the order produced by the earlier
# random.sample shuffle of the training documents.
model.fit(train_arrays, train_labels, epochs=eps[modelType], validation_data=(val_arrays, val_labels), batch_size=batch_size, verbose=1, callbacks=[checkpoint], shuffle=False)

# Final evaluation of the model
scores = model.evaluate(test_arrays, test_labels, verbose=1)
print("Final model accuracy: %.2f%%" % (scores[1]*100))

# Evaluate again with the best checkpoint; `model` now refers to that
# reloaded network.
model = load_model(bestModelFile)
scores = model.evaluate(test_arrays, test_labels, verbose=1)
print("Best model accuracy: %.2f%%" % (scores[1]*100))

#### Analysis

In [None]:
def _ratioPercent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0

count = len(test_arrays)      # number of evaluated test documents
correct = 0                   # documents whose predicted category is right
arrCats = [0] * n_cats        # per true category: misclassified documents (FN)
chCats = [0] * n_cats         # per true category: documents in the test set
# zCats[true][predicted]: misclassification counts only; the diagonal stays 0.
zCats = [[0 for x in range(n_cats)] for y in range(n_cats)]

# Classify the whole test set in one batched call.  The rows keep whatever
# width they were built with, so nothing is tied to a fixed vector size.
predictedCats = model.predict_classes(test_arrays) if count > 0 else []

for i in range(count):
    category = test_labels[i]
    predicted = int(predictedCats[i])
    chCats[category] = chCats[category] + 1
    if predicted == category:
        correct = correct + 1
    else:
        arrCats[category] = arrCats[category] + 1
        zCats[category][predicted] = zCats[category][predicted] + 1

print("DL Accuracy: %.2f%%" % (_ratioPercent(correct, count)))
print("Results of prediction docs by category:")
print("\n%2s %15s %9s %8s %6s  %7s  %8s    %5s    %5s    %3s"%("N","Category","At all","In test","TP","FN","FP","Recall","Precis.","F1"))
print("=============================================================================================")
for i in range(n_cats):
    # False positives: documents of any other category predicted as category i.
    fp = 0
    for j in range(n_cats):
        if j == i:
            continue
        fp = fp + zCats[j][i]
    tp = chCats[i] - arrCats[i]
    # Ratios fall back to 0 when a category has no test documents or was
    # never predicted, instead of dividing by zero.
    recall = _ratioPercent(tp, chCats[i])
    precision = _ratioPercent(tp, tp + fp)
    if recall + precision:
        f1 = 2*(recall * precision) / (recall + precision)
    else:
        f1 = 0.0
    print("%2d %15s %8d %8d %8d %8d %8d    %.2f%%    %.2f%%    %.2f%%"%(i+1, nCats[i], fCats[i], chCats[i],
          tp, arrCats[i], fp, recall, precision, f1))

print ("\nEnd")