In [1]:
import smart_open
import gensim
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics

In [2]:
def read_corpus(fname, tokens_only=False):
    """Lazily read a text corpus, treating every line as one document.

    Each line of ``fname`` (decoded as ISO-8859-1) is tokenized with
    ``gensim.utils.simple_preprocess``.

    Args:
        fname: Location of the corpus, passed straight to ``smart_open.open``.
        tokens_only: When true, yield the bare token list for each line;
            otherwise wrap the tokens in a ``TaggedDocument`` whose single
            tag is the zero-based line index.

    Yields:
        One token list or one ``TaggedDocument`` per input line.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        for line_no, raw_line in enumerate(handle):
            words = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # Training mode: tag the document with its line index.
                yield gensim.models.doc2vec.TaggedDocument(words, [line_no])
            else:
                yield words

In [3]:
# Corpus files noted by the author (presumably swapped in by editing the path
# below -- only the MedLine file is actually loaded here):
#   50000_WoS.txt
#   50000_MedLine.txt
#   56529_CV.txt
# read_corpus is a generator, so list() materializes every TaggedDocument in memory.
train_corpus = list(read_corpus('../datasets/50000_MedLine.txt'))

In [4]:
# Sanity check: number of documents loaded (one per line of the corpus file).
len(train_corpus)

50000

In [5]:
def get_ground_truth_label(path='../datasets/50000_MedLine_Label.txt'):
    """Load ground-truth class labels, one integer per line.

    Args:
        path: UTF-8 text file holding one integer label per line. Defaults to
            the MedLine label file. Other label files noted by the author:
            ``50000_WoS_WC.txt`` and ``56529_CV_Label.txt``.

    Returns:
        list[int]: The labels in file order.

    Raises:
        OSError: If ``path`` cannot be opened.
        ValueError: If a line (including a blank one) is not an integer. The
            message names the offending line so a malformed file is reported
            instead of silently shifting the labels.
    """
    ground_truth_label = []
    with open(path, 'r', encoding='utf-8') as f:
        # Stream the file instead of loading every line up front.
        for line_no, line in enumerate(f, start=1):
            try:
                # int() ignores surrounding whitespace, including the newline.
                ground_truth_label.append(int(line))
            except ValueError as exc:
                raise ValueError(
                    'line {} of {!r} is not an integer label: {!r}'.format(line_no, path, line)
                ) from exc
    return ground_truth_label
# Load the labels once; they are later passed to evaluate() for scoring the clusters.
ground_truth_label = get_ground_truth_label()

In [7]:
# dm=0 selects distributed bag of words (PV-DBOW); dm=1 would select
# distributed memory (PV-DM).
# The author's note "default: 100, 5, 5" presumably lists the library defaults
# for vector_size, window and min_count -- TODO confirm.
model = gensim.models.doc2vec.Doc2Vec(
    dm=0,
    vector_size=300,
    min_count=5,
)
# Build the vocabulary from the corpus before any call to model.train().
model.build_vocab(train_corpus)

In [6]:
def evaluate(n_epoch, ground_truth_label, n_clusters=10, random_state=None):
    """Cluster the current document vectors with KMeans and score the result.

    Reads the document vectors from the notebook-level ``model`` (``model.dv``),
    so the model cell must have been run before this function is called.

    Args:
        n_epoch: Epoch count to show in the printed report (display only).
        ground_truth_label: Reference class label for each document, in the
            same order as the document vectors.
        n_clusters: Number of KMeans clusters. Defaults to 10, the value that
            was previously hard-coded.
        random_state: Optional seed forwarded to ``KMeans``; pass an int to get
            repeatable cluster assignments. ``None`` keeps the unseeded
            behaviour.

    Returns:
        dict: The scores keyed by ``'ARI'``, ``'FM'`` and ``'AMI'``. The same
        values are printed with four decimals each.
    """
    docvecs = model.dv
    docvectors = [docvecs[i] for i in range(len(docvecs))]
    kmean_model = KMeans(n_clusters=n_clusters, random_state=random_state).fit(docvectors)
    labels = kmean_model.labels_
    # scikit-learn documents the argument order as (labels_true, labels_pred);
    # all three metrics are symmetric, so the scores are unchanged.
    scores = {
        'ARI': metrics.adjusted_rand_score(ground_truth_label, labels),
        'FM': metrics.fowlkes_mallows_score(ground_truth_label, labels),
        'AMI': metrics.adjusted_mutual_info_score(ground_truth_label, labels),
    }
    # '{:.4f}' (precision 4) replaces the former '{:04f}' (zero-padded width 4),
    # which printed FM and AMI with six decimals.
    print('n_epoch:{}，ARI:{:.4f}, FM:{:.4f}, AMI:{:.4f}'.format(
        n_epoch, scores['ARI'], scores['FM'], scores['AMI']))
    return scores

In [8]:
# Train in fixed-size rounds and evaluate after each one, so a fresh
# "Restart & Run All" produces one report line per round (5 epochs, 10 epochs).
# The previous range(1, 2) ran only the second round, and epochs=model.epochs
# used the constructor default (10 in gensim 4.x) rather than the 5-epoch step
# that the printed n_epoch assumes.
# NOTE: every train() call runs its own learning-rate schedule, so this is
# not identical to one uninterrupted run of the same total length.
EPOCHS_PER_ROUND = 5
N_ROUNDS = 2
for i in range(N_ROUNDS):
    model.train(train_corpus, total_examples=model.corpus_count, epochs=EPOCHS_PER_ROUND)
    n_epoch = EPOCHS_PER_ROUND * (i + 1)  # cumulative epochs trained so far
    evaluate(n_epoch, ground_truth_label)

n_epoch:5，ARI:0.3679, FM:0.432605, AMI:0.482681
n_epoch:10，ARI:0.4158, FM:0.474915, AMI:0.510372
