In [None]:
# Sentence vectors: average the word2vec vectors of a document's words, each weighted by its tf-idf score

# Simple data pre-processing
from gensim import utils
def read_corpus(fname):
    """Lazily yield one token list per line of a UTF-8 text file.

    Args:
        fname: Path of the text file to read; each line is treated as one document.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` for each line (per
        gensim's documentation, a list of lowercase tokens). Blank lines still
        yield a value, so the i-th result always corresponds to the i-th line.
    """
    # The context manager closes the file once the generator is exhausted
    # (or closed / garbage-collected) instead of leaking the handle.
    with open(fname, encoding = 'utf-8') as corpus_file:
        for line in corpus_file:
            yield utils.simple_preprocess(line)

# Corpus file names noted for this notebook (presumably both live under ../datasets/ — confirm):
#   50000_WoS.txt
#   50000_MedLine.txt
# The generator is materialised into a list because later cells iterate over
# and index into the corpus several times.
train_corpus = list(read_corpus('../datasets/50000_MedLine.txt'))
len(train_corpus)   # cell output: number of lines (documents) read

In [None]:
from gensim.models import word2vec
import numpy as np 

# Train 300-dimensional word2vec embeddings on the tokenised corpus; min_count = 5
# drops words that occur fewer than 5 times from the vocabulary.
# NOTE(review): no `workers` or `seed` argument is passed. gensim's documentation
# says a fully reproducible run needs workers=1 and a fixed PYTHONHASHSEED —
# confirm whether run-to-run variation of the embeddings matters here.
model = word2vec.Word2Vec(sentences = train_corpus, vector_size=300, min_count = 5, epochs = 50)   # presumably the epochs used per dataset (WoS: 100, MedLine: 50) — confirm
word_list = model.wv.index_to_key            # vocabulary words; a recorded run noted len(word_list) = 30064
vectors = np.asarray(model.wv.vectors)      # embedding matrix (30064 rows in the same recorded run); not referenced by the other cells shown here

In [None]:
# Sanity check: show the first ten vocabulary entries and the vocabulary size.
print(word_list[:10])
print(len(word_list))
# Optional spot check of a single embedding (left disabled):
#print(model.wv['the'])

In [None]:
# Rejoin the split words into sentences as input for tf-idf
from tqdm import tqdm
# Join each document's tokens back into a single space-separated string so the
# documents can be fed to the tf-idf vectoriser as text.
# Built in one pass: the previous `[[]*n for ...]` preallocation only created
# empty lists (`[] * n` is `[]`) that were overwritten immediately afterwards.
sentences = [' '.join(tokens) for tokens in tqdm(train_corpus)]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit tf-idf on the re-joined documents using scikit-learn's default settings.
tfidf_model = TfidfVectorizer() 
# NOTE(review): .toarray() turns the fit_transform result into a dense
# (documents x vocabulary) array. The file name suggests ~50000 documents, so
# this can need a lot of memory — consider keeping the matrix sparse, in which
# case every lookup must use `tfidf_matrix[row, col]` indexing.
tfidf_matrix = tfidf_model.fit_transform(sentences).toarray()
# Mapping from term to its column index in tfidf_matrix.
tfidf_vocabulary = tfidf_model.vocabulary_

In [None]:
import numpy as np
# Build one vector per document: every token that has a word2vec embedding
# contributes that embedding scaled by the token's tf-idf weight in the
# document, and the scaled embeddings are averaged (a token contributes once
# per occurrence, exactly as before).
known_words = set(word_list)            # O(1) membership test instead of scanning the vocabulary list per token
embedding_dim = model.wv.vector_size    # length of the fallback vector for documents without known words
word_vec_per_doc = [[] for _ in range(len(train_corpus))]
doc_vector = [None] * len(train_corpus)
for docid in tqdm(range(len(train_corpus))):
    for token in train_corpus[docid]:
        if token not in known_words:
            continue
        # A token that the tf-idf vectoriser tokenised differently has no
        # column; treat it like an out-of-vocabulary word instead of raising
        # KeyError part-way through a long run.
        column = tfidf_vocabulary.get(token)
        if column is None:
            continue
        # [row, col] indexing works for a dense ndarray and for a scipy sparse matrix.
        word_tfidf = tfidf_matrix[docid, column]   # Acquire the tf-idf value of the word
        word_vec_per_doc[docid].append(model.wv[token] * word_tfidf)
    if word_vec_per_doc[docid]:
        doc_vector[docid] = np.array(word_vec_per_doc[docid]).mean(axis = 0)
    else:
        # No usable token: the mean of an empty array is a scalar NaN, which
        # would make doc_vector ragged and break the clustering step. Use a
        # zero vector of the embedding size instead.
        doc_vector[docid] = np.zeros(embedding_dim)

In [None]:
# Cell output: number of document vectors (doc_vector holds one entry per document in train_corpus).
len(doc_vector)

In [None]:
from sklearn.cluster import KMeans
# Cluster the document vectors into classNumber groups with k-means.
classNumber = 10
# random_state fixes the centroid initialisation so the labels, and every score
# computed from them, are repeatable across runs. n_init is pinned explicitly
# because the library default differs between scikit-learn releases.
kmean_model = KMeans(n_clusters = classNumber, n_init = 10, random_state = 0).fit(doc_vector)
labels = kmean_model.labels_   # cluster index assigned to each document, in input order

from collections import Counter
# Count how many documents were assigned to each cluster label.
# (Despite its name, center_dict holds cluster sizes, not cluster centres.)
center_dict = Counter(labels)
center_dict

In [None]:
# Peek at the cluster labels of the first ten documents.
labels[:10]

In [None]:
def get_ground_truth_label(path = '../datasets/50000_MedLine_Label.txt'):
    """Read the reference class labels, one integer per line.

    Args:
        path: UTF-8 text file holding one integer label per line. Defaults to
            the MedLine label file; the notes kept in the body list the label
            file names recorded for this notebook.

    Returns:
        A list of ints in file order.

    Raises:
        ValueError: If a line (other than trailing blank lines) is not an
            integer. Interior blank lines are rejected rather than skipped,
            because skipping one would silently shift every following label
            relative to its document.
    """
    # 50000_WoS_WC.txt
    # 50000_MedLine_Label.txt
    with open(path, 'r', encoding = 'utf-8') as f:
        raw_lines = [line.rstrip('\n') for line in f]

    # A file that ends with one or more empty lines is still valid.
    while raw_lines and not raw_lines[-1].strip():
        raw_lines.pop()

    ground_truth_label = []
    for lineno, text in enumerate(raw_lines, start = 1):
        try:
            ground_truth_label.append(int(text))
        except ValueError:
            raise ValueError(
                f"{path}:{lineno}: expected an integer label, got {text!r}"
            ) from None
    return ground_truth_label
# Load the reference labels once; the evaluation cell compares them with the k-means labels.
ground_truth_label = get_ground_truth_label()

In [None]:
from sklearn import metrics
# Print, in this order, the adjusted Rand index, the Fowlkes-Mallows score and
# the adjusted mutual information between the k-means assignment and the
# reference labels.
for score_fn in (
    metrics.adjusted_rand_score,
    metrics.fowlkes_mallows_score,
    metrics.adjusted_mutual_info_score,
):
    print(score_fn(labels, ground_truth_label))