In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [3]:
# spacy for lemmatization
import spacy

In [4]:
 # Plotting tools
import pyLDAvis
import pyLDAvis.gensim # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Optional: configure logging so that only ERROR-level records are shown.
import logging
import warnings

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.ERROR,
)

# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
# Load the abstracts table from the pickle file "clean_train_abstract" in the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle("clean_train_abstract")

In [10]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ISSN,date,topic,title,raw_abstract,clean_train_abstract
0,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Simulating the Large-Scale Erosion of Genomic ...,The dramatically decreasing costs of DNA seque...,dramat decreas cost dna sequenc trigger millio...
1,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Protecting Privacy and Security of Genomic Dat...,Re-use of patients’ health records can provide...,use patient health record provid tremend benef...
2,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Implementation and Evaluation of an Algorithm ...,We improve the quality of cryptographically pr...,improv qualiti cryptograph privaci preserv gen...
3,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Identification and Analysis of Key Residues in...,Protein–RNA complexes play important roles in ...,protein rna complex play import role various b...
4,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Unified Deep Learning Architecture for Modelin...,Prediction of the spatial structure or functio...,predict spatial structur function biolog macro...


In [20]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Every item is passed through ``str()`` first, and one token list is
    yielded per item, in input order.
    """
    for raw_text in sentences:
        # deacc=True asks simple_preprocess to strip accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Materialize one token list per row of the 'clean_train_abstract' column.
data_words = list(sent_to_words(df['clean_train_abstract']))

In [21]:
# Show the token list of the first document as a sanity check.
print(data_words[:1])

[['dramat', 'decreas', 'cost', 'dna', 'sequenc', 'trigger', 'million', 'human', 'genotyp', 'sequenc', 'moreov', 'individu', 'increas', 'make', 'genom', 'data', 'public', 'avail', 'therebi', 'creat', 'privaci', 'threat', 'relat', 'dna', 'similar', 'general', 'entiti', 'gain', 'access', 'signific', 'fraction', 'sequenc', 'genotyp', 'might', 'abl', 'infer', 'even', 'genom', 'unsequenc', 'individu', 'paper', 'propos', 'simul', 'base', 'model', 'quantifi', 'impact', 'continu', 'sequenc', 'public', 'person', 'genom', 'data', 'popul', 'genom', 'privaci', 'simul', 'probabilist', 'model', 'data', 'share', 'take', 'account', 'event', 'migrat', 'interraci', 'mate', 'exemplarili', 'instanti', 'simul', 'sampl', 'popul', 'individu', 'evalu', 'privaci', 'multipl', 'set', 'genom', 'variant', 'subset', 'phenotyp', 'relat', 'variant', 'find', 'demonstr', 'increas', 'share', 'rate', 'futur', 'entail', 'substanti', 'negat', 'effect', 'privaci', 'older', 'generat', 'moreov', 'find', 'mix', 'popul', 'face',

In [22]:
from gensim.models import Word2Vec

In [23]:
# Train a Word2Vec model (sg=1 selects skip-gram) on the tokenized abstracts.
# NOTE(review): `size` and `iter` are the pre-4.0 gensim keyword names
# (renamed `vector_size` / `epochs` in gensim 4) — confirm the installed version.
embedding_model = Word2Vec(
    data_words,
    size=100,
    window=2,
    min_count=50,
    workers=4,
    iter=100,
    sg=1,
)

In [30]:
# check embedding result
# NOTE(review): the query token below is an empty string in this copy of the notebook;
# restore the intended vocabulary word before re-running (the saved output lists
# graphics-related neighbours such as 'gpus', 'render', 'nvidia').
# Query through `.wv` (the KeyedVectors): calling most_similar directly on the
# Word2Vec model is deprecated.
print(embedding_model.wv.most_similar(positive=[""], topn=100))

[('gpus', 0.5867547988891602), ('render', 0.5313080549240112), ('interact', 0.5258808135986328), ('gpu', 0.5159129500389099), ('nvidia', 0.5103408694267273), ('interfac', 0.4891064167022705), ('acceler', 0.47153136134147644), ('anim', 0.4702153503894806), ('purpos', 0.46479976177215576), ('cuda', 0.4600568413734436), ('visual', 0.4578077793121338), ('languag', 0.4531848728656769), ('intel', 0.45139074325561523), ('haptic', 0.4445781111717224), ('syntax', 0.4445060193538666), ('sketch', 0.4443061351776123), ('microcontrol', 0.4439314007759094), ('geometri', 0.44213244318962097), ('vision', 0.43814918398857117), ('edit', 0.4366540312767029), ('coprocessor', 0.42888712882995605), ('descript', 0.4280271828174591), ('gpgpu', 0.4226806163787842), ('isosurfac', 0.422457754611969), ('tool', 0.42234423756599426), ('hull', 0.4191679060459137), ('display', 0.41795480251312256), ('interpret', 0.41711103916168213), ('cpus', 0.4127808213233948), ('mathemat', 0.4113714098930359), ('polyhedr', 0.40986

In [37]:
# Build the phrase models. The trigram model is trained on the
# bigram-transformed corpus. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# Phraser wrappers: a faster way to get a sentence clubbed as a bigram/trigram.
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: the first document after applying the bigram and then the trigram phraser.
print(trigram_mod[bigram_mod[data_words[0]]])



['dramat', 'decreas', 'cost', 'dna', 'sequenc', 'trigger', 'million', 'human', 'genotyp', 'sequenc', 'moreov', 'individu', 'increas', 'make', 'genom', 'data', 'public', 'avail', 'therebi', 'creat', 'privaci', 'threat', 'relat', 'dna', 'similar', 'general', 'entiti', 'gain', 'access', 'signific', 'fraction', 'sequenc', 'genotyp', 'might', 'abl', 'infer', 'even', 'genom', 'unsequenc', 'individu', 'paper', 'propos', 'simul', 'base', 'model', 'quantifi', 'impact', 'continu', 'sequenc', 'public', 'person', 'genom', 'data', 'popul', 'genom', 'privaci', 'simul', 'probabilist', 'model', 'data', 'share', 'take_account', 'event', 'migrat', 'interraci', 'mate', 'exemplarili', 'instanti', 'simul', 'sampl', 'popul', 'individu', 'evalu', 'privaci', 'multipl', 'set', 'genom', 'variant', 'subset', 'phenotyp', 'relat', 'variant', 'find', 'demonstr', 'increas', 'share', 'rate', 'futur', 'entail', 'substanti', 'negat', 'effect', 'privaci', 'older', 'generat', 'moreov', 'find', 'mix', 'popul', 'face', 'le

In [38]:
def make_bigrams(texts, phraser=None):
    """Apply a phrase (bigram) model to every tokenized document in ``texts``.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.
    phraser : optional
        Any object that can be indexed with a token list (``phraser[doc]``).
        When omitted, the notebook-level ``bigram_mod`` is used.

    Returns
    -------
    list
        One transformed document per input document, in input order.
    """
    if phraser is None:
        # Default to the notebook-level phraser, as the original code did.
        phraser = bigram_mod
    return [phraser[doc] for doc in texts]

# Run make_bigrams over every tokenized abstract.
data_words_bigrams = make_bigrams(data_words)

In [43]:
# Example of the result of applying the bigram model
k = 100  # index of the document to inspect
print(data_words[k][:10])
print(data_words_bigrams[k][:10])

['genom', 'rearrang', 'larg', 'scale', 'mutat', 'event', 'affect', 'genom', 'evolutionari', 'process']
['genom_rearrang', 'larg', 'scale', 'mutat', 'event', 'affect', 'genom', 'evolutionari', 'process', 'therefor']


In [46]:
# Convert the documents to numeric form so topic modeling can be applied.
# Build a dictionary that maps each word to an integer id.
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus: one doc2bow result per document
texts = data_words_bigrams
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in texts]

# Readable view of the first five (word, frequency) pairs of document 0
[(id2word[token_id], count) for token_id, count in corpus[0][:5]]

[('abl', 1), ('access', 1), ('avail', 1), ('base', 1), ('continu', 1)]

In [47]:
# Build lda_model over the entire set of abstracts
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [48]:
# One row per topic, one column per top term (get_topic_terms default count).
# The topic count is read from the model instead of repeating the literal 20,
# so these tables stay in sync if num_topics is changed where the model is built.
topicTable = pd.DataFrame(
    [
        [id2word[term_id] for term_id, _ in lda_model.get_topic_terms(topic_id)]
        for topic_id in range(lda_model.num_topics)
    ]
)
# Same layout, but each cell is a (term, probability) tuple.
topicTable_prob = pd.DataFrame(
    [
        [(id2word[term_id], prob) for term_id, prob in lda_model.get_topic_terms(topic_id)]
        for topic_id in range(lda_model.num_topics)
    ]
)

In [50]:
# Display the (term, probability) table, one row per topic.
topicTable_prob

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"(data, 0.24693504)","(render, 0.07657246)","(structur, 0.0713877)","(domain, 0.03217863)","(cluster, 0.02537629)","(dataset, 0.022839922)","(reconstruct, 0.019156948)","(hierarch, 0.015548254)","(similar, 0.014709566)","(attribut, 0.014619321)"
1,"(display, 0.1616357)","(human, 0.07602927)","(synthesi, 0.047878165)","(logic, 0.046052214)","(static, 0.039378047)","(bodi, 0.030692423)","(medic, 0.029714752)","(shade, 0.028750034)","(collabor, 0.026531968)","(articl, 0.014919284)"
2,"(composit, 0.04300392)","(updat, 0.037168194)","(immers, 0.034504827)","(databas, 0.034269433)","(document, 0.031147612)","(glyph, 0.025361251)","(instruct, 0.02417663)","(see, 0.023994956)","(mobil, 0.023375288)","(semi, 0.020600192)"
3,"(visual, 0.28565252)","(featur, 0.06029193)","(test, 0.036932517)","(represent, 0.031579453)","(vector, 0.030290218)","(pattern, 0.028113522)","(generat, 0.02637206)","(extract, 0.025023451)","(filter, 0.02151091)","(volumetr, 0.01757125)"
4,"(isosurfac, 0.08331481)","(tree, 0.08054478)","(vertic, 0.045243684)","(list, 0.03908318)","(issu, 0.030292017)","(author, 0.028158303)","(ieee, 0.022650924)","(period, 0.02215208)","(cloud, 0.021650473)","(contain, 0.020474004)"
5,"(materi, 0.04897992)","(haptic, 0.030406306)","(trajectori, 0.028084265)","(multiresolut, 0.024394928)","(virtual_realiti, 0.022742517)","(tabl, 0.022321975)","(person, 0.022226788)","(tissu, 0.01818986)","(veloc, 0.0169906)","(vortic, 0.016764289)"
6,"(volum_render, 0.07063474)","(ray, 0.05700023)","(voxel, 0.04161205)","(weight, 0.032684617)","(label, 0.032381658)","(align, 0.027080134)","(peopl, 0.022615597)","(kernel, 0.020734439)","(vortex, 0.020473158)","(littl, 0.019838577)"
7,"(imag, 0.15067175)","(textur, 0.06519218)","(cell, 0.04413175)","(scene, 0.037653793)","(video, 0.03751254)","(depth, 0.026061635)","(camera, 0.025400614)","(hierarchi, 0.024610557)","(visibl, 0.023791594)","(gate, 0.02209899)"
8,"(method, 0.030898249)","(use, 0.02686953)","(algorithm, 0.023847872)","(present, 0.018573806)","(surfac, 0.018274857)","(comput, 0.014075893)","(set, 0.013294286)","(new, 0.0130455)","(result, 0.013034935)","(two, 0.012863209)"
9,"(use, 0.02757752)","(interact, 0.026523916)","(system, 0.026462475)","(approach, 0.017219173)","(design, 0.017160028)","(analysi, 0.016478334)","(object, 0.015410887)","(provid, 0.013936304)","(inform, 0.013679155)","(differ, 0.013434689)"


In [56]:
# Compute Perplexity
# NOTE(review): per the gensim docs, log_perplexity returns the per-word likelihood
# bound rather than the perplexity itself (perplexity = 2 ** (-bound)), so for the
# value printed here a higher (closer to zero) number is better — confirm before
# comparing models.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score ('c_v' measure) on the bigram-transformed documents
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_words_bigrams,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.92932842960386

Coherence Score:  0.31965734368948673


In [64]:
# Five nearest neighbours of the token "imag" in the embedding space.
# Query through `.wv` (the KeyedVectors): calling most_similar directly on the
# Word2Vec model is deprecated.
print(embedding_model.wv.most_similar(positive=["imag"], topn=5))

[('scene', 0.6992356777191162), ('textur', 0.6459394693374634), ('pictur', 0.6353278160095215), ('shape', 0.6335362195968628), ('video', 0.5849931240081787)]
