In [145]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
 
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [146]:
df = pd.read_pickle("clean_train_abstract")

In [147]:
df = df.drop(columns='raw_abstract')

In [148]:
df.head()

Unnamed: 0,ISSN,date,topic,title,clean_train_abstract
0,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Simulating the Large-Scale Erosion of Genomic ...,dramat decreas cost dna sequenc trigger millio...
1,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Protecting Privacy and Security of Genomic Dat...,use patient health record provid tremend benef...
2,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Implementation and Evaluation of an Algorithm ...,improv qualiti cryptograph privaci preserv gen...
3,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Identification and Analysis of Key Residues in...,protein rna complex play import role various b...
4,1545-5963,2018/05,IEEE/ACM Transactions on Computational Biology...,Unified Deep Learning Architecture for Modelin...,predict spatial structur function biolog macro...


In [149]:
sample = df.iloc[random.sample(range(0, len(df)), 10000)]

In [150]:
sample.index = range(len(sample.index))
sample.head()

Unnamed: 0,ISSN,date,topic,title,clean_train_abstract
0,1063-8210,2018/08,IEEE Transactions on Very Large Scale Integrat...,Reducing Rollback Cost in VLSI Circuits to Imp...,nanomet technolog circuit sensit various kind ...
1,0018-9340,1970/03,IEEE Transactions on Computers,IEEE Computer Group,
2,2377-3782,2017/02,IEEE Transactions on Sustainable Computing,ESCAPE: Effective Scalable Clustering Approach...,massiv data set continu posit base queri cpqs ...
3,1063-8210,2016/04,IEEE Transactions on Very Large Scale Integrat...,Runtime tunable transmitting power technique i...,emerg chip communic technolog like wireless ne...
4,1536-1233,2005/06,IEEE Transactions on Mobile Computing,Dynamical Mobile Terminal Location Registratio...,paper propos mobil termin mt locat registr upd...


In [151]:
indexes = []
for x in range(0,len(sample)):
    if(len(sample['clean_train_abstract'][x])<100):
        indexes.append(x)

In [152]:
sample = sample.drop(indexes)
sample.index = range(len(sample.index))

In [157]:
sample.head()
sample.shape

(9238, 5)

In [156]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations
 
data_words = list(sent_to_words(sample['clean_train_abstract']))
print(data_words[:1])

[['nanomet', 'technolog', 'circuit', 'sensit', 'various', 'kind', 'perturb', 'alpha', 'particl', 'atmospher', 'neutron', 'induc', 'singl', 'event', 'upset', 'affect', 'memori', 'cell', 'latch', 'flip', 'flop', 'also', 'induc', 'singl', 'event', 'transient', 'initi', 'combin', 'logic', 'captur', 'latch', 'flip', 'flop', 'associ', 'output', 'logic', 'past', 'major', 'effort', 'relat', 'memori', 'howev', 'whole', 'situat', 'get', 'wors', 'solut', 'protect', 'entir', 'design', 'mandatori', 'solut', 'detect', 'error', 'logic', 'function', 'alreadi', 'exist', 'solut', 'allow', 'correct', 'lead', 'lot', 'hardwar', 'overhead', 'nonprocessor', 'design', 'paper', 'present', 'novel', 'techniqu', 'includ', 'sever', 'hardwar', 'architectur', 'algorithm', 'implement', 'reduc', 'cost', 'rollback', 'kind', 'circuit']]


In [158]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
 
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
 
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

data_words_bigrams = make_bigrams(data_words)

#바이어그램 적용 결과 예시
print(data_words[0][:10])
print(data_words_bigrams[0][:10])



['nanomet', 'technolog', 'circuit', 'sensit', 'various', 'kind', 'perturb', 'alpha', 'particl', 'atmospher']
['nanomet', 'technolog', 'circuit', 'sensit', 'various', 'kind', 'perturb', 'alpha', 'particl', 'atmospher']


In [159]:
#토픽 모델링 적용을 위해 숫자로 바꿈
#단어를 숫자에 대응시켜 dictionary 만듦
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams
corpus = [id2word.doc2bow(text) for text in texts]

[(id2word[id], freq) for id, freq in corpus[0]][:5]

[('affect', 1), ('algorithm', 1), ('allow', 1), ('alpha', 1), ('alreadi', 1)]

In [211]:
#abstract 전체를 대상으로 lda_model 만들기
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=22,
random_state=100,
update_every=1,
chunksize=100,
passes=10,
alpha='auto',
per_word_topics=True)

In [212]:
topicTable = pd.DataFrame([id2word[word[0]] for word in lda_model.get_topic_terms(idx)] for idx in range(22))
topicTable_prob = pd.DataFrame([(id2word[word[0]],word[1]) for word in lda_model.get_topic_terms(idx)] for idx in range(22))

In [213]:
topicTable_prob

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"(content, 0.03481627)","(devic, 0.029511115)","(secur, 0.027007883)","(core, 0.024588741)","(polici, 0.02296507)","(trace, 0.019776434)","(monitor, 0.015826534)","(densiti, 0.0151586365)","(protect, 0.012949498)","(internet, 0.012337405)"
1,"(network, 0.17591606)","(protocol, 0.054676317)","(rout, 0.034141757)","(throughput, 0.032034233)","(traffic, 0.030798348)","(delay, 0.02824819)","(packet, 0.027234191)","(path, 0.024248369)","(layer, 0.02190381)","(messag, 0.021781469)"
2,"(data, 0.08707803)","(perform, 0.033414133)","(effici, 0.03329617)","(high, 0.029266262)","(memori, 0.022525558)","(reduc, 0.02161271)","(increas, 0.021446768)","(larg, 0.02129235)","(applic, 0.021052113)","(cost, 0.019944394)"
3,"(servic, 0.038766667)","(queri, 0.033405185)","(access, 0.028149812)","(resourc, 0.024501778)","(provid, 0.02146917)","(user, 0.019265555)","(dynam, 0.019251298)","(server, 0.018944414)","(virtual, 0.017511986)","(manag, 0.01703152)"
4,"(user, 0.04478678)","(knowledg, 0.027454466)","(research, 0.026631985)","(context, 0.022028357)","(learn, 0.020627087)","(activ, 0.01696049)","(studi, 0.016699495)","(inform, 0.014762783)","(open, 0.013874373)","(experi, 0.012270857)"
5,"(model, 0.29688004)","(learn, 0.051753137)","(predict, 0.040991444)","(face, 0.038943272)","(reliabl, 0.031159632)","(train, 0.021099577)","(label, 0.019153398)","(gene, 0.016075937)","(dataset, 0.013402486)","(essenti, 0.011318889)"
6,"(time, 0.07371933)","(detect, 0.043278616)","(scheme, 0.034461334)","(error, 0.031906344)","(fault, 0.021139128)","(correct, 0.01665754)","(speed, 0.015875144)","(overhead, 0.014584105)","(failur, 0.01366584)","(number, 0.013306994)"
7,"(rate, 0.07788622)","(channel, 0.06444228)","(block, 0.03881254)","(cell, 0.036860466)","(stream, 0.036084086)","(mode, 0.020832857)","(queue, 0.020788398)","(tcp, 0.017152503)","(fair, 0.014451133)","(mac, 0.013070999)"
8,"(test, 0.19759233)","(sequenc, 0.062545426)","(generat, 0.031360477)","(compress, 0.024270274)","(scan, 0.02066949)","(conform, 0.015362364)","(tactil, 0.014935452)","(pattern, 0.01464765)","(list, 0.014456461)","(haptic, 0.0137091)"
9,"(use, 0.031002562)","(base, 0.029982818)","(result, 0.026144892)","(propos, 0.02569266)","(method, 0.02525819)","(approach, 0.019675432)","(two, 0.015292687)","(paper, 0.014179381)","(show, 0.013468007)","(featur, 0.012551581)"


In [214]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.
 
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.588788657958044

Coherence Score:  0.39789103795694775


In [172]:
len(data_words)

9238

In [173]:
import re
from nltk.util import ngrams

In [189]:
realTopics = sample.topic
realTopics = realTopics.drop_duplicates()
realTopics.index = range(len(realTopics.index))

In [204]:
realTopics.head()

0    IEEE Transactions on Very Large Scale Integrat...
1           IEEE Transactions on Sustainable Computing
2                IEEE Transactions on Mobile Computing
3    IEEE Transactions on Visualization & Computer ...
4                       IEEE Transactions on Computers
Name: topic, dtype: object

In [207]:
realTopics[0][realTopics[0].find(' on ')+4:]

'Very Large Scale Integration (VLSI) Systems'

In [209]:
realTopics = pd.DataFrame([realTopics[idx][realTopics[idx].find(' on ')+4:] for idx in range(0,len(realTopics))])

In [210]:
realTopics

Unnamed: 0,0
0,Very Large Scale Integration (VLSI) Systems
1,Sustainable Computing
2,Mobile Computing
3,Visualization & Computer Graphics
4,Computers
5,Computational Biology and Bioinformatics
6,Networking
7,Pattern Analysis & Machine Intelligence
8,Knowledge & Data Engineering
9,Parallel & Distributed Systems


In [284]:
absTopicTable = pd.DataFrame(columns=[x for x in range(22)])
absTopicTable

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21


In [312]:
for idx in range(len(corpus)):
    if(idx%500==0):print(idx)
    for freq in lda_model.get_document_topics(corpus[idx]):
        absTopicTable.loc[idx,freq[0]] = freq[1]

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000


In [314]:
absTopicTable.to_pickle('absTopicTable.p')

In [316]:
absTopicTable.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.0264586,,0.0748074,,,,0.372525,0.0147686,,0.196492,...,0.0257186,0.0611155,,0.0288739,,,0.0857618,,,0.0554868
1,0.0406124,0.0401232,0.0867642,0.0914229,,,,,,0.289112,...,0.0590794,,,0.0124912,,0.0398823,0.0716657,0.0105233,0.0763858,0.135857
2,0.073272,0.0977627,0.166925,,,,0.0290835,,,0.0821929,...,0.0165454,0.124295,0.0146175,0.0113362,,0.01487,0.204606,,0.113584,0.0215074
3,,0.0216543,0.0205755,0.032888,0.0824149,0.0487345,0.0278294,0.0181438,,0.398592,...,0.0155759,,,0.0212335,0.0347793,,0.13304,,0.0149362,0.108836
4,,0.0174142,0.127386,,0.0284587,,0.0103823,0.0183275,0.0171053,0.257895,...,0.140703,,0.0994693,0.0196428,0.0809518,,0.0799719,,,0.0582868
5,,,0.0333037,0.0124767,0.132477,0.0255449,0.0349989,,,0.234932,...,0.276819,,,0.0184849,0.0138554,,0.0607862,,,0.0720462
6,0.0178618,0.0213088,0.0344995,0.0339309,0.0106329,0.0297278,0.262009,,0.0692466,0.194733,...,0.0271946,,,,,,0.161347,,,0.0666573
7,,,0.024404,,,0.0197542,0.01233,,,0.170276,...,0.0522799,,,0.0310403,0.215112,0.0128635,0.0791337,0.0261679,,0.311117
8,,0.22166,0.0409592,,0.0174041,0.0290938,0.144857,,,0.207307,...,0.0373832,,0.0203708,,,,0.141382,,0.0743958,0.021834
9,,0.0464807,0.269293,,0.082552,0.0177476,0.150841,,,0.151771,...,0.0213852,,0.0554789,0.0183208,,,0.0390572,,0.0838348,0.027986


In [326]:
absTopicTable = absTopicTable.fillna(0)

In [331]:
absTopicTable.iloc[absTopicTable.idxmax(),:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
40,0.375197,0.073147,0.035409,0.0179,0.025165,0.0,0.0,0.018816,0.0,0.176614,...,0.037485,0.0,0.025771,0.0,0.0,0.0,0.095696,0.0,0.0,0.07784
6625,0.0,0.383116,0.061484,0.0,0.012326,0.016436,0.010238,0.035815,0.0,0.247137,...,0.019718,0.0,0.0,0.0,0.0,0.0,0.099797,0.0,0.022556,0.051891
1171,0.0,0.0,0.446084,0.063783,0.019297,0.0,0.025862,0.0,0.0,0.10344,...,0.010759,0.131737,0.0,0.0,0.0,0.059923,0.039783,0.0,0.0,0.046884
245,0.0,0.0,0.01172,0.473698,0.0,0.025413,0.037175,0.043534,0.0,0.185767,...,0.019812,0.0,0.0,0.041333,0.0,0.0,0.069665,0.0,0.0,0.053426
29,0.036342,0.0,0.032795,0.030124,0.413048,0.014064,0.019756,0.0,0.0,0.112759,...,0.034028,0.0,0.067003,0.0,0.0,0.033007,0.110891,0.014198,0.043149,0.013451
67,0.0,0.0,0.03449,0.046809,0.017037,0.294829,0.017327,0.0,0.0,0.178831,...,0.084654,0.0,0.0,0.0,0.157225,0.0,0.069622,0.0,0.013923,0.054767
55,0.0,0.0,0.060383,0.0,0.0,0.0,0.443463,0.0,0.0,0.147419,...,0.039391,0.08899,0.0,0.03568,0.0,0.0,0.039375,0.0,0.0,0.069232
237,0.0,0.0,0.079475,0.0,0.049955,0.027489,0.069853,0.291619,0.0,0.186092,...,0.025925,0.0,0.018767,0.014375,0.0,0.0,0.050445,0.010466,0.0,0.140447
458,0.0,0.0,0.116526,0.0,0.0,0.0,0.019117,0.0,0.388594,0.10595,...,0.030963,0.0,0.0,0.092671,0.094215,0.0,0.052398,0.0,0.0,0.025902
2637,0.0,0.0,0.116948,0.0,0.022645,0.0,0.02155,0.0,0.0,0.671576,...,0.022991,0.0,0.0,0.0,0.0,0.0,0.02837,0.0,0.0,0.054736


In [337]:
sample.loc[absTopicTable.idxmax(),'topic']

40      IEEE Transactions on Dependable and Secure Com...
6625                  IEEE/ACM Transactions on Networking
1171    IEEE Transactions on Very Large Scale Integrat...
245     IEEE Transactions on Parallel & Distributed Sy...
29             IEEE Transactions on Learning Technologies
67      IEEE Transactions on Pattern Analysis & Machin...
55      IEEE Transactions on Very Large Scale Integrat...
237     IEEE Transactions on Parallel & Distributed Sy...
458     IEEE/ACM Transactions on Computational Biology...
2637    IEEE Transactions on Pattern Analysis & Machin...
4223    IEEE Transactions on Parallel & Distributed Sy...
5710                       IEEE Transactions on Computers
30      IEEE Transactions on Visualization & Computer ...
1158    IEEE Transactions on Very Large Scale Integrat...
5679    IEEE Transactions on Knowledge & Data Engineering
62                         IEEE Transactions on Computers
34      IEEE Transactions on Pattern Analysis & Machin...
87            

In [338]:
import networkx as nx
import matplotlib.pyplot as plt

In [346]:
from gensim.models import Word2Vec
embedding_model = Word2Vec(data_words, size=100, window = 2, min_count=50, workers=4, iter=100, sg=1)

In [354]:
topicTable.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,content,devic,secur,core,polici,trace,monitor,densiti,protect,internet
1,network,protocol,rout,throughput,traffic,delay,packet,path,layer,messag
2,data,perform,effici,high,memori,reduc,increas,larg,applic,cost
3,servic,queri,access,resourc,provid,user,dynam,server,virtual,manag
4,user,knowledg,research,context,learn,activ,studi,inform,open,experi


In [353]:
# check embedding result
print(embedding_model.most_similar(positive=["data"], topn=10))

[('stream', 0.440653532743454), ('dataset', 0.43918657302856445), ('inform', 0.43144863843917847), ('video', 0.39131081104278564), ('massiv', 0.38950109481811523), ('imag', 0.38423216342926025), ('collect', 0.37133708596229553), ('set', 0.36159753799438477), ('databas', 0.3574773073196411), ('larg', 0.35470113158226013)]
