In [1]:
import pandas as pd
import numpy as np
from itertools import islice

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
select_year = 2017
input = './data/out/%dpreprocess.csv.xz' % select_year

In [3]:
df = pd.read_csv(input, compression='xz')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3055 entries, 0 to 3054
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   titulo  3055 non-null   object
dtypes: object(1)
memory usage: 24.0+ KB


In [4]:
stemmer = SnowballStemmer("english")
df['titulo'] = df['titulo'].map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))

In [5]:
# Initialize the vectorizer with new settings and check the new vocabulary length
cvec = CountVectorizer(stop_words='english', min_df=.0010, max_df=.5, ngram_range=(1,2))
cvec.fit(df['titulo'])
list(islice(cvec.vocabulary_.items(), 20))
#len(cvec.vocabulary_)

[('exploit', 1493),
 ('photo', 2942),
 ('locat', 2247),
 ('direct', 1194),
 ('bas', 393),
 ('discoveri', 1205),
 ('gold', 1744),
 ('standard', 3753),
 ('social', 3652),
 ('media', 2372),
 ('corpus', 918),
 ('urban', 4130),
 ('issu', 2097),
 ('framework', 1644),
 ('spatial', 3708),
 ('analyt', 189),
 ('use', 4139),
 ('heterogen', 1822),
 ('data', 1005),
 ('sourc', 3690)]

In [6]:
len(cvec.vocabulary_)

4360

In [7]:
cvec_counts = cvec.transform(df['titulo'])
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (3055, 4360)
nonzero count: 65589
sparsity: 0.49%


In [8]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
4139,use,1131
2486,model,856
2638,network,765
161,analysi,706
419,base,636
1005,data,592
393,bas,568
231,approach,522
3809,studi,472
121,algorithm,433


In [9]:
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

<3055x4360 sparse matrix of type '<class 'numpy.float64'>'
	with 65589 stored elements in Compressed Sparse Row format>

In [10]:
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

cvec_counts = cvec.transform(df['titulo'])
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (3055, 4360)
nonzero count: 65589
sparsity: 0.49%


In [19]:
#pd.DataFrame(transformed_weights.toarray(), columns=cvec.get_feature_names())

nmf = NMF(n_components=50, solver="mu")
idx_to_word = np.array(cvec.get_feature_names())
W = nmf.fit_transform(transformed_weights)
H = nmf.components_

for i, topic in enumerate(H):
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: statist,social network,human,network,analysi base,approach,network analysi,measur,use,analysi
Topic 2: code,ecosystem,studi,agil,softwar engin,project,engin,softwar develop,develop,softwar
Topic 3: schedul problem,rout problem,solv,rout,schedul,heurist,genet algorithm,genet,algorithm,problem
Topic 4: wireless,mobil,alloc,rout,defin,virtual,optic network,optic,manag,network
Topic 5: deep,recognit,artifici neural,artifici,classif,convolut neural,convolut,network,neural network,neural
Topic 6: collabor,machin learn,machin,virtual learn,learn object,educ,virtual,learn environ,environ,learn
Topic 7: framework,transpar,classif,stream,data stream,big data,open data,big,open,data
Topic 8: amazon,social network,studi brazilian,urban,activ,public,social,industri,research,brazilian
Topic 9: framework,forecast,integr,estim,model use,bas model,model driven,driven,model base,model
Topic 10: larg,threshold,calculus,number,color graph,convex,path,edge,color,graph
Topic 11: classif,configur,br

In [12]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=.0010, max_df=.5, ngram_range=(1,2))
tfidf = tfidf_vectorizer.fit_transform(df.titulo)
weights = np.asarray(tfidf.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tfidf_vectorizer.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)



Unnamed: 0,term,weight
4139,use,0.025585
2486,model,0.022534
2638,network,0.020671
161,analysi,0.018871
1005,data,0.017372
419,base,0.017317
393,bas,0.015733
231,approach,0.015379
2158,learn,0.014275
3809,studi,0.01401


In [13]:
print('vocabulary:', len(tfidf_vectorizer.vocabulary_))
print('sparse matrix shape:', tfidf.shape)
print('nonzero count:', tfidf.nnz)
print('sparsity: %.2f%%' % (100.0 * tfidf.nnz / (tfidf.shape[0] * tfidf.shape[1])))

vocabulary: 4360
sparse matrix shape: (3055, 4360)
nonzero count: 65589
sparsity: 0.49%


In [40]:
nmf = NMF(n_components=50, solver="mu")
idx_to_word = np.array(tfidf_vectorizer.get_feature_names())
W = nmf.fit_transform(tfidf)
H = nmf.components_

for i, topic in enumerate(H):
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: social,social network,analysi base,use,network analysi,identif,measur,gene,network,analysi
Topic 2: code,studi,ecosystem,agil,softwar engin,project,engin,softwar develop,develop,softwar
Topic 3: schedul problem,solv,rout problem,rout,schedul,heurist,genet algorithm,genet,algorithm,problem
Topic 4: channel,defin,rout,lifetim,manag,wireless sensor,sensor network,wireless,sensor,network
Topic 5: learn,imag,recognit,deep,convolut neural,classif,convolut,neural network,neural,network
Topic 6: object,context,reinforc learn,virtual learn,machin learn,machin,learn object,environ,learn environ,learn
Topic 7: data analysi,classif,transpar,data stream,stream,big data,open data,big,open,data
Topic 8: analysi brazilian,technolog,scienc,transfer,studi brazilian,urban traffic,amazon,research,public,brazilian
Topic 9: conceptu,framework,model use,cellular,integr,model driven,driven,model base,base,model
Topic 10: modal,complex,number,calculus,color graph,convex,path,edge,color,graph
Topic 11:

In [39]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

no_top_words = 10
display_topics(nmf,idx_to_word,no_top_words)


Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,...,Topic 45 words,Topic 45 weights,Topic 46 words,Topic 46 weights,Topic 47 words,Topic 47 weights,Topic 48 words,Topic 48 weights,Topic 49 words,Topic 49 weights
0,analysi,2.8,softwar,2.2,problem,1.9,network,1.3,neural,1.5,...,fuzzi,1.6,ontolog,2.0,fault,1.4,adapt,2.0,classif,2.0
1,network,0.4,develop,1.2,algorithm,1.1,sensor,1.2,neural network,1.4,...,cluster,1.2,semant,0.6,toler,0.9,implement,0.5,deep,0.5
2,approach,0.3,softwar develop,0.6,genet,0.5,wireless,1.2,network,1.4,...,fuzzi cluster,0.6,relat,0.5,fault toler,0.8,stream,0.4,classif use,0.4
3,measur,0.3,engin,0.6,genet algorithm,0.4,sensor network,1.1,convolut,0.6,...,method,0.5,entiti,0.4,diagnosi,0.6,effici,0.4,deep learn,0.4
4,social,0.2,softwar engin,0.4,heurist,0.4,wireless sensor,1.1,convolut neural,0.5,...,multivari,0.4,portugues,0.4,power,0.4,languag,0.4,label,0.3
5,use,0.2,ecosystem,0.3,schedul,0.4,lifetim,0.2,artifici,0.5,...,cluster method,0.4,conceptu,0.4,reconfigur,0.4,self,0.3,data classif,0.3
6,social network,0.2,agil,0.3,rout,0.3,channel,0.2,artifici neural,0.5,...,dimension,0.3,align,0.3,storag,0.3,data stream,0.3,textur,0.3
7,analysi base,0.2,studi,0.3,rout problem,0.3,defin,0.2,recognit,0.3,...,membership,0.3,domain,0.3,fault diagnosi,0.3,automata,0.2,imag classif,0.3
8,network analysi,0.2,project,0.3,schedul problem,0.3,wireless network,0.2,deep,0.2,...,use fuzzi,0.3,represent,0.3,automat,0.3,self adapt,0.2,automat,0.3
9,complex,0.2,softwar ecosystem,0.2,solv,0.3,rout,0.2,imag,0.2,...,strategi,0.3,goal,0.3,motor,0.3,p2p,0.2,hierarch,0.2
