In [1]:
import pandas as pd
import numpy as np
from itertools import islice

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Year of the preprocessed dataset to analyse.
select_year = 2017
# Path of the xz-compressed CSV for that year; the year is prefixed to the file name.
# NOTE(review): `input` shadows the Python builtin of the same name. It is kept
# because the loading cell reads this variable.
input = './data/out/%dpreprocess.csv.xz' % (select_year,)

In [3]:
# Load the preprocessed CSV (xz-compressed) and print a summary of its columns.
df = pd.read_csv(filepath_or_buffer=input, compression='xz')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3055 entries, 0 to 3054
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   titulo  3055 non-null   object
dtypes: object(1)
memory usage: 24.0+ KB


In [4]:
stemmer = SnowballStemmer("english")


def _stem_title(title):
    """Stem each space-separated token of ``title`` and re-join the stems with single spaces."""
    return ' '.join(stemmer.stem(token) for token in title.split(' '))


# Overwrites the 'titulo' column with its stemmed form; later cells read this column.
df['titulo'] = df['titulo'].map(_stem_title)

In [5]:
# Fit a unigram + bigram vocabulary on the stemmed titles and preview 20 of its
# (term, column index) entries. With float thresholds, min_df / max_df are
# document-frequency proportions: terms must occur in at least 0.25% and at
# most 50% of the titles to be kept.
cvec = CountVectorizer(
    stop_words='english',
    min_df=.0025,
    max_df=.5,
    ngram_range=(1, 2),
)
cvec.fit(df['titulo'])
list(islice(cvec.vocabulary_.items(), 20))

[('exploit', 566),
 ('locat', 890),
 ('direct', 453),
 ('bas', 138),
 ('discoveri', 455),
 ('gold', 674),
 ('standard', 1496),
 ('social', 1445),
 ('media', 935),
 ('corpus', 348),
 ('urban', 1651),
 ('issu', 824),
 ('framework', 629),
 ('spatial', 1478),
 ('analyt', 65),
 ('use', 1654),
 ('heterogen', 706),
 ('data', 383),
 ('sourc', 1468),
 ('social media', 1446)]

In [6]:
# Number of distinct terms (unigrams and bigrams) kept in the fitted vocabulary.
len(cvec.vocabulary_)

1738

In [7]:
# Turn the titles into a document-term count matrix using the fitted vocabulary.
cvec_counts = cvec.transform(df['titulo'])
n_docs, n_terms = cvec_counts.shape
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
# nnz / (rows * cols) is the share of cells that are *filled*, i.e. the density.
# The previous label "sparsity" was wrong: sparsity is 100% minus this value.
print('density: %.2f%%' % (100.0 * cvec_counts.nnz / (n_docs * n_terms)))

sparse matrix shape: (3055, 1738)
nonzero count: 52768
sparsity: 0.99%


In [8]:
# Total occurrences of each term over all titles (column sums of the count matrix).
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, 'get_feature_names_out'):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()
counts_df = pd.DataFrame({'term': terms, 'occurrences': occ})
# Show the 20 most frequent terms.
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
1654,use,1131
978,model,856
1031,network,765
61,analysi,706
144,base,636
383,data,592
138,bas,568
81,approach,522
1520,studi,472
46,algorithm,433


In [9]:
# Learn IDF weights from the count matrix, then re-weight the same matrix to TF-IDF.
transformer = TfidfTransformer().fit(cvec_counts)
transformed_weights = transformer.transform(cvec_counts)
transformed_weights

<3055x1738 sparse matrix of type '<class 'numpy.float64'>'
	with 52768 stored elements in Compressed Sparse Row format>

In [10]:
# Mean TF-IDF weight of each term over all titles (column means of the TF-IDF matrix).
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, 'get_feature_names_out'):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()
weights_df = pd.DataFrame({'term': terms, 'weight': weights})
# Show the 20 terms with the highest average weight.
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
1654,use,0.030496
978,model,0.026791
1031,network,0.024483
61,analysi,0.022708
383,data,0.020849
144,base,0.02066
138,bas,0.018881
81,approach,0.018515
1520,studi,0.016754
46,algorithm,0.01651


In [11]:
# Factorise the TF-IDF matrix into topics with NMF and print the top terms of each.
n_topics = 50      # number of NMF components (topics) to extract
n_top_terms = 10   # how many terms to print per topic

# random_state pins the initialisation so a fresh "Restart & Run All" reproduces
# the same topics; without it the factorisation can differ between runs.
nmf = NMF(n_components=n_topics, solver="mu", random_state=0)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, 'get_feature_names_out'):
    idx_to_word = np.asarray(cvec.get_feature_names_out())
else:
    idx_to_word = np.array(cvec.get_feature_names())

W = nmf.fit_transform(transformed_weights)  # per-document topic weights
H = nmf.components_                         # per-topic term weights

for i, topic in enumerate(H):
    # argsort is ascending, so the last n_top_terms indices are the heaviest
    # terms; they are printed weakest-to-strongest (strongest term last).
    top_terms = idx_to_word[topic.argsort()[-n_top_terms:]]
    print("Topic {}: {}".format(i + 1, ",".join(str(term) for term in top_terms)))

Topic 1: effici,energi,design,generat,simul,improv,low,parallel,high,perform
Topic 2: code,softwar ecosystem,requir,ecosystem,softwar develop,studi,project,softwar engin,engin,softwar
Topic 3: rout,search,solv,schedul problem,schedul,heurist,genet algorithm,genet,algorithm,problem
Topic 4: vehicular,optic,alloc,optic network,traffic,rout,defin,servic,manag,network
Topic 5: deep,use,recognit,artifici neural,artifici,convolut neural,convolut,network,neural network,neural
Topic 6: deep,teach,virtual learn,machin learn,machin,learn object,educ,environ,learn environ,learn
Topic 7: data stream,scientif workflow,open data,visual,workflow,scientif,big data,open,big,data
Topic 8: strategi,compar studi,educ,use,studi brazilian,analysi,sustain,case studi,case,studi
Topic 9: bas model,driven,model use,data,estim,simul,integr,conceptu,model base,model
Topic 10: rank,complex,number,larg,threshold,convex,modal,color,edge,graph
Topic 11: concept,approach,analysi base,scientif,measur,network,social net