In [1]:
import pandas as pd
import numpy as np
from itertools import islice
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Year of the preprocessed dataset this notebook analyses.
select_year = 2017
# Build the path of the xz-compressed CSV for that year from a template.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here because the loading cell reads it.
path_template = './data/out/%dpreprocess.csv.xz'
input = path_template % (select_year,)

In [3]:
# Read the xz-compressed CSV located at `input` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=input, compression="xz")
# Display the (rows, columns) tuple of the loaded frame.
df.shape

(3111, 1)

In [4]:
# Replace every title with its stemmed form using the English Snowball stemmer.
stemmer = SnowballStemmer("english")


def _stem_title(title):
    """Return `title` with each space-separated token replaced by its stem.

    Non-string values (for example the NaN that `pd.read_csv` produces for an
    empty field) are mapped to an empty string instead of raising
    AttributeError on `.split`.
    """
    if not isinstance(title, str):
        return ''
    # split(' ') (not split()) so the spacing of the original title is preserved.
    return ' '.join(stemmer.stem(token) for token in title.split(' '))


# NOTE: this overwrites df['titulo'] in place, so re-running the cell stems
# text that has already been stemmed; reload `df` before re-running it.
df['titulo'] = df['titulo'].map(_stem_title)

In [5]:
# Fit a bag-of-words vocabulary (unigrams to trigrams) on the titles, then
# preview the first 20 (term, column index) pairs of the learned vocabulary.
# NOTE(review): the built-in English stop-word list is applied to titles that
# were already stemmed; stems that differ from the listed word form would not
# be filtered - confirm this is intended.
cvec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=.0005,
    max_df=.15,
)
cvec.fit(df['titulo'])
vocabulary_preview = islice(cvec.vocabulary_.items(), 20)
list(vocabulary_preview)

[('exploit', 8811),
 ('photo', 17378),
 ('locat', 13217),
 ('direct', 6918),
 ('clusteringbas', 4269),
 ('pointsofinterest', 17647),
 ('discoveri', 6956),
 ('goldstandard', 10240),
 ('social', 21423),
 ('media', 13940),
 ('corpus', 5398),
 ('urban', 24345),
 ('issu', 12372),
 ('framework', 9591),
 ('spatial', 21770),
 ('analyt', 1191),
 ('heterogen', 10667),
 ('data', 5830),
 ('sourc', 21688),
 ('exploit photo', 8828)]

In [6]:
# Number of distinct terms kept in the fitted vocabulary.
vocab_size = len(cvec.vocabulary_)
vocab_size

25714

In [7]:
# Encode every title as a row of term counts over the fitted vocabulary.
cvec_counts = cvec.transform(df['titulo'])
n_rows, n_cols = cvec_counts.shape
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
# nnz / (rows * cols) is the fraction of cells that are non-zero, i.e. the
# matrix *density*. It was previously printed under the label "sparsity",
# which is the complement (100% - density).
density_pct = 100.0 * cvec_counts.nnz / (n_rows * n_cols)
print('density: %.2f%%' % density_pct)

sparse matrix shape: (3111, 25714)
nonzero count: 107694
sparsity: 0.13%


In [8]:
# Total occurrences of each term: column sums of the count matrix, flattened
# to a plain list.
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
if hasattr(cvec, 'get_feature_names_out'):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()
counts_df = pd.DataFrame({'term': terms, 'occurrences': occ})
# Show the 20 most frequent terms.
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
5830,data,576
1561,approach,523
22328,studi,476
717,algorithm,434
12774,learn,407
21471,softwar,400
1464,applic,368
8422,evalu,354
18155,problem,351
14114,method,331


In [9]:
# Fit a TfidfTransformer on the raw term counts, then re-weight that same
# count matrix with it; the last line displays the resulting sparse matrix.
transformer = TfidfTransformer()
transformer.fit(cvec_counts)
transformed_weights = transformer.transform(cvec_counts)
transformed_weights

<3111x25714 sparse matrix of type '<class 'numpy.float64'>'
	with 107694 stored elements in Compressed Sparse Row format>

In [10]:
# Mean TF-IDF weight of each term over all titles (column means), flattened
# to a plain list.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
if hasattr(cvec, 'get_feature_names_out'):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()
weights_df = pd.DataFrame({'term': terms, 'weight': weights})
# Show the 20 terms with the highest mean weight.
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
5830,data,0.012218
1561,approach,0.011334
22328,studi,0.010269
717,algorithm,0.010018
12774,learn,0.009732
21471,softwar,0.008948
18155,problem,0.008689
1464,applic,0.008523
14114,method,0.008353
6478,detect,0.007915


In [11]:
# Display the TF-IDF matrix as a DataFrame with one column per vocabulary term.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
if hasattr(cvec, 'get_feature_names_out'):
    tfidf_columns = cvec.get_feature_names_out()
else:
    tfidf_columns = cvec.get_feature_names()
# NOTE: toarray() materialises the whole matrix densely (rows x vocabulary
# size floats), which can be memory-hungry for a large vocabulary.
pd.DataFrame(transformed_weights.toarray(), columns=tfidf_columns)

Unnamed: 0,02,03h1,03h1 proteobacterium,03h1 proteobacterium obtain,10,10 1007,100,1000,1000 genom,1000 genom databas,...,µbrkga parallel bias,água,área,árvore,éu,único,único saúd,βglucosidas,βglucosidas use,βglucosidas use biofuel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
