In [1]:
import pandas as pd
import numpy as np
from itertools import islice
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Relative path to the xz-compressed CSV that the next cell reads with pandas.
# NOTE(review): this name rebinds Python's builtin `input()` for the rest of the
# kernel session; renaming it (e.g. INPUT_PATH) also requires updating the
# read_csv call that consumes it.
input = './data/out/preprocess.csv.xz'

In [3]:
# Read the xz-compressed CSV and discard the 'id' column in one chained
# expression, so `df` is always rebuilt from the file when this cell runs.
df = pd.read_csv(input, compression='xz').drop(columns=['id'])
df.shape

(9130, 1)

In [4]:
stemmer = SnowballStemmer("english")


def _stem_title(title):
    """Return `title` with every space-separated token replaced by its stem."""
    return ' '.join(stemmer.stem(token) for token in title.split(' '))


# Overwrites the 'titulo' column with its stemmed form; the cells below read
# this column, so re-running only this cell stems already-stemmed text.
df['titulo'] = df['titulo'].map(_stem_title)

In [5]:
# Build the bag-of-n-grams vocabulary from the stemmed titles:
#   - unigrams up to trigrams,
#   - terms must appear in at least 0.25% and at most 10% of the titles,
#   - scikit-learn's built-in English stop-word list is applied.
# NOTE(review): the titles are stemmed before this step, so stop words whose
# stem differs from the listed form may slip through -- confirm this is intended.
cvec = CountVectorizer(
    stop_words='english',
    min_df=.0025,
    max_df=.1,
    ngram_range=(1, 3),
).fit(df['titulo'])

# Peek at the first 20 (term, column index) pairs of the learned vocabulary.
list(islice(cvec.vocabulary_.items(), 20))

[('exploit', 233),
 ('locat', 360),
 ('direct', 183),
 ('bas', 57),
 ('discoveri', 184),
 ('standard', 617),
 ('social', 593),
 ('media', 384),
 ('urban', 691),
 ('issu', 333),
 ('social media', 594),
 ('framework', 262),
 ('spatial', 609),
 ('analyt', 26),
 ('heterogen', 289),
 ('data', 156),
 ('sourc', 607),
 ('gene', 269),
 ('express', 236),
 ('analysi', 25)]

In [6]:
# Number of distinct terms (1- to 3-grams) retained by the fitted vectorizer.
len(cvec.vocabulary_)

719

In [7]:
# Document-term count matrix: one row per title, one column per vocabulary term.
cvec_counts = cvec.transform(df['titulo'])

n_docs, n_terms = cvec_counts.shape
# Fraction of cells that hold a non-zero count. This is the matrix *density*;
# sparsity is its complement (1 - density), so the label below says "density".
density = cvec_counts.nnz / (n_docs * n_terms)

print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('density: %.2f%%' % (100.0 * density))

sparse matrix shape: (9130, 719)
nonzero count: 49804
sparsity: 0.76%


In [8]:
# Total number of occurrences of each term across all titles (column sums).
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back for older installs.
feature_names = (getattr(cvec, 'get_feature_names_out', None) or cvec.get_feature_names)()

counts_df = pd.DataFrame({'term': feature_names, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
398,model,858
421,network,764
25,analysi,708
60,base,640
156,data,599
57,bas,570
30,approach,523
628,studi,474
19,algorithm,434
345,learn,432


In [9]:
# Fit the TF-IDF transformer on the raw term counts, then apply it to the same
# matrix to obtain the weighted document-term matrix.
transformer = TfidfTransformer().fit(cvec_counts)
transformed_weights = transformer.transform(cvec_counts)
transformed_weights

<9130x719 sparse matrix of type '<class 'numpy.float64'>'
	with 49804 stored elements in Compressed Sparse Row format>

In [12]:
# Mean TF-IDF weight of each term over all titles (column-wise mean).
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back for older installs.
feature_names = (getattr(cvec, 'get_feature_names_out', None) or cvec.get_feature_names)()

weights_df = pd.DataFrame({'term': feature_names, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
398,model,0.025522
25,analysi,0.022144
421,network,0.021979
60,base,0.01903
156,data,0.019027
57,bas,0.017943
30,approach,0.016927
628,studi,0.015076
19,algorithm,0.014465
345,learn,0.014319


In [15]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back for older installs.
feature_names = (getattr(cvec, 'get_feature_names_out', None) or cvec.get_feature_names)()

# Dense view of the TF-IDF matrix with one labelled column per term.
# toarray() materialises every cell, which is fine at this size (9130 x 719)
# but will not scale to much larger vocabularies.
pd.DataFrame(transformed_weights.toarray(), columns=feature_names)

Unnamed: 0,3d,academ,acceler,access,accuraci,action,activ,adapt,adopt,advanc,...,water,web,weight,wind,wireless,wireless sensor,wireless sensor network,work,workflow,world
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.592943,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9125,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9126,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9127,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9128,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
