In [1]:
import pandas as pd
import numpy as np
from itertools import islice
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Year used to build the name of the preprocessed input file.
select_year = 2017
# Relative path of the xz-compressed CSV for that year.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the data-loading cell reads it.
input = './data/out/%dpreprocess.csv.xz' % (select_year,)

In [3]:
# Read the xz-compressed CSV at the configured path into a DataFrame, then
# print a summary of its columns, dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer=input, compression='xz')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3111 entries, 0 to 3110
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   titulo  3111 non-null   object
dtypes: object(1)
memory usage: 24.4+ KB


In [4]:
# Stem every title with the English Snowball stemmer: split on single spaces,
# stem each token, and join the stems back with single spaces.
# NOTE: the 'titulo' column is overwritten with its stemmed form, so running
# this cell again stems the already-stemmed text a second time.
stemmer = SnowballStemmer("english")
df['titulo'] = df['titulo'].map(
    lambda title: ' '.join(stemmer.stem(token) for token in title.split(' '))
)

In [5]:
# Learn a unigram + bigram vocabulary from the titles. English stop words are
# removed, and min_df / max_df (given as fractions of the documents) drop
# terms that are too rare or too common.
cvec = CountVectorizer(
    stop_words='english',
    min_df=.0025,
    max_df=.15,
    ngram_range=(1, 2),
)
cvec.fit(df['titulo'])

# Preview the first 20 (term, column index) pairs of the fitted vocabulary.
list(islice(cvec.vocabulary_.items(), 20))

[('exploit', 583),
 ('locat', 913),
 ('direct', 464),
 ('bas', 140),
 ('discoveri', 466),
 ('gold', 694),
 ('standard', 1525),
 ('social', 1474),
 ('media', 958),
 ('corpus', 355),
 ('urban', 1681),
 ('issu', 845),
 ('framework', 647),
 ('spatial', 1507),
 ('analyt', 65),
 ('heterogen', 727),
 ('data', 393),
 ('sourc', 1497),
 ('social media', 1475),
 ('gene', 672)]

In [6]:
# Number of distinct terms (unigrams and bigrams) kept in the fitted vocabulary.
len(cvec.vocabulary_)

1768

In [7]:
# Convert every title into a row of term counts over the fitted vocabulary.
cvec_counts = cvec.transform(df['titulo'])
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
# nnz / (rows * cols) is the share of NON-zero cells, i.e. the matrix density.
# It was previously printed under the label "sparsity", which is the
# complementary quantity (share of zero cells = 100% - density).
print('density: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (3111, 1768)
nonzero count: 50625
sparsity: 0.92%


In [8]:
# Total number of occurrences of each vocabulary term, summed over all titles.
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One row per term; the row index equals the term's column in cvec_counts.
counts_df = pd.DataFrame({'term': feature_names, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

Unnamed: 0,term,occurrences
393,data,600
140,bas,570
83,approach,523
1549,studi,476
47,algorithm,434
879,learn,432
1478,softwar,401
81,applic,368
1226,problem,359
564,evalu,355


In [9]:
# Re-weight the raw term counts with TF-IDF, using the transformer's default
# settings: learn the IDF statistics from the count matrix, then apply them
# to that same matrix.
transformer = TfidfTransformer()
transformed_weights = transformer.fit(cvec_counts).transform(cvec_counts)
transformed_weights

<3111x1768 sparse matrix of type '<class 'numpy.float64'>'
	with 50625 stored elements in Compressed Sparse Row format>

In [10]:
# Mean TF-IDF weight of each term across all titles (column-wise mean of the
# weight matrix, so titles that lack the term contribute zero).
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One row per term; the row index equals the term's column in the matrix.
weights_df = pd.DataFrame({'term': feature_names, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

Unnamed: 0,term,weight
393,data,0.020894
140,bas,0.018739
83,approach,0.018344
1549,studi,0.016634
47,algorithm,0.016358
879,learn,0.016252
1478,softwar,0.014673
1226,problem,0.014559
973,method,0.014058
81,applic,0.014042


In [11]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# Dense view of the TF-IDF matrix: one row per title, one column per term.
# toarray() materialises the whole matrix in memory.
pd.DataFrame(transformed_weights.toarray(), columns=feature_names)

Unnamed: 0,10,15,2015,2017,2d,3d,802,absolut,abstract,ac,...,women,word,work,workflow,workload,world,year,yield,zika,zone
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,0.0,0.0,0.0,0.0,0.497197,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3107,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3108,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3109,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
