In [1]:
import pandas as pd
import numpy as np
import string

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from time import time
from pathlib import Path
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer # PorterStemmer
from nltk.corpus import stopwords

# Base directory for the pickled DataFrames and the extra word lists read/written below.
DATA_PATH = './data'
# Base directory under which the plot images are saved.
PLOT_PATH = './docs/plots'

In [2]:
# Data Read
# Load the DataFrame stored as an xz-compressed pickle under DATA_PATH and time the load.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
data_file = '{}/01pre_lattes.pkl.xz'.format(DATA_PATH)
%time df = pd.read_pickle(data_file, compression='xz')
print('{} df shape'.format(df.shape))
# Last expression of the cell: display the first five rows.
df.head(5)

CPU times: user 465 ms, sys: 24.9 ms, total: 490 ms
Wall time: 502 ms
(172165, 3) df shape


Unnamed: 0,id,titulo,ano
0,101982954114164,3D reconstruction methods for digital preserva...,2014
1,101982954114164,3D Viewer Software Build Based on Scanned Synt...,2016
2,102488447573085,The Globalization Strategy of a High-Tech Mult...,1996
3,102488447573085,Information Systems as an Instrument of Qualit...,2000
4,102488447573085,The Experience of a Cardiology Unit in the Dev...,2002


In [3]:
# Descriptive statistics for word count
# 'wct' holds the number of whitespace-separated tokens in each raw title ('titulo').
df['wct'] = df['titulo'].str.split().str.len()
# Last expression of the cell: display the summary statistics of the counts.
df.wct.describe()

count    172165.000000
mean         10.482055
std           3.873838
min           1.000000
25%           8.000000
50%          10.000000
75%          13.000000
max          53.000000
Name: wct, dtype: float64

In [4]:
# Normalization

# Working copy of the raw titles, placed at column position 2.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the column is reset from 'titulo' instead; the cell can
# then be executed repeatedly and always starts from the raw text.
if 'clean' in df.columns:
    df['clean'] = df.titulo
else:
    df.insert(2, 'clean', df.titulo)

def remove_separation(doc):
    """Return `doc` with every hyphen and slash replaced by a single space."""
    separators_to_space = {ord(char): ' ' for char in '-/'}
    return doc.translate(separators_to_space)

def strip_ponct(doc):
    """Return `doc` with every character listed in string.punctuation removed."""
    # The third argument of str.maketrans lists characters to delete.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return doc.translate(drop_punctuation)

def normaliza(doc):
    """Normalize one title string.

    Hyphens and slashes become spaces, the remaining punctuation is removed,
    and the result is lower-cased.
    """
    return strip_ponct(remove_separation(doc)).lower()

# Normalize every entry of the 'clean' column and report the elapsed wall time in minutes.
t = time()
df.clean = df.clean.apply(normaliza)

print('Time: {} mins'.format(round((time() - t) / 60, 2)))
# Last expression of the cell: display the first ten rows.
df.head(10)

Time: 0.03 mins


Unnamed: 0,id,titulo,clean,ano,wct
0,101982954114164,3D reconstruction methods for digital preserva...,3d reconstruction methods for digital preserva...,2014,11
1,101982954114164,3D Viewer Software Build Based on Scanned Synt...,3d viewer software build based on scanned synt...,2016,12
2,102488447573085,The Globalization Strategy of a High-Tech Mult...,the globalization strategy of a high tech mult...,1996,18
3,102488447573085,Information Systems as an Instrument of Qualit...,information systems as an instrument of qualit...,2000,11
4,102488447573085,The Experience of a Cardiology Unit in the Dev...,the experience of a cardiology unit in the dev...,2002,17
5,102488447573085,Information Systems as an Instrument for Quali...,information systems as an instrument for quali...,2002,8
6,102488447573085,Clustering and Categorization Applied to Crypt...,clustering and categorization applied to crypt...,2006,6
7,102488447573085,Criptoanalisys Outwit using Context Sensitive ...,criptoanalisys outwit using context sensitive ...,2016,6
8,102488447573085,Cryptographic Algorithm Identification Using M...,cryptographic algorithm identification using m...,2016,9
9,103102694865890,Influence of baroclinic sistems in severe rain...,influence of baroclinic sistems in severe rain...,2011,11


In [5]:
# Stopwords / Lemma

# Shared NLTK helpers, used as the default arguments of the functions below.
default_stemmer = SnowballStemmer('english')
default_lemmatizer = WordNetLemmatizer()
default_stopwords = stopwords.words('english')

# Extra words to discard, one per line in extra/noise.txt.
# The context manager closes the file handle once the list has been read.
with open('{}/extra/noise.txt'.format(DATA_PATH), 'r') as noise_file:
    noise = noise_file.read().splitlines()

def remove_stopwords(doc, stop_words=default_stopwords):
    """Return the tokens of `doc` that are not in `stop_words`, in order.

    doc: iterable of tokens.
    stop_words: container of tokens to discard.
    """
    return [token for token in doc if token not in stop_words]

def lemm_text(doc, lemmatizer=default_lemmatizer):
    """Return a list with each token of `doc` lemmatized as a verb (pos='v')."""
    lemmas = []
    for token in doc:
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

def stem_text(doc, stemmer=default_stemmer):
    """Return a list with the stem of each token of `doc`, in order."""
    stems = []
    for token in doc:
        stems.append(stemmer.stem(token))
    return stems

def remove_noise(doc, noise_words=None):
    """Return the tokens of `doc` that are not noise words, in order.

    doc: iterable of tokens.
    noise_words: container of tokens to discard. When omitted, the
        module-level `noise` list is used; it is looked up at call time, so a
        list reloaded after this definition is still honoured.
    """
    if noise_words is None:
        noise_words = noise
    return [w for w in doc if w not in noise_words]

def preprocess(doc):
    """Turn one normalized title string into a list of processed tokens.

    Steps, in order: tokenize, drop stopwords, lemmatize, stem, drop tokens of
    length one or less, drop noise words.
    """
    tokens = word_tokenize(doc)
    tokens = stem_text(lemm_text(remove_stopwords(tokens)))
    # Single-character tokens are filtered after stemming, before the noise filter.
    longer_tokens = [token for token in tokens if len(token) > 1]
    return remove_noise(longer_tokens)

# Replace each normalized title in 'clean' with its list of processed tokens
# and report the elapsed wall time in minutes.
t = time()
df.clean = df.clean.apply(preprocess)
print('Time: {} mins'.format(round((time() - t) / 60, 2)))
# Last expression of the cell: display the first ten rows.
df.head(10)

Time: 1.23 mins


Unnamed: 0,id,titulo,clean,ano,wct
0,101982954114164,3D reconstruction methods for digital preserva...,"[3d, reconstruct, method, digit, preserv, cult...",2014,11
1,101982954114164,3D Viewer Software Build Based on Scanned Synt...,"[3d, viewer, softwar, build, base, scan, synth...",2016,12
2,102488447573085,The Globalization Strategy of a High-Tech Mult...,"[global, strategi, high, tech, multin, corpor,...",1996,18
3,102488447573085,Information Systems as an Instrument of Qualit...,"[inform, system, instrument, qualiti, program,...",2000,11
4,102488447573085,The Experience of a Cardiology Unit in the Dev...,"[experi, cardiolog, unit, develop, qualiti, pr...",2002,17
5,102488447573085,Information Systems as an Instrument for Quali...,"[inform, system, instrument, qualiti, program]",2002,8
6,102488447573085,Clustering and Categorization Applied to Crypt...,"[cluster, categor, appli, cryptanalysi]",2006,6
7,102488447573085,Criptoanalisys Outwit using Context Sensitive ...,"[criptoanalisi, outwit, use, context, sensit, ...",2016,6
8,102488447573085,Cryptographic Algorithm Identification Using M...,"[cryptograph, algorithm, identif, use, machin,...",2016,9
9,103102694865890,Influence of baroclinic sistems in severe rain...,"[influenc, baroclin, sistem, sever, rainstorm,...",2011,11


In [6]:
# 'wcc' holds the number of tokens left in each title after preprocessing.
# Series.map(len) works on the column directly and avoids building a row
# object for every record, as DataFrame.apply(axis=1) does.
df['wcc'] = df['clean'].map(len)
# Last expression of the cell: display the summary statistics of the counts.
df.wcc.describe()

count    172165.000000
mean          8.013772
std           2.750311
min           0.000000
25%           6.000000
50%           8.000000
75%          10.000000
max          48.000000
Name: wcc, dtype: float64

In [7]:
# Remove, in place, the rows whose token list is empty (wcc == 0), then
# renumber the index from 0 without keeping the old labels.
df.drop(df.index[df['wcc'] == 0], inplace=True)
df.reset_index(drop=True, inplace=True)

In [8]:
# Plotting helper functions

# Plot a histogram of the word counts
def graph_hist(df, year=None):
    """Save a 20-bin histogram of `df.wct` as hist.png.

    df: object exposing a `wct` attribute with the word counts to plot.
    year: when given, it is added to the title and the image is written to
        PLOT_PATH/<year>/; otherwise the image goes directly into PLOT_PATH.

    The target directory is created when it does not exist.
    """
    fig = plt.figure(figsize=(10,5))

    plt.hist(df.wct, bins=20, color='#60505C')

    plt.ylabel('Frequency', fontsize=12)
    plt.xlabel('Word Count', fontsize=12)

    if year is not None:
        plt.title('{} Docs Word Count'.format(year), fontsize=16)
        path = '{}/{}'.format(PLOT_PATH,year)
    else:
        plt.title('Docs Word Count', fontsize=16)
        path = PLOT_PATH

    # parents=True also creates PLOT_PATH itself, so saving works on a fresh
    # checkout for both the yearly and the overall plot.
    Path(path).mkdir(parents=True, exist_ok=True)

    file_name = '{}/hist.png'.format(path)
    fig.savefig(file_name, dpi=fig.dpi, bbox_inches='tight')
    # Close this figure explicitly rather than whichever figure is current.
    plt.close(fig)

# Plot a boxplot of the word counts
def graph_boxplot(df, year=None):
    """Save a vertical box plot of `df.wct` as box_plot.png.

    df: object exposing a `wct` attribute with the word counts to plot.
    year: when given, it is added to the title and the image is written to
        PLOT_PATH/<year>/; otherwise the image goes directly into PLOT_PATH.

    The target directory is created when it does not exist.
    """
    fig = plt.figure(figsize=(4,9))

    # The counts are passed as `y=` so the box is vertical regardless of how
    # the installed seaborn interprets a positional vector (some releases
    # treat it as `x`, warn, and ignore orient='v').
    sns.boxplot(y=df.wct, orient='v', width=.5, color='#ff8080')

    plt.ylabel("Word Count", fontsize=12)

    if year is not None:
        plt.title('{} Docs Word Count'.format(year), fontsize=16)
        path = '{}/{}'.format(PLOT_PATH,year)
    else:
        plt.title('Docs Word Count', fontsize=16)
        path = PLOT_PATH

    # parents=True also creates PLOT_PATH itself, so saving works on a fresh
    # checkout for both the yearly and the overall plot.
    Path(path).mkdir(parents=True, exist_ok=True)

    file_name = '{}/box_plot.png'.format(path)
    fig.savefig(file_name, dpi=fig.dpi, bbox_inches='tight')
    # Close this figure explicitly rather than whichever figure is current.
    plt.close(fig)

def get_top20(df):
    """Flatten the token lists in `df.clean` and rank the most frequent tokens.

    Returns a tuple (tokens, ranking): `tokens` is the flat list of every
    token in row order, and `ranking` is a DataFrame with columns 'word' and
    'frequency' holding at most the 20 most common tokens.
    """
    tokens = []
    for doc_tokens in df.clean:
        tokens.extend(doc_tokens)
    ranking = Counter(tokens).most_common(20)
    return tokens, pd.DataFrame(ranking, columns=['word', 'frequency'])

# Plot a bar chart for the top 20 most frequently occurring words
def graph_top20(df_top20, year=None):
    """Save a bar chart of word frequencies as top_words.png.

    df_top20: DataFrame with 'word' and 'frequency' columns, one bar per row.
    year: when given, it is added to the title and the image is written to
        PLOT_PATH/<year>/; otherwise the image goes directly into PLOT_PATH.

    The target directory is created when it does not exist.
    """
    fig = plt.figure(figsize=(20,7))
    ax = sns.barplot(x='word', y='frequency', data=df_top20, palette='GnBu_d')
    # Tilt the word labels so neighbouring labels do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(),rotation=45,fontsize=14)

    plt.yticks(fontsize=14)
    plt.xlabel('Words', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)

    if year is not None:
        plt.title('{} Top 20 Words'.format(year), fontsize=17)
        path = '{}/{}'.format(PLOT_PATH,year)
    else:
        plt.title('Top 20 Words', fontsize=17)
        path = PLOT_PATH

    # parents=True also creates PLOT_PATH itself, so saving works on a fresh
    # checkout for both the yearly and the overall plot.
    Path(path).mkdir(parents=True, exist_ok=True)

    file_name = '{}/top_words.png'.format(path)
    fig.savefig(file_name, dpi=fig.dpi, bbox_inches='tight')
    # Close this figure explicitly rather than whichever figure is current.
    plt.close(fig)


In [9]:
# Save

# Start of the timer reported at the end of this cell.
t = time()
# Persist the full processed frame as an xz-compressed pickle before plotting.
pre_file = '{}/02pre_lattes.pkl.xz'.format(DATA_PATH)
df.to_pickle(pre_file, compression='xz')

# Get the top 20 most common words among all the articles
p_text, df_top20 = get_top20(df)

# Whole-corpus plots: no year is passed, so they are saved directly under PLOT_PATH.
graph_hist(df)
graph_boxplot(df)
graph_top20(df_top20)

# Get the number of unique words after processing
#num_unique_words = len(set(p_text))
#num_unique_words
# Release the flattened token list and the ranking before the per-year pass.
del p_text, df_top20

# One set of plots and one pickle for each value of 'ano' from 2008 to 2018.
for year in range(2008,2019):

    # drop() is applied to the filtered result and returns a new frame, so the
    # slice of `df` is never modified in place (this avoids pandas'
    # SettingWithCopyWarning).
    df_year = df.loc[df['ano']==year].drop(['ano'], axis=1)

    # Only the ranking is needed here; the flattened token list is discarded.
    df_top20 = get_top20(df_year)[1]

    graph_hist(df_year,year)
    graph_boxplot(df_year,year)
    graph_top20(df_top20,year)

    pkl_file = '{}/{}_lattes.pkl.xz'.format(DATA_PATH,year)
    df_year.to_pickle(pkl_file, compression='xz')
    # Release the yearly frame and ranking before the next iteration.
    del df_year, df_top20

print('Time: {} mins'.format(round((time() - t) / 60, 2)))

Time: 0.83 mins
