# Imports and Functions

In [1]:
import pandas as pd
from time import time

import visuals as vs 

%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    Parameters
    ----------
    history : object
        Any object exposing a ``history`` mapping of metric name -> list of
        per-epoch values (presumably what Keras ``model.fit`` returns --
        confirm against the caller). The mapping must contain ``loss`` and
        ``val_loss`` plus an accuracy series stored either as
        ``acc``/``val_acc`` or as ``accuracy``/``val_accuracy``.

    Raises
    ------
    KeyError
        If a required series is missing from ``history.history``.
    """
    metrics = history.history

    # The accuracy series may be stored under the short or the long name
    # depending on how the metric was declared; accept both so the helper
    # does not fail with KeyError on the long spelling.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']

    # Epochs are numbered from 1 on the x axis.
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

# Load Data

In [3]:
# Load the feature-engineered dataset; the path is relative to the notebook's
# working directory.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index
# that was saved into the CSV -- consider `index_col=0`; confirm against how
# the file was written.
df_data = pd.read_csv('../data/data_feat.csv')
# Preview the first rows as a sanity check.
df_data.head()

Unnamed: 0,SUBJECT,AUTHOR,TEXT,TEXT_CLEAN,NUM_STOPWORDS,NUM_PUNCTUATIONS,NUM_WORDS_UPPER,NUM_CHARS,NUM_WORDS,AVG_WORD
0,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,19/03/2009\nFATO CORRIQUEIRO\nO fato mais impo...,19 03 2009 fato corriqueiro fato importante se...,67,29,5,1375,199,5.787129
1,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,06/07/2009\nPOLÍTICA DE ESTADO\nUm problema cr...,06 07 2009 politica estado problema cronico po...,75,29,11,1342,207,5.449275
2,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,12/07/2009\nTOQUE DE RECOLHER\nProcurado para ...,12 07 2009 toque recolher procurado suspender ...,95,46,11,1587,257,5.180392
3,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,23/03/2009\nCRISE SOCIOEDUCATIVA\nOcorreram du...,23 03 2009 crise socioeducativa ocorreram duas...,55,26,6,1141,167,5.761905
4,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,24/12/2008\nESTATÍSTICAS CRIMINAIS CONFIÁVEIS\...,24 12 2008 estatisticas criminais confiaveis b...,133,43,17,2309,366,5.246612


# Prep Data for FastText

In [4]:
# Cast the two label columns to pandas categoricals; the `.cat` accessor used
# later to list the author categories requires this dtype.
df_data['SUBJECT'] = df_data['SUBJECT'].astype('category')
df_data['AUTHOR'] = df_data['AUTHOR'].astype('category')

In [8]:
# Integer label per row: the categorical code of its AUTHOR value.
# These codes are what the export cells write after the `__label__` prefix.
df_data['AUTHOR_CODE'] = df_data['AUTHOR'].astype('category').cat.codes

In [9]:
# Preview again to confirm the AUTHOR_CODE column was added.
df_data.head()

Unnamed: 0,SUBJECT,AUTHOR,TEXT,TEXT_CLEAN,NUM_STOPWORDS,NUM_PUNCTUATIONS,NUM_WORDS_UPPER,NUM_CHARS,NUM_WORDS,AVG_WORD,AUTHOR_CODE
0,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,19/03/2009\nFATO CORRIQUEIRO\nO fato mais impo...,19 03 2009 fato corriqueiro fato importante se...,67,29,5,1375,199,5.787129,62
1,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,06/07/2009\nPOLÍTICA DE ESTADO\nUm problema cr...,06 07 2009 politica estado problema cronico po...,75,29,11,1342,207,5.449275,62
2,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,12/07/2009\nTOQUE DE RECOLHER\nProcurado para ...,12 07 2009 toque recolher procurado suspender ...,95,46,11,1587,257,5.180392,62
3,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,23/03/2009\nCRISE SOCIOEDUCATIVA\nOcorreram du...,23 03 2009 crise socioeducativa ocorreram duas...,55,26,6,1141,167,5.761905,62
4,ASSUNTOS VARIADOS,LUIZ FLAVIO SAPORI,24/12/2008\nESTATÍSTICAS CRIMINAIS CONFIÁVEIS\...,24 12 2008 estatisticas criminais confiaveis b...,133,43,17,2309,366,5.246612,62


In [10]:
# Lookup table from integer author code to author name, built from the
# ordered category list of the AUTHOR categorical.
classes = {
    code: author
    for code, author in enumerate(df_data['AUTHOR'].cat.categories)
}
classes

{0: 'ACÍLIO LARA REZENDE',
 1: 'ADRIANO GAMBARINI',
 2: 'ALESSANDRA BLANCO',
 3: 'ALEXANDRE MAGALHÃES',
 4: 'ANA CRISTINA CAVALCANTE',
 5: 'ANDRE RIBEIRO',
 6: 'ANDREA KAUFMANN',
 7: 'ANTONIO PIETROBELLI',
 8: 'ARNALDO JABOR',
 9: 'AUGUSTO MAFUZ',
 10: 'BADGER VICARI',
 11: 'BENEDICTO DUTRA',
 12: 'BOLESLAU SLIVIANY',
 13: 'CARLA KREEFT',
 14: 'CARLOS BERTOLAZZI',
 15: 'CARLOS BRICKMANN',
 16: 'CARLOS SARLI',
 17: 'CARLOS ZAMITH JUNIOR',
 18: 'CECILIA GIANNETTI',
 19: 'CEZAR TAURION',
 20: 'CILMARA CASTILHO',
 21: 'CLAUDIO GRADILONE',
 22: 'CLAUDIO HUMBERTO',
 23: 'CLAUDIO LIMA',
 24: 'CLAUDIO SCHAMIS',
 25: 'DENNY ROGER',
 26: 'DIOGO OLIVIER',
 27: 'DRAUZIO VARELA',
 28: 'EDUARDO TUDE',
 29: 'EWANDRO SCHENKEL',
 30: 'FABIO CAMPANA',
 31: 'FABIO CAMPOS',
 32: 'FABIO CESAR DOS SANTOS',
 33: 'FABIO TOKARS',
 34: 'FABIO ZANINI',
 35: 'FERNANDA ARANDA',
 36: 'FERNANDO BIRMAN',
 37: 'FERNANDO CANZIAN',
 38: 'FERNANDO CESAR FARIA',
 39: 'FERNANDO MONTEIRO',
 40: 'FLAVIO SETTANNI',
 41: 'FRED

fastText requires a text file with each piece of text on a line by itself. The beginning of each line needs to have a special prefix of `__label__YOURLABEL` that assigns the label to that piece of text.

In [14]:
from pathlib import Path

# Export the whole dataset in fastText's supervised format: one document per
# line, prefixed with `__label__<AUTHOR_CODE>`.
# The original path literal ended with a stray space, which created a file
# literally named "fasttext_dataset.txt " on disk.
fasttext_data = Path('../data/fasttext_dataset.txt')

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding.
with fasttext_data.open("w", encoding="utf-8") as output:
    # Iterate the two needed columns directly instead of materialising a
    # Series per row with iterrows().
    for label, text in zip(df_data['AUTHOR_CODE'], df_data['TEXT_CLEAN']):
        output.write("__label__{} {}\n".format(label, text))

# Split Dataset

In [17]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split on AUTHOR with a fixed seed for reproducibility.
# The feature frame keeps AUTHOR_CODE next to the text because the fastText
# export needs the label on every row.
X_train, X_test, y_train, y_test = train_test_split(
    df_data[['TEXT_CLEAN', 'AUTHOR_CODE']],
    df_data['AUTHOR'],
    test_size=0.2,
    stratify=df_data['AUTHOR'],
    random_state=42
)

In [20]:
from pathlib import Path
import random

def _write_fasttext_file(frame, path):
    """Write one fastText-formatted line per row of `frame` to `path`.

    Each line has the form ``__label__<AUTHOR_CODE> <TEXT_CLEAN>``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide ``AUTHOR_CODE`` and ``TEXT_CLEAN`` columns.
    path : pathlib.Path
        Destination file; it is created or overwritten.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding.
    with path.open("w", encoding="utf-8") as output:
        for label, text in zip(frame['AUTHOR_CODE'], frame['TEXT_CLEAN']):
            output.write("__label__{} {}\n".format(label, text))


training_data = Path("../data/fasttext_dataset_training.txt")
test_data = Path("../data/fasttext_dataset_test.txt")

# Same export for both splits; only the source frame and target file differ.
_write_fasttext_file(X_train, training_data)
_write_fasttext_file(X_test, test_data)

# Install FastText

In [9]:
# Fetch the fastText sources; git creates a `fastText` directory under the
# current working directory.
! git clone https://github.com/facebookresearch/fastText.git

Cloning into 'fastText'...
remote: Enumerating objects: 2, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3197 (delta 0), reused 1 (delta 0), pack-reused 3195[K
Receiving objects: 100% (3197/3197), 7.84 MiB | 616.00 KiB/s, done.
Resolving deltas: 100% (2007/2007), done.


In [18]:
import os

# Enter the fastText checkout so the build and `fasttext` cells run there.
# Guarded so that re-running this cell is harmless: an unconditional chdir
# fails (or descends into a nested `fastText` directory) when the kernel is
# already inside the checkout.
if os.path.basename(os.getcwd()) != 'fastText':
    os.chdir('fastText')

In [15]:
# Build fastText with the Makefile in the current working directory (the
# kernel must already be inside the fastText checkout).
! make

c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/args.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/matrix.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/dictionary.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/loss.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/productquantizer.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/densematrix.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/quantmatrix.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/vector.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/model.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/utils.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/meter.cc
c++ -pthread -std=c++0x -march=native -O3 -funroll-loops -c src/fasttext.cc
      selectEmbeddings is being deprecated. [-Wdeprecated-declarations][0m
    auto idx = se

# Train Model

In [20]:
# Show the working directory and its contents to confirm the build produced
# the `fasttext` executable.
! pwd
! ls

/Users/thiagovieira/Documents/GitHub/aa-udacity-capstone/notebooks/fastText
CMakeLists.txt            dictionary.o              quantmatrix.o
CODE_OF_CONDUCT.md        [1m[36mdocs[m[m                      runtests.py
CONTRIBUTING.md           eval.py                   [1m[36mscripts[m[m
LICENSE                   [31mfasttext[m[m                  setup.cfg
MANIFEST.in               fasttext.o                setup.py
Makefile                  [31mget-wikimedia.sh[m[m          [1m[36msrc[m[m
README.md                 loss.o                    [1m[36mtests[m[m
[1m[36malignment[m[m                 matrix.o                  utils.o
args.o                    meter.o                   vector.o
[31mclassification-example.sh[m[m model.o                   [1m[36mwebsite[m[m
[31mclassification-results.sh[m[m productquantizer.o        wikifil.pl
[1m[36mcrawl[m[m                     [1m[36mpython[m[m                    [31mword-vector-example.sh[m[m
den

In [22]:
# Train a supervised classifier with default hyper-parameters; writes
# authors_model.bin / authors_model.vec in the current directory.
# The built executable is `fasttext` (lowercase); `./fastText` only resolves
# on case-insensitive filesystems.
# NOTE(review): the recorded final loss (~4.63) is close to ln(100) ~= 4.61,
# i.e. near a uniform guess over 100 labels -- consider tuning -epoch / -lr.
! ./fasttext supervised -input ../../data/fasttext_dataset_training.txt -output authors_model

Read 0M words
Number of words:  62623
Number of labels: 100
Progress: 100.0% words/sec/thread:  390279 lr:  0.000000 loss:  4.628903 ETA:   0h 0m


In [23]:
# Evaluate the baseline model on the held-out file (prints N, P@1, R@1).
# The built executable is `fasttext` (lowercase); `./fastText` only resolves
# on case-insensitive filesystems.
! ./fasttext test authors_model.bin ../../data/fasttext_dataset_test.txt

N	601
P@1	0.324
R@1	0.324


# Ngrams

In [24]:
# Same training run with word bigram features added (-wordNgrams 2).
# The built executable is `fasttext` (lowercase); `./fastText` only resolves
# on case-insensitive filesystems.
! ./fasttext supervised -input ../../data/fasttext_dataset_training.txt -output authors_model_ngrams -wordNgrams 2

Read 0M words
Number of words:  62623
Number of labels: 100
Progress: 100.0% words/sec/thread:  232783 lr:  0.000000 loss:  4.628904 ETA:   0h 0m


In [25]:
# Evaluate the bigram model on the same held-out file (prints N, P@1, R@1).
# The built executable is `fasttext` (lowercase); `./fastText` only resolves
# on case-insensitive filesystems.
! ./fasttext test authors_model_ngrams.bin ../../data/fasttext_dataset_test.txt

N	601
P@1	0.201
R@1	0.201
