In [None]:
%tensorflow_version 1.x
# --- Dataset identifiers -----------------------------------------------------
# SIZE and C are interpolated into the holdout file names
# ("ideacao-<SIZE>-<C>-train.csv" / "...-test.csv") and into the results file name.
SIZE = "2k4"
C = "b"

# --- Filesystem locations (placeholders; every path must end with "/") -------
FOLDER = "/"       # path to model folder
FOLDS_PATH = "/"   # path where the folds are saved
DATA_FOLDER = "/"  # path where the entire dataset is saved

# --- Training ----------------------------------------------------------------
# NOTE(review): BATCH_SIZE is not referenced by the cells visible in this notebook.
BATCH_SIZE = 32

# --- Labels --------------------------------------------------------------------
# Display names keyed by the number of classes in the task (2 or 3).
CLASS_NAME = {
    2: ["Sem Risco", "Risco Potencial"],
    3: ["Sem Risco", "Risco Potencial", "Risco Alto"],
}

In [None]:
import numpy as np 
import pandas as pd 
import os
from pathlib import Path
from fastai.text import *
from time import ctime


In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): presumably FOLDER, FOLDS_PATH and DATA_FOLDER point below this
# mount point — confirm when filling in the placeholder paths.
from google.colab import drive
drive.mount('/content/drive')

# Loading the dataset and running cross-validation

In [None]:
# Cross-validation over pre-generated folds.
# Fold i is a pair of tab-separated files, train<i>.csv / test<i>.csv, located in
# FOLDS_PATH; each file provides a 'text' column and a 'y' label column.

def _fold_csv(split, index):
  """Return the path of the `split` ("train" or "test") CSV of fold `index`."""
  return os.path.join(FOLDS_PATH, "{}{}.csv".format(split, index))


# Count folds by probing for consecutive train<i>/test<i> pairs rather than
# halving the number of files in the directory: stray files would otherwise
# inflate the count and make the loop try to read a fold that does not exist.
folds = 0
while os.path.isfile(_fold_csv("train", folds)) and os.path.isfile(_fold_csv("test", folds)):
  folds += 1
if folds == 0:
  raise FileNotFoundError(
      "No train0.csv/test0.csv pair found in FOLDS_PATH={!r}".format(FOLDS_PATH))
print("Training on {} folds".format(folds))

# Per-fold validation metrics, one entry per fold.
val_acc = []
val_prec = []
val_rec = []
val_f1 = []

# os.path.join keeps the encoder path valid whether or not FOLDER ends with "/".
encoder_name = os.path.join(FOLDER, 'language_model')

for fold in range(folds):
  df_train = pd.read_csv(_fold_csv("train", fold), sep="\t", index_col=False)
  df_valid = pd.read_csv(_fold_csv("test", fold), sep="\t", index_col=False)
  print("Fold", fold, '\n')

  # Language-model data bunch: used to build the vocabulary and the encoder.
  data_lm = TextLMDataBunch.from_df(path=FOLDER,
                                    train_df=df_train,
                                    valid_df=df_valid,
                                    label_cols='y',
                                    text_cols='text')

  # No language-model training step is run before the encoder is saved.
  # NOTE(review): confirm that skipping LM fine-tuning is intentional.
  lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)
  lm.save_encoder(encoder_name)

  # Classifier data bunch shares the language model's training vocabulary so the
  # saved encoder's embeddings line up with the classifier's token ids.
  data_clas = TextClasDataBunch.from_df(path=FOLDER,
                                        train_df=df_train,
                                        valid_df=df_valid,
                                        vocab=data_lm.train_ds.vocab,
                                        label_cols='y',
                                        text_cols='text')

  learner = text_classifier_learner(data_clas,
                                    AWD_LSTM,
                                    drop_mult=0.3,
                                    metrics=[accuracy,
                                             Precision(average="macro"),
                                             Recall(average="macro"),
                                             FBeta(average="macro", beta=1)])
  learner.load_encoder(encoder_name)
  # learner.lr_find()
  # learner.recorder.plot()
  learner.fit_one_cycle(5, max_lr=1e-2)

  # validate() yields the loss followed by the metrics in the order given above.
  loss_value, acc_value, prec_value, rec_value, f1_value = learner.validate()
  val_acc.append(acc_value.item())
  val_prec.append(prec_value.item())
  val_rec.append(rec_value.item())
  val_f1.append(f1_value.item())

# Classificador com Holdout

In [None]:
# Holdout evaluation: train on the full training split and validate on the
# held-out test split of the "ideacao-<SIZE>-<C>" dataset.

# Free the last cross-validation learner when this cell runs after the CV cell;
# the guard lets the cell also run on its own in a fresh kernel.
if "learner" in globals():
  learner.destroy()

train_df = pd.read_csv(os.path.join(DATA_FOLDER, "ideacao-{}-{}-train.csv".format(SIZE, C)),
                       sep="\t", index_col=False)
test_df = pd.read_csv(os.path.join(DATA_FOLDER, "ideacao-{}-{}-test.csv".format(SIZE, C)),
                      sep="\t", index_col=False)

# Number of distinct labels in the held-out split.
CLASSES = len(test_df['y'].unique())

# os.path.join keeps these paths valid whether or not FOLDER ends with "/".
encoder_name = os.path.join(FOLDER, 'language_model')

# Use the holdout frames loaded above. The previous code passed df_train/df_valid,
# i.e. the last cross-validation fold, so the "holdout" score was just a re-run of
# that fold and train_df/test_df were never used.
data_lm = TextLMDataBunch.from_df(path=FOLDER,
                                  train_df=train_df,
                                  valid_df=test_df,
                                  label_cols='y',
                                  text_cols='text')

data_lm.save(os.path.join(FOLDER, 'data_lm.pkl'))

# No language-model training step is run before the encoder is saved.
# NOTE(review): confirm that skipping LM fine-tuning is intentional.
lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)
lm.save_encoder(encoder_name)

# The classifier reuses the language model's training vocabulary so the saved
# encoder's embeddings line up with the classifier's token ids.
data_clas = TextClasDataBunch.from_df(path=FOLDER,
                                      train_df=train_df,
                                      valid_df=test_df,
                                      vocab=data_lm.train_ds.vocab,
                                      label_cols='y',
                                      text_cols='text')
h_learner = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.3,
                                    metrics=[accuracy, Precision(average="macro"),
                                             Recall(average="macro"),
                                             FBeta(average="macro", beta=1)])
h_learner.load_encoder(encoder_name)

h_learner.fit_one_cycle(5, max_lr=1e-2)
# validate() yields the loss followed by the metrics in the order given above.
loss, acc, prec, rec, f1 = h_learner.validate()

# Salvando resultados em arquivo

In [None]:
# Write a small CSV summarising the run: one row with the mean cross-validation
# metrics and one row ('-' in the folds column) with the holdout metrics.

# Timestamp the file name; spaces from ctime() are replaced so the name has none.
created_at = ctime().replace(' ', '-')
name = "{}-{}-{}-{}-results.csv".format("LSTM", SIZE, C, created_at)

# Create the output directory on first use; open() would otherwise fail with
# FileNotFoundError when FOLDER has no 'results' subfolder yet.
results_dir = os.path.join(FOLDER, 'results')
os.makedirs(results_dir, exist_ok=True)

# Convert the holdout metrics to plain floats. The CV metrics are already floats
# (the CV loop calls .item()), while these are written as returned by validate()
# and would otherwise be formatted with their object repr instead of the number.
holdout_metrics = [float(metric) for metric in (acc, prec, rec, f1)]

with open(os.path.join(results_dir, name), 'w') as results_file:
  # NOTE: the header keeps its spaces after the commas so files stay comparable
  # with previously written results; strip column names when parsing.
  results_file.write("folds, acc, prec, rec, f1\n")
  results_file.write("{},{},{},{},{}\n".format(
      folds, np.mean(val_acc), np.mean(val_prec), np.mean(val_rec), np.mean(val_f1)))
  results_file.write("{},{},{},{},{}\n".format('-', *holdout_metrics))