# Analyze training result

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from collections import defaultdict
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

plt.style.use('ggplot')

import datetime as dt
import re
import pickle
from tqdm import tqdm_notebook as tqdm
import os
import sys
import time
import logging
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)

from transformers import *

ModuleNotFoundError: No module named 'torch'

## Load saved state and predict new text

In [11]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False
)

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [13]:
model.load_state_dict(torch.load('model_statement_fold_1.bin', map_location=torch.device('cpu')))
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def bert_encoder(text, max_len=512):
    """ Return embedded text vector as a list in max_len with a mask list"""
    text_token = tokenizer.tokenize(text)
    text_token = text_token[:max_len-2]
    text_token = ["[CLS]"] + text_token + ["[SEP]"]
    text_ids = tokenizer.convert_tokens_to_ids(text_token)
    text_ids += [0] * (max_len - len(text_token))
    pad_masks = [1] * len(text_token) + [0] * (max_len - len(text_token))
    segment_ids = [0] * len(text_token) + [0] * (max_len - len(text_token))
    
    return text_ids, pad_masks, segment_ids

In [14]:
file = open("../data/FOMC/train_all_df.pickle", "rb")
df = pickle.load(file)
file.close()

In [36]:
test_text = df.loc[df['type']=='speech']['text'].iloc[-1]
tokenizer.tokenize(test_text)

['good',
 'morning',
 '.',
 'i',
 'am',
 'pleased',
 'to',
 'be',
 'here',
 'at',
 'the',
 'urban',
 'institute',
 'to',
 'discuss',
 'how',
 'to',
 'strengthen',
 'the',
 'community',
 'rein',
 '##ves',
 '##tment',
 'act',
 '(',
 'cr',
 '##a',
 ')',
 ',',
 'which',
 'is',
 'a',
 'key',
 'priority',
 'for',
 'the',
 'federal',
 'reserve',
 '.',
 'the',
 'cr',
 '##a',
 'plays',
 'a',
 'vital',
 'role',
 'in',
 'bringing',
 'banks',
 'together',
 'with',
 'community',
 'members',
 ',',
 'small',
 'businesses',
 ',',
 'local',
 'officials',
 ',',
 'and',
 'community',
 'groups',
 'to',
 'make',
 'investments',
 'in',
 'their',
 'community',
 "'",
 's',
 'future',
 '.',
 '1',
 'that',
 'is',
 'why',
 'we',
 'are',
 'committed',
 'to',
 'getting',
 'cr',
 '##a',
 'reform',
 'done',
 'right',
 '.',
 'the',
 'origins',
 'and',
 'purpose',
 'of',
 'the',
 'cr',
 '##aan',
 '##y',
 'successful',
 'reform',
 'must',
 'be',
 'grounded',
 'in',
 'the',
 'origins',
 'of',
 'the',
 'cr',
 '##a',
 'an

In [48]:
input_ids, masks, segment_ids = bert_encoder(test_text)
input_ids = torch.tensor(input_ids).unsqueeze(0)
output = model(input_ids)[0].detach()
F.softmax(output, dim=1).numpy()[0]

array([0.12596154, 0.06326203, 0.8107765 ], dtype=float32)

## DataFrame with Prediction

In [49]:
file = open("trained_df.pickle", 'rb')
trained_df = pickle.load(file)

trained_df.tail(10)

Unnamed: 0,date,rate,speaker,target,text,type,target_lower,target_hold,target_raise,word_count,pred_target,pred_target_lower,pred_target_hold,pred_target_raise
717,2019-07-31,2.0,Jerome Powell,0,stands ready to adjust the details of these pl...,statement,1,0,0,197,1,0.086087,0.871317,0.042596
718,2019-07-31,2.0,Jerome Powell,0,that could adversely affect policy implementat...,statement,1,0,0,188,0,0.454112,0.283197,0.26269
719,2019-09-18,1.75,Jerome Powell,0,Information received since the Federal Open Ma...,statement,1,0,0,200,1,0.032904,0.802003,0.165094
720,2019-09-18,1.75,Jerome Powell,0,to 2-1/4 percent. This action supports the Com...,statement,1,0,0,198,2,0.19093,0.132232,0.676837
721,2019-10-11,1.75,Jerome Powell,1,Information received since the Federal Open Ma...,statement,0,1,0,200,1,0.033292,0.787344,0.179365
722,2019-10-11,1.75,Jerome Powell,1,action supports the Committee's view that sust...,statement,0,1,0,193,1,0.029169,0.920777,0.050054
723,2019-10-30,1.5,Jerome Powell,0,Information received since the Federal Open Ma...,statement,1,0,0,200,0,0.677126,0.118452,0.204422
724,2019-10-30,1.5,Jerome Powell,0,action supports the Committee's view that sust...,statement,1,0,0,199,1,0.052474,0.826548,0.120978
725,2019-12-11,1.5,Jerome Powell,1,Information received since the Federal Open Ma...,statement,0,1,0,201,1,0.045389,0.890039,0.064572
726,2020-01-29,1.5,Jerome Powell,1,Information received since the Federal Open Ma...,statement,0,1,0,200,1,0.018971,0.919591,0.061438


In [52]:
sum(trained_df['target']==trained_df['pred_target'])

646

In [53]:
sum(trained_df['target']==trained_df['pred_target']) / len(trained_df)

0.8885832187070152