In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
import torch

from src.models_classical.utils import (
    under_over_sampler,
    scale_data,
    calculate_scores,
    collate_scores_binary_classification,
    get_classifier_and_params,
    get_model_metrics_df,
)

from transformers import BertModel, BertTokenizer, AutoModel

from src.visualization.visualize import plot_pr_roc_curves_kfolds
from src.features.make_bert_embeddings import (create_single_embedding, create_batch_embeddings)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
proj_dir = Path().cwd().parent.parent
print(proj_dir)

path_emb_dir = proj_dir / "data" / "processed" / "embeddings"
path_model_dir = proj_dir / "models" / "final_results_classical" / "model_files"
path_label_dir = proj_dir / "data" / "interim"

/home/tim/Documents/arxiv-code-search


In [6]:
model_file = "model_14438904_rf_2022-06-08-0020-00_papers1.pkl"
scaler_file = "scaler_14438904_rf_2022-06-08-0020-00_papers1.pkl"

# load sklearn scaler from scaler file
with open(path_model_dir / scaler_file, "rb") as f:
    scaler = pickle.load(f)

# load the model
with open(path_model_dir / model_file, "rb") as f:
    model = pickle.load(f)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

scratch_path = Path.home() / "scratch"
if scratch_path.exists():
    tokenizer = BertTokenizer.from_pretrained(proj_dir / "bert_cache_dir")
    bert_model = AutoModel.from_pretrained(proj_dir / "bert_cache_dir")
else:
    tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
    bert_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

bert_model.to(device)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31090, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [23]:
df = pd.read_excel(
    path_label_dir / "labels_to_not_include_in_final" / "labels_3.ods",
    parse_dates=["update_date"],
    engine="odf",
    dtype={"id": str},
    )
df.head()

Unnamed: 0,id,pattern,token_count,update_date,label,para
0,1602.06797,data,6,NaT,0.0,of data with given labels.
1,1602.06797,data,9,NaT,0.0,Guan. sparse co-occurrence data. Computer Soci...
2,1602.06797,data,14,NaT,0.0,"Figure 5: Inﬂuence of labeled data, where the ..."
3,1602.06797,data,20,NaT,0.0,"In other words, µk is equal to the mean of all..."
4,1602.06797,data,25,NaT,0.0,metric-learn-ave-vec also uses the metric lear...


In [9]:
features = create_batch_embeddings(df, bert_model, tokenizer, device)

No. para: 0
No. para: 100
No. para: 200
No. para: 300
No. para: 400
No. para: 500
No. para: 600
No. para: 700
No. para: 800
No. para: 900
No. para: 1000
No. para: 1100
No. para: 1200
No. para: 1300
No. para: 1400
No. para: 1500
No. para: 1600
No. para: 1700
No. para: 1800
No. para: 1900


In [11]:
features.shape

(1990, 768)

In [12]:
features = scaler.transform(features)

In [16]:
probabilities = model.predict_proba(features)
predictions = model.predict(features)

In [22]:
model.classes_

array([0, 1])

In [24]:
# create new column with predictions
df["prediction"] = predictions

# create new columns with probabilities, one for each probability
for i, col in enumerate(model.classes_):
    df[f"class_{col}"] = probabilities[:, i]



In [25]:
df.head()

Unnamed: 0,id,pattern,token_count,update_date,label,para,prediction,class_0,class_1
0,1602.06797,data,6,NaT,0.0,of data with given labels.,0,0.916827,0.083173
1,1602.06797,data,9,NaT,0.0,Guan. sparse co-occurrence data. Computer Soci...,0,0.950288,0.049712
2,1602.06797,data,14,NaT,0.0,"Figure 5: Inﬂuence of labeled data, where the ...",0,0.912281,0.087719
3,1602.06797,data,20,NaT,0.0,"In other words, µk is equal to the mean of all...",0,0.9576,0.0424
4,1602.06797,data,25,NaT,0.0,metric-learn-ave-vec also uses the metric lear...,0,0.849305,0.150695


In [26]:
# save df as an .ods file
df.to_excel(
    path_label_dir / "labels_to_not_include_in_final" / "labels_3_with_predictions.ods",
    engine="odf",
    index=False,
    )