# Transfromer test

In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transforme

## Liberias

In [5]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Optional
from tensorflow.keras import models
from transformers import (DistilBertConfig,
                          DistilBertTokenizer,
                          TFDistilBertForSequenceClassification)

In [4]:
os.chdir("/content/drive/MyDrive/GovContracts")

## Encoders

In [7]:
encoder = pickle.load(open('sell2sam/encoder.pkl', 'rb'))

## Model

### Transfer learning

### Tokenizer

In [10]:
tokenizer = DistilBertTokenizer(vocab_file = 'sell2sam/vocab.txt',
                                tokenizer_file = 'sell2sam/tokenizer.json',
                                tokenizer_config = 'sell2sam/tokenizer_config.json',
                                special_tokens_map = 'sell2sam/special_tokens_map.json',
                                do_lower_case = True,
                                add_special_tokens = True,
                                max_length = 512
                                )

In [11]:
def tokenize(sentences: List[str], tokenizer):

  inputs_id, inputs_masks = list(), list()

  for sentence in tqdm(sentences):

    inputs = tokenizer.encode_plus(sentence, add_special_tokens = True,
                                   max_length = 512, return_token_type_ids = True,
                                   return_attention_mask = True,
                                   pad_to_max_length = True)
    
    inputs_id.append(inputs["input_ids"])

    inputs_masks.append(inputs['attention_mask'])

  return np.asarray(inputs_id, dtype='int32'), \
         np.asarray(inputs_masks, dtype='int32') 

### Transformer

In [14]:
config = DistilBertConfig(num_labels = 1010)

config.output_hidden_states = False

transformer_model = TFDistilBertForSequenceClassification(config = config)

In [15]:
def build_model(file: Optional[str]= "model.json"):

  loaded_model = models.load_model("sell2sam/sell2sam.h5",
                                   custom_objects = {"TFDistilBertForSequenceClassification":
                                                     transformer_model})
  
  loaded_model.load_weights("sell2sam/sell2sam_weights.h5")

  return loaded_model

model = build_model()

## Test

In [17]:
keywords = "software"

inputs_id_test, inputs_masks_inputs_id_test = tokenize([keywords],
                                                       tokenizer)

100%|██████████| 1/1 [00:00<00:00, 1069.16it/s]


In [19]:
preds = model.predict([inputs_id_test, inputs_masks_inputs_id_test])

y_preds = np.argsort(preds, axis=1)[:,-5:]

y_preds = y_preds[0]

print(f"Top 5 results: { ', '.join(encoder.inverse_transform(y_preds))}")

Top 5 results: 924110, 518210, 423430, 541511, 513210
