# training the model

In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel,NERArgs

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method ="ffill")

  data = data.fillna(method ="ffill")


In [10]:
data.head(54)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [11]:
data["Sentence #"] = LabelEncoder().fit_transform(data["Sentence #"] )

In [12]:
data.head(54)

Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O
5,0,through,IN,O
6,0,London,NNP,B-geo
7,0,to,TO,O
8,0,protest,VB,O
9,0,the,DT,O


In [14]:
data.rename(columns={"Sentence #":"sentence_id","Word":"words","Tag":"labels"}, inplace =True)

In [15]:
data["labels"] = data["labels"].str.upper()

In [16]:
X = data[["sentence_id","words"]]
Y = data["labels"]

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [18]:
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [20]:
label = data["labels"].unique().tolist()
label

['O',
 'B-GEO',
 'B-GPE',
 'B-PER',
 'I-GEO',
 'B-ORG',
 'I-ORG',
 'B-TIM',
 'B-ART',
 'I-ART',
 'I-PER',
 'I-GPE',
 'I-TIM',
 'B-NAT',
 'B-EVE',
 'I-EVE',
 'I-NAT']

In [21]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

In [23]:
model_bert = NERModel('bert', 'bert-base-cased', labels=label, args=args)

model.safetensors: 100%|██████████| 436M/436M [00:11<00:00, 36.7MB/s] 
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
tokenizer_config.json: 100%|██████████| 49.0/49.0 [00:00<?, ?B/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 527kB/s]
tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 712kB/s]


In [24]:
model_bert.train_model(train_data, eval_data = test_data, acc=accuracy_score)

100%|██████████| 29/29 [00:25<00:00,  1.16it/s]
Epochs 1/1. Running Loss:    0.1782: 100%|██████████| 1499/1499 [11:08<00:00,  2.24it/s]
Epoch 1 of 1: 100%|██████████| 1/1 [11:11<00:00, 671.75s/it]


(1499, 0.1918389263592059)

In [25]:
result, model_outputs, preds_list = model_bert.eval_model(test_data)
result

100%|██████████| 29/29 [00:22<00:00,  1.27it/s]
Running Evaluation: 100%|██████████| 1461/1461 [05:16<00:00,  4.62it/s]


{'eval_loss': 0.17083822267256232,
 'precision': 0.8248486516235554,
 'recall': 0.7668937286130033,
 'f1_score': 0.7948161283372831}

# multimodal data preprocessing

### text extraction

In [31]:
import easyocr
reader = easyocr.Reader(['en']) # english

Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

In [32]:
extracted = reader.readtext('img.png')

In [33]:
extract = ""

for (bbox, text, prob) in extracted:
    extract = extract + text
    extract = extract + " "

In [34]:
extract

'"Our greatest weakness lies in giving up: The most certain way to succeed is always to try just one more time:\' Thomas Edison Forbes '

### image captioning

In [35]:
import matplotlib.pyplot as plt
import keras_ocr
import cv2
import math
import numpy as np

In [36]:
def midpoint(x1, y1, x2, y2):
    x_mid = int((x1 + x2)/2)
    y_mid = int((y1 + y2)/2)
    return (x_mid, y_mid)

In [37]:
def inpaint_text(img_path, pipeline):
    img = keras_ocr.tools.read(img_path)
    prediction_groups = pipeline.recognize([img])

    mask = np.zeros(img.shape[:2], dtype="uint8")
    for box in prediction_groups[0]:
        x0, y0 = box[1][0]
        x1, y1 = box[1][1]
        x2, y2 = box[1][2]
        x3, y3 = box[1][3]

        x_mid0, y_mid0 = midpoint(x1, y1, x2, y2)
        x_mid1, y_mi1 = midpoint(x0, y0, x3, y3)

        thickness = int(math.sqrt( (x2 - x1)**2 + (y2 - y1)**2 ))

        cv2.line(mask, (x_mid0, y_mid0), (x_mid1, y_mi1), 255,
        thickness)
        inpainted_img = cv2.inpaint(img, mask, 7, cv2.INPAINT_NS)

    return(inpainted_img)

In [39]:
pipeline = keras_ocr.pipeline.Pipeline()

image_textless = inpaint_text('img.png', pipeline)

Looking for C:\Users\varun\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\varun\.keras-ocr\crnn_kurapan.h5
Downloading C:\Users\varun\.keras-ocr\crnn_kurapan.h5


In [40]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

In [60]:
model_VED = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_VED.to(device)



VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [61]:
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

In [64]:
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.fromarray(image_path) # returns a numpy array
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model_VED.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds

In [65]:
caption = predict_step([image_textless])

In [66]:
caption

['a man wearing a suit and tie']

In [67]:
text = "This guy used to rule America in the 1920s"

# NER

In [49]:
multimodal = text + " " + caption[0] + " " + extract

In [50]:
multimodal

'This guy used to rule America in the 1920s a man wearing a suit and tie "Our greatest weakness lies in giving up: The most certain way to succeed is always to try just one more time:\' Thomas Edison Forbes '

DEPRECATION: pyraftlog 3.0.0 has a non-standard dependency specifier msgpack>="0.6.1". pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyraftlog or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
DEPRECATION: pyraftlog 3.0.0 has a non-standard dependency specifier redis>="3.0.0". pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyraftlog or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 393.8 kB/s eta 0:24:53
     ---------------------------------------- 0.2/587.7 MB 1.6 MB/s eta 0:06:18
     ---------------------------------------- 0.7/587.7 MB 3.7 MB/s eta 0:02:40
     ---------------------------------------- 2.0/587.7 MB 9.2 MB/s eta 0:01:04
     --------------------------------------- 3.6/587.7 MB 12.9 MB/s eta 0:00:46
     --------------------------------------- 5.3/587.7 MB 16.2 MB/s eta 0:00:36
     --------------------------------------- 7.0/587.7 MB 18.6 MB/s eta 0:00:32
      -------------------------------------- 8.3/587.7 MB 20.5 MB/s eta 0:00:29
      -------------------------------

### NER using BERT

In [None]:
prediction, model_output = model_bert.predict([multimodal])

In [69]:
prediction

[[{'This': 'O'},
  {'guy': 'O'},
  {'used': 'O'},
  {'to': 'O'},
  {'rule': 'O'},
  {'America': 'B-GEO'},
  {'in': 'O'},
  {'the': 'O'},
  {'1920s': 'B-TIM'},
  {'a': 'O'},
  {'man': 'O'},
  {'wearing': 'O'},
  {'a': 'O'},
  {'suit': 'O'},
  {'and': 'O'},
  {'tie': 'O'},
  {'"Our': 'O'},
  {'greatest': 'O'},
  {'weakness': 'O'},
  {'lies': 'O'},
  {'in': 'O'},
  {'giving': 'O'},
  {'up:': 'O'},
  {'The': 'O'},
  {'most': 'O'},
  {'certain': 'O'},
  {'way': 'O'},
  {'to': 'O'},
  {'succeed': 'O'},
  {'is': 'O'},
  {'always': 'O'},
  {'to': 'O'},
  {'try': 'O'},
  {'just': 'O'},
  {'one': 'O'},
  {'more': 'O'},
  {"time:'": 'O'},
  {'Thomas': 'B-PER'},
  {'Edison': 'I-PER'},
  {'Forbes': 'I-PER'}]]

### NER using spacy

In [51]:
import re
import spacy
from spacy import displacy

In [52]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


DEPRECATION: pyraftlog 3.0.0 has a non-standard dependency specifier msgpack>="0.6.1". pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyraftlog or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
DEPRECATION: pyraftlog 3.0.0 has a non-standard dependency specifier redis>="3.0.0". pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyraftlog or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [53]:
ner = spacy.load("en_core_web_lg")
ner

<spacy.lang.en.English at 0x1eb3ea1ae50>

In [54]:
doc = ner(multimodal)

In [55]:
doc.ents

(America, the 1920s, Thomas Edison)

In [56]:
displacy.render(doc, style="dep", jupyter=True)

In [57]:
displacy.render(doc, style="ent", jupyter=True)

In [58]:
entities = [(ent.text, ent.label_) for ent in doc.ents]
entities

[('America', 'GPE'), ('the 1920s', 'DATE'), ('Thomas Edison', 'PERSON')]