In [1]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [21]:
# Mount Google Drive at /content/drive; every path used in this notebook
# lives under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Convert every PDF in the Drive folder to text and save it as one combined file

import os
import PyPDF2

source_folder = '/content/drive/My Drive'
combined_txt_path = '/content/drive/My Drive/text1.txt'

# Extracted text, one entry per PDF file
all_pdf_text = []

# os.listdir() returns names in arbitrary order; sorting keeps the layout of
# the combined file identical from one run to the next.
for filename in sorted(os.listdir(source_folder)):
    # Case-insensitive match so files named "*.PDF" are not silently skipped
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(source_folder, filename)

    # Extract text from PDF
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ''` keeps the join from raising if a page yields no text
        pdf_text = ' '.join((page.extract_text() or '') for page in pdf_reader.pages)
        all_pdf_text.append(pdf_text)

# Combine all PDF text into a single string, files separated by a blank line
combined_text = '\n\n'.join(all_pdf_text)

# Save combined text to a single TXT file
with open(combined_txt_path, 'w', encoding='utf-8') as combined_txt_file:
    combined_txt_file.write(combined_text)

In [None]:
!apt install ocrmypdf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ocrmypdf is already the newest version (13.4.0+dfsg-1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
# Convert pdf to text using ocr
# --sidecar names the text file that receives the OCR text (sidecar.txt on Drive).
# The output argument is "-" (stdout), which the shell redirects into stdout.txt
# in the current working directory.
# NOTE(review): --output-type none presumably suppresses the PDF output itself — confirm
# against the installed ocrmypdf version.
!ocrmypdf --sidecar /content/drive/My\ Drive/sidecar.txt --output-type none /content/drive/My\ Drive/wce.pdf - > stdout.txt

Scanning contents:   0% 0/33 [00:00<?, ?page/s]Scanning contents:  82% 27/33 [00:00<00:00, 266.64page/s]Scanning contents: 100% 33/33 [00:00<00:00, 286.88page/s]
Start processing 2 pages concurrently
    8 [tesseract] lots of diacritics - possibly poor OCR
   27 [tesseract] lots of diacritics - possibly poor OCR
OCR: 100% 33.0/33.0 [01:34<00:00,  2.86s/page]
Output sent to stdout


In [29]:
# Expand train_data/base_config.cfg into a complete training config (remaining
# values auto-filled) and save it as train_data/config/config.cfg.
# NOTE(review): the training cell further down reads
# "train_data/config2/config (1).cfg", not the file written here — confirm
# which config is the intended one.
!python -m spacy init fill-config "/content/drive/My Drive/train_data/base_config.cfg" "/content/drive/My Drive/train_data/config/config.cfg"

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/My Drive/train_data/config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [30]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Convert annotated examples into a spaCy ``DocBin``.

  Args:
    file: Writable text handle. One line is written for every annotation
      that could not be turned into a span and for every document that
      could not be added to the result.
    data: Iterable of ``(text, annotations)`` pairs, where
      ``annotations['entities']`` is a list of ``(start, end, label)``
      character-offset triples.

  Returns:
    A ``DocBin`` containing one ``Doc`` per example that was added
    successfully.
  """
  # A blank English pipeline is enough: only its tokenizer is used
  nlp = spacy.blank('en')
  db = DocBin()

  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets already claimed by an earlier annotation. A set gives
    # constant-time overlap checks instead of scanning a growing list.
    used_chars = set()

    for start, end, label in annot['entities']:
      span_chars = range(start, end)

      # First annotation wins: skip any annotation overlapping an earlier one
      if not used_chars.isdisjoint(span_chars):
        continue

      # Offsets are reserved before alignment is attempted, so an annotation
      # that is later rejected still blocks annotations overlapping it.
      used_chars.update(span_chars)

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as err:
        # Best effort: record why the annotation was dropped and keep going
        # (a bare except would also swallow KeyboardInterrupt).
        file.write(str([start, end]) + "    " + repr(err) + "\n")
        continue

      if span is None:
        # Offsets do not line up with token boundaries; log for later review
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as err:
      # The document is left out of the DocBin; log it instead of dropping
      # it silently so missing training examples can be traced.
      file.write("doc skipped: " + repr(err) + "    " + str(text) + "\n")

  return db

In [35]:
import os
# Convert each annotated training JSON file into a numbered .spacy file
directory = '/content/drive/My Drive/train_data/train_json'
output_dir = '/content/drive/My Drive/train_spacy'

# to_disk() needs the target folder to exist
os.makedirs(output_dir, exist_ok=True)

num = 1
# Open the error log once for the whole run. Re-opening it in 'w' mode for
# every JSON file truncated it each time, so only the last file's errors survived.
with open('/content/drive/My Drive/train_data/train_file.txt', 'w', encoding='utf-8') as file:
  # Sorted so the file -> number mapping is the same on every run
  for filename in sorted(os.listdir(directory)):
    if filename.endswith('.json'):
      filepath = os.path.join(directory, filename)
      # Context manager closes the JSON handle instead of leaking it
      with open(filepath, 'r', encoding='utf-8') as json_file:
        cv_data = json.load(json_file)
      db = get_spacy_doc(file, cv_data)
      db.to_disk(os.path.join(output_dir, f'train_data{num}.spacy'))
      num += 1

100%|██████████| 1/1 [00:00<00:00,  9.08it/s]
100%|██████████| 1/1 [00:00<00:00,  5.51it/s]
100%|██████████| 1/1 [00:00<00:00,  9.95it/s]
100%|██████████| 1/1 [00:00<00:00,  6.88it/s]
100%|██████████| 1/1 [00:00<00:00,  7.05it/s]
100%|██████████| 1/1 [00:00<00:00,  7.07it/s]
100%|██████████| 1/1 [00:00<00:00,  3.88it/s]
100%|██████████| 1/1 [00:00<00:00,  4.26it/s]
100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
100%|██████████| 1/1 [00:00<00:00,  3.39it/s]
100%|██████████| 1/1 [00:00<00:00,  3.40it/s]
100%|██████████| 1/1 [00:00<00:00,  4.10it/s]
100%|██████████| 1/1 [00:00<00:00,  6.60it/s]
100%|██████████| 1/1 [00:00<00:00,  4.73it/s]
100%|██████████| 1/1 [00:00<00:00, 14.25it/s]
100%|██████████| 1/1 [00:00<00:00,  7.00it/s]
100%|██████████| 1/1 [00:00<00:00,  7.21it/s]
100%|██████████| 1/1 [00:00<00:00,  4.94it/s]
100%|██████████| 1/1 [00:00<00:00, 15.49it/s]
100%|██████████| 1/1 [00:00<00:00,  9.59it/s]
100%|██████████| 1/1 [00:00<00:00, 14.28it/s]
100%|██████████| 1/1 [00:00<00:00,

In [37]:
import os
# Convert each annotated validation JSON file into a numbered .spacy file
directory = '/content/drive/My Drive/train_data/val_json'
output_dir = '/content/drive/My Drive/val_spacy'

# to_disk() needs the target folder to exist
os.makedirs(output_dir, exist_ok=True)

num = 1
# Validation errors go to their own log: writing to train_file.txt here
# truncated the error log produced by the training conversion. The log is
# opened once so errors from every JSON file are kept.
with open('/content/drive/My Drive/train_data/val_file.txt', 'w', encoding='utf-8') as file:
  # Sorted so the file -> number mapping is the same on every run
  for filename in sorted(os.listdir(directory)):
    if filename.endswith('.json'):
      filepath = os.path.join(directory, filename)
      # Context manager closes the JSON handle instead of leaking it
      with open(filepath, 'r', encoding='utf-8') as json_file:
        cv_data = json.load(json_file)
      db = get_spacy_doc(file, cv_data)
      db.to_disk(os.path.join(output_dir, f'val_data{num}.spacy'))
      num += 1

100%|██████████| 1/1 [00:00<00:00,  9.42it/s]
100%|██████████| 1/1 [00:00<00:00,  8.70it/s]
100%|██████████| 1/1 [00:00<00:00, 13.25it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:00<00:00,  7.86it/s]


In [27]:
# Upgrade spaCy, then download the en_core_web_lg pipeline package.
# The installer output asks for a runtime restart afterwards so the upgraded
# packages are the ones actually loaded.
# NOTE(review): no visible cell loads en_core_web_lg directly; presumably the
# training config references it — confirm.
!pip install -U spacy
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [38]:
# Train spaCy NER model
# --output is the directory the trained pipeline is saved to (the inference
# cell later loads output/model-best from it).
# --paths.train / --paths.dev point at the folders that hold the .spacy files
# written by the conversion cells above.
!python -m spacy train "/content/drive/My Drive/train_data/config2/config (1).cfg" \
    --output "/content/drive/My Drive/train_data/output" \
    --paths.train "/content/drive/My Drive/train_spacy" \
    --paths.dev "/content/drive/My Drive/val_spacy"

[38;5;4mℹ Saving to output directory: /content/drive/My
Drive/train_data/output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00   4022.17    0.00    0.00    0.00    0.00
  6     200      20445.83  66739.40   37.50   42.86   33.33    0.38
 12     400       5210.18   1782.45   65.62   75.00   58.33    0.66
 18     600        139.50    470.14   83.58   90.32   77.78    0.84
 25     800        192.82    348.93   72.00   69.23   75.00    0.72
 31    1000        896.61    361.67   77.97  100.00   63.89    0.78
 37    1200        170.81    145.01   64.79   65.71   63.89    0.65
 43    1400        323.33    177.78   82.54   96.30   72.22    0.83
 50    1600        335.54    155.67   80.56   80.56   80.56    0.81
 56    1800

In [None]:
# install any necessary packages
!pip install fitz
!pip install PyMuPDF

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl (20 kB)
Collecting configobj (from fitz)
  Downloading configobj-5.0.8-py2.py3-none-any.whl (36 kB)
Collecting configparser (from fitz)
  Downloading configparser-6.0.1-py3-none-any.whl (19 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.8.6-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.2-py3-none-any.whl (95 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.0-py3-none-any.whl (421 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.0-cp310-none-manylinux2014_x86_64.whl (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.0 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.0 PyMuPDFb-1.24.0


In [None]:
# Import the spaCy library
import spacy

# Load the trained spaCy NER model from the specified path
nlp = spacy.load('/content/drive/My Drive/train_data/output/model-best')

# Import necessary libraries for PDF processing
import sys
import fitz

# Specify the path to the PDF file
fname = '/content/drive/My Drive/train_data/wce.pdf'

# Open the PDF with PyMuPDF (fitz). The context manager closes the document
# as soon as the text has been read instead of leaving it open for the rest
# of the session. It is named pdf_doc so it is not confused with the spaCy
# Doc that the next cell stores in `doc`.
with fitz.open(fname) as pdf_doc:
  # Join the per-page text in one pass rather than growing a string page by
  # page. The leading " " is kept so the text matches earlier runs exactly.
  text = " " + "".join(str(page.get_text()) for page in pdf_doc)

# Display the extracted text
print(text)

  
 
 
 
 
 
 
                           
 
 
KUMPULAN EUROPLUS BERHAD 
(Company No. 534368 – A) 
(Incorporated in Malaysia) 
 
 
 
REPORTS AND FINANCIAL STATEMENTS 
31 MARCH 2015 
 
 
Company No. 534368 – A 
 
 
 
 
KUMPULAN EUROPLUS BERHAD 
(Incorporated in Malaysia) 
 
 
REPORTS AND FINANCIAL STATEMENTS 
FOR THE FINANCIAL YEAR ENDED 31 MARCH 2015 
 
 
CONTENTS 
PAGE 
 
 
 
 
DIRECTORS’ REPORT  
1 – 6 
 
 
 
 
FINANCIAL STATEMENTS  
 
 
 
STATEMENTS OF FINANCIAL POSITION 
7 – 8 
 
 
STATEMENTS OF PROFIT OR LOSS AND  
  OTHER COMPREHENSIVE INCOME 
9 – 10 
 
 
STATEMENTS OF CHANGES IN EQUITY 
11 – 13 
 
 
STATEMENTS OF CASH FLOWS 
14 – 17 
 
 
NOTES TO THE FINANCIAL STATEMENTS 
18 – 103 
 
 
 
 
SUPPLEMENTARY INFORMATION ON THE BREAKDOWN OF  
  REALISED AND UNREALISED PROFITS OR LOSSES 
104 
 
 
 
 
STATEMENT BY DIRECTORS 
105 
 
 
 
 
STATUTORY DECLARATION 
106 
 
 
 
 
INDEPENDENT AUDITORS’ REPORT 
107 – 109 
 
 
Company No. 534368 – A 
 
 
1 
 
 
KUMPULAN EUROPLUS BERHAD 
(Incorpor

In [None]:
# Run the trained NER pipeline over the text extracted from the PDF
doc = nlp(text)

# Marker printed between an entity's text and its label
arrow = "  ->>>>  "

# One line of output per recognised entity: "<text> ->>>> <label>"
for entity in doc.ents:
  print(entity.text, arrow, entity.label_)

31 MARCH 2015   ->>>>   FINANCIAL YEAR END
31 MARCH 2015   ->>>>   FINANCIAL YEAR END
TOTAL ASSETS
755,333   ->>>>   TOTAL ASSET
Loans and borrowings
19
6,416   ->>>>   BORROWINGS
Revenue
20
   ->>>>   REVENUE
Profit/(loss) before taxation
23
40,485   ->>>>   PROFIT LOSS BEFORE TAX
Profit/(loss) before taxation 
40,485   ->>>>   PROFIT LOSS BEFORE TAX
52,500
      
   ->>>>   BORROWINGS
Revenue from construction is recognised based on the stage of 
completion method as described in Note 2.3(g)   ->>>>   REVENUE
Revenue is recognised upon delivery of products and customers’ 
acceptance, net of sales tax, discounts and returns and when the 
significant risk and rewards of ownership have been passed to the 
buyer. 
Company No. 534368 – A 
 
 
   ->>>>   REVENUE
Revenue
10,065   ->>>>   REVENUE
Revenue
-
                    
12,216
           
   ->>>>   REVENUE
Revenue
-
                    
472,964   ->>>>   REVENUE
Revenue
216,717   ->>>>   REVENUE
Cash and bank balances
7,471   ->>>>  