In [1]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer, TFBertModel
import zipfile
from docx import Document
from PyPDF2 import PdfReader
import textract
import os
from docx import Document
import subprocess
from tensorflow.keras.layers import Input, Dense, Concatenate, Layer 
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda
from tensorflow import keras
from tensorflow.keras.layers import Dropout

2024-05-29 07:33:10.692931: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# File paths
# Every location is derived from a single data root. The root defaults to the
# original machine-specific directory, and can be redirected by setting the
# PATENT_FILES_DIR environment variable so the notebook runs elsewhere without
# editing this cell.
data_root = os.environ.get('PATENT_FILES_DIR', '/home/vignes/Patent_Files')
# Serialized examples consumed by tf.data.TFRecordDataset further down.
tfrecord_file = os.path.join(data_root, 'tfrecords', 'Dataset_9.tfrecord')
# The trailing '' component makes os.path.join keep the final '/' that the
# original literals carried.
patent_folder = os.path.join(data_root, 'Mount_Patents', '')
standard_folder = os.path.join(data_root, 'Mount_Std', '')

In [3]:
# Create the distribution strategy and report how many replicas it keeps in sync.
strategy = tf.distribute.MirroredStrategy()
print(f"Number of devices: {strategy.num_replicas_in_sync}")

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Number of devices: 2


2024-05-29 07:33:13.690085: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-29 07:33:13.691106: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-29 07:33:13.722727: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [4]:
# Batch sizing.
# accumulation_steps is not used in the cells visible here; presumably it is the
# number of batches to accumulate gradients over -- confirm against the training loop.
accumulation_steps = 4
batch_size_per_replica = 8
# Each replica receives batch_size_per_replica examples, so the global batch
# scales with the number of replicas reported by the strategy.
global_batch_size = strategy.num_replicas_in_sync * batch_size_per_replica

In [5]:
# Display the global batch size (per-replica batch size times number of replicas).
global_batch_size

16

In [6]:
# Load the pretrained BERT encoder and the tokenizer for the same checkpoint name,
# so the vocabulary used for tokenization matches the model weights.
bert_model_name = 'bert-base-uncased'
# NOTE(review): the model is instantiated outside `strategy.scope()`. Confirm that
# the cells that build the trainable model create (or re-create) it inside the
# scope if it is meant to be trained under the MirroredStrategy defined above.
bert_model = TFBertModel.from_pretrained(bert_model_name)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [7]:
def extract_patent_text(patent_file):
    """Return the concatenated page text of the PDF stored at ``patent_file + ".pdf"``.

    Args:
        patent_file: Path to the patent *without* the ``.pdf`` extension; the
            extension is appended here.

    Returns:
        The text of all pages joined with no separator. Pages for which the
        reader yields no text are skipped. An empty string is returned when the
        file does not exist or when anything goes wrong while reading it (text
        from pages read before the failure is discarded); in both cases a
        message is printed.
    """
    pdf_path = patent_file + ".pdf"

    # Guard clause: nothing to read.
    if not os.path.isfile(pdf_path):
        print(f"File not found: {pdf_path}")
        return ""  # Return an empty string if the file is not found

    try:
        with open(pdf_path, 'rb') as handle:
            per_page = [page.extract_text() for page in PdfReader(handle).pages]
        # Drop pages with no extractable text (None or empty string).
        return "".join(fragment for fragment in per_page if fragment)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""  # Return an empty string instead of None

def extract_docx_text(docx_file):
    """Return the paragraph text of a ``.docx`` file, one paragraph per line.

    Only ``doc.paragraphs`` is read; text stored elsewhere in the document
    (e.g. tables) is presumably not included -- confirm if that matters.

    Args:
        docx_file: Path to the ``.docx`` file.

    Returns:
        Paragraph texts joined with ``"\\n"``, or an empty string if the file
        cannot be opened or parsed (the exception is printed, not raised).
    """
    try:
        body = '\n'.join(para.text for para in Document(docx_file).paragraphs)
    except Exception as e:
        print(f"Exception during DOCX text extraction: {e}")
        return ""
    return body

def extract_doc_text(doc_file):
    """Return the text of a legacy ``.doc`` file using the external ``antiword`` tool.

    The tool is invoked with an argument list rather than through a shell, so
    the path is passed to ``antiword`` verbatim: file names containing quotes,
    spaces or shell metacharacters can neither break the command nor inject
    additional commands.

    Args:
        doc_file: Path to the ``.doc`` file.

    Returns:
        The captured standard output of ``antiword`` on success. An empty
        string if ``antiword`` is missing, exits with a non-zero status, or any
        other error occurs (the error is printed, not raised).
    """
    try:
        result = subprocess.run(["antiword", doc_file], capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout
        # Route the failure through the common handler below.
        raise Exception(f"antiword failed: {result.stderr}")
    except Exception as e:
        print(f"Exception during DOC text extraction: {e}")
    return ""

def extract_text(file_path):
    """Extract text from a Word document, dispatching on the file extension.

    The extension check is case-insensitive, so ``SPEC.DOC`` and ``Spec.Docx``
    are handled the same way as their lowercase counterparts.

    Args:
        file_path: Path to the candidate file.

    Returns:
        The extracted text for ``.docx`` / ``.doc`` files, or an empty string
        for any other extension.
    """
    # Compare on a lowercased copy only; the original path is what gets opened.
    lowered = file_path.lower()
    if lowered.endswith('.docx'):
        return extract_docx_text(file_path)
    if lowered.endswith('.doc'):
        return extract_doc_text(file_path)
    return ""

def ensure_directory_path(directory_path):
    """Return ``directory_path`` with exactly the guarantee that it ends in ``'/'``.

    A path that already ends with a slash is returned unchanged; otherwise a
    single slash is appended (so an empty string becomes ``'/'``).
    """
    return directory_path if directory_path.endswith('/') else directory_path + '/'

def extract_standard_text(standard_dir):
    """Concatenate the text of every Word document under a standard's directory.

    Args:
        standard_dir: Either the directory itself or the path of the ``.zip``
            archive it was unpacked from; a trailing ``.zip`` (any case) is
            removed to obtain the directory. A trailing slash is optional.

    Returns:
        The text of each file found by walking the directory tree, each followed
        by a newline. Files with unsupported extensions contribute only the
        newline. An empty string is returned (and a message printed) when the
        directory does not exist.
    """
    # Strip only a genuine ".zip" suffix. An unconditional os.path.splitext()
    # would also cut dotted directory names such as "TS 125 321 v10.5.0" down
    # to "TS 125 321 v10.5" when no trailing slash is given.
    base, extension = os.path.splitext(standard_dir)
    if extension.lower() == '.zip':
        standard_dir = base
    standard_dir = ensure_directory_path(standard_dir)

    if not os.path.exists(standard_dir):
        print(f"Standard directory {standard_dir} does not exist.")
        return ""

    pieces = []
    for root, _, files in os.walk(standard_dir):
        for file in files:
            pieces.append(extract_text(os.path.join(root, file)) + "\n")
    return "".join(pieces)

In [8]:
def parse_tfrecord(example):
    """Decode one serialized example into a ``(patent, standard, match)`` tuple.

    Args:
        example: A serialized example, as yielded by the TFRecord dataset.

    Returns:
        Three scalar tensors: the ``patent`` and ``standard`` string features
        and the ``match`` int64 feature. The strings are presumably identifiers
        of the patent / standard documents -- confirm against the code that
        wrote the TFRecord.
    """
    # All three features are fixed-length scalars (shape []).
    schema = {
        'patent': tf.io.FixedLenFeature([], tf.string),
        'standard': tf.io.FixedLenFeature([], tf.string),
        'match': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, schema)
    patent, standard, match = parsed['patent'], parsed['standard'], parsed['match']
    return patent, standard, match

In [9]:
# Load the TFRecord dataset
# `dataset` yields the raw serialized records from `tfrecord_file`;
# `parsed_dataset` maps each record through `parse_tfrecord`, producing
# (patent, standard, match) tuples.
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(parse_tfrecord)

In [10]:
# Count the examples with one full pass over the parsed dataset.
num_samples = sum(1 for _ in parsed_dataset)

print(f"Total number of samples in the dataset: {num_samples}")

# 80/20 split: the training size is truncated to an integer and the validation
# set receives whatever remains, so the two sizes always add up to num_samples.
train_ratio = 0.8
train_size = int(train_ratio * num_samples)
val_size = num_samples - train_size

print(f"Number of samples in the training set: {train_size}")
print(f"Number of samples in the validation set: {val_size}")

Total number of samples in the dataset: 130476
Number of samples in the training set: 104380
Number of samples in the validation set: 26096


2024-05-29 07:33:41.713392: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
# Split the dataset into training and validation sets
# The first `train_size` records become the training set and the remainder the
# validation set.
# NOTE(review): no shuffle is applied in the cells visible here, so the split
# follows the record order of the TFRecord file -- confirm that order is not
# correlated with the `match` label.
train_dataset = parsed_dataset.take(train_size)
val_dataset = parsed_dataset.skip(train_size)

In [12]:
# Verify the split by iterating each subset once and counting its examples.
train_samples = sum(1 for _ in train_dataset)
val_samples = sum(1 for _ in val_dataset)

print(f"Number of samples in the training dataset: {train_samples}")
print(f"Number of samples in the validation dataset: {val_samples}")

2024-05-29 07:33:56.510380: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Number of samples in the training dataset: 104380
Number of samples in the validation dataset: 26096


2024-05-29 07:34:02.004492: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [13]:
# Chunk size passed to chunk_text below, counted in whitespace-separated words.
# NOTE(review): presumably chosen to match BERT's 512-position limit, but a
# 512-word chunk can tokenize to more than 512 word pieces -- confirm how the
# tokenizer call truncates.
max_length = 512
# Work on the first 5000 training examples only.
train_dataset_reduced = train_dataset.take(5000)

In [14]:
def chunk_text(text, max_length):
    """Split ``text`` into consecutive chunks of at most ``max_length`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk is those
    words re-joined with single spaces. The final chunk simply holds the
    remaining words: it is not padded, because padding a string with empty
    "words" only appends trailing spaces and does not make chunks equal-sized
    for any downstream consumer.

    Note that the limit counts words, not model tokens.

    Args:
        text: The text to split.
        max_length: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_length`` is not positive. A negative value would
            otherwise silently produce no chunks at all.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    words = text.split()
    return [
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]

In [17]:
# Sanity check on a single patent: extract its text and split it into word chunks.
max_length = 512
# No ".pdf" suffix here: extract_patent_text appends the extension itself.
document_path = '/home/vignes/Patent_Files/Mount_Patents/US20140029537A1'

document_text = extract_patent_text(document_path)
chunked_text = chunk_text(document_text, max_length)

# Print the resulting chunks
for chunk_number, chunk in enumerate(chunked_text, start=1):
    print(f"Chunk {chunk_number}: {chunk}")

Chunk 1: (19) United States US 2014002953TA1 (12) Patent Application Publication (10) Pub. No.: US 2014/0029537 A1 Golitschek Edler Von Ebwart et al. (43) Pub. Date: Jan. 30, 2014 (54) RESOURCE ASSIGNMENT FOR SINGLE AND MULTIPLE CLUSTER TRANSMISSION (75) Alexander Golitschek Edler von Elbwart, Darmstadt (DE); Akihiko Nishio, Hirakata (JP) Inventors: (73) Assignee: PANASONIC CORPORATION, Osaka (JP) (21) 14/006,095 (22) Appl. No.: PCT Fled: Mar. 8, 2012 (86). PCT No.: S371 (c)(1), (2), (4) Date: PCT/EP2012/001039 Oct. 14, 2013 (30) Foreign Application Priority Data Mar. 23, 2011 (EP) .............................. EP11159463.6 Publication Classification (51) Int. Cl. H047 72/04 (2006.01) One downlink slot T. slot (52) U.S. Cl. CPC ................................... H04W 72/042 (2013.01) USPC .......................................................... 370/329 (57) ABSTRACT This invention concerns concepts for signaling resource allo cation information to a terminal that indicates to the t

In [18]:
# Number of chunks produced for the patent loaded in the previous cell.
total_patent_chunks = len(chunked_text)
total_patent_chunks

31

In [19]:
# Print the resulting chunks and their lengths
for i, chunk in enumerate(chunked_text):
    chunk_length = len(chunk.split())
    print(f"Chunk {i + 1}: Length = {chunk_length}")
    print(chunk)
    print()

Chunk 1: Length = 512
(19) United States US 2014002953TA1 (12) Patent Application Publication (10) Pub. No.: US 2014/0029537 A1 Golitschek Edler Von Ebwart et al. (43) Pub. Date: Jan. 30, 2014 (54) RESOURCE ASSIGNMENT FOR SINGLE AND MULTIPLE CLUSTER TRANSMISSION (75) Alexander Golitschek Edler von Elbwart, Darmstadt (DE); Akihiko Nishio, Hirakata (JP) Inventors: (73) Assignee: PANASONIC CORPORATION, Osaka (JP) (21) 14/006,095 (22) Appl. No.: PCT Fled: Mar. 8, 2012 (86). PCT No.: S371 (c)(1), (2), (4) Date: PCT/EP2012/001039 Oct. 14, 2013 (30) Foreign Application Priority Data Mar. 23, 2011 (EP) .............................. EP11159463.6 Publication Classification (51) Int. Cl. H047 72/04 (2006.01) One downlink slot T. slot (52) U.S. Cl. CPC ................................... H04W 72/042 (2013.01) USPC .......................................................... 370/329 (57) ABSTRACT This invention concerns concepts for signaling resource allo cation information to a terminal that indic

In [20]:
# Report how many chunks the current `chunked_text` holds.
total_chunks = len(chunked_text)
print(f"Total number of chunks: {total_chunks}")

Total number of chunks: 31


In [23]:
# Sanity check on a single standard: gather the text of every Word file in its
# directory and split it into word chunks.
# Note: this reuses (and overwrites) document_path, document_text and
# chunked_text from the patent example.
max_length = 512
document_path = '/home/vignes/Patent_Files/Mount_Std/TS 125 321 v10.5.0/'

document_text = extract_standard_text(document_path)
chunked_text = chunk_text(document_text, max_length)

# Print the resulting chunks
for chunk_number, chunk in enumerate(chunked_text, start=1):
    print(f"Chunk {chunk_number}: {chunk}")

Chunk 1: 3GPP TS 25.321 V10.5.0 (2011-12) Technical Specification 3rd Generation Partnership Project; Technical Specification Group Radio Access Network; Medium Access Control (MAC) protocol specification (Release 10) [pic] The present document has been developed within the 3rd Generation Partnership Project (3GPP TM) and may be further elaborated for the purposes of 3GPP. The present document has not been subject to any approval process by the 3GPP Organisational Partners and shall not be implemented. This Specification is provided for future development work within 3GPP only. The Organisational Partners accept no liability for any use of this Specification. Specifications and reports for implementation of the 3GPP TM system should be obtained via the 3GPP Organisational Partners' Publications Offices. Keywords UMTS, radio 3GPP Postal address 3GPP support office address 650 Route des Lucioles - Sophia Antipolis Valbonne - FRANCE Tel.: +33 4 92 94 42 00 Fax: +33 4 93 65 47 16 Internet 

In [24]:
# NOTE(review): at this point `chunked_text` holds the chunks of the *standard*
# document loaded in the previous cell, so this overwrites `total_patent_chunks`
# with a standard's chunk count. The name is kept because later cells may read
# it -- confirm and consider a separate `total_standard_chunks` variable.
total_patent_chunks = len(chunked_text)
total_patent_chunks

83