In [1]:
# Tensorflow backends 
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense, Concatenate, Layer, GlobalAveragePooling1D, Reshape, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda
from tensorflow import keras

# Torch backends
import torch

# System essentials 
import chardet 
import subprocess
import json
import time 
import re
import zipfile
from docx import Document
from PyPDF2 import PdfReader
import textract
import os
from docx import Document
import subprocess
import numpy as np
from typing import List, Union
from sklearn.metrics.pairwise import cosine_similarity

2024-08-07 18:31:39.065422: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# File Paths
tfrecord_file = '/home/vignes/Patent_Files/tfrecords/Dataset_json.tfrecord'
patent_folder = '/home/vignes/Patent_Files/zip-Patents/'
standard_folder = '/home/vignes/Patent_Files/Mount_Std/'

In [3]:
bert_model_name = 'bert-base-uncased'
bert_model = TFBertModel.from_pretrained(bert_model_name)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

2024-08-07 18:31:43.801708: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-07 18:31:43.802719: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-07 18:31:43.804640: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [22]:
def split_into_paragraphs(text):
    # Split by double newlines or other paragraph indicators
    paragraphs = re.split(r'\n\n|\r\n\r\n|\n\s*\n', text)
    # Remove empty paragraphs and strip whitespace
    return [p.strip() for p in paragraphs if p.strip()]

def extract_patent_text(patent_file):
    patent_text = ""
    json_file = patent_file + ".json"
    if os.path.isfile(json_file):
        try:
            with open(json_file, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if isinstance(data, dict) and 'text' in data:
                    patent_text = data['text']
                elif isinstance(data, str):
                    patent_text = data
        except Exception as e:
            print(f"Error reading {json_file}: {e}")
    else:
        print(f"File not found: {json_file}")
    
    return split_into_paragraphs(patent_text)

def extract_docx_text(docx_file):
    try:
        doc = Document(docx_file)
        return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
    except Exception as e:
        print(f"Exception during DOCX text extraction: {e}")
    return ""

def extract_doc_text(doc_file):
    try:
        command = f"antiword '{doc_file}'"
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout
        else:
            raise Exception(f"antiword failed: {result.stderr}")
    except Exception as e:
        print(f"Exception during DOC text extraction: {e}")
    return ""

def extract_text(file_path):
    if file_path.endswith('.docx'):
        return extract_docx_text(file_path)
    elif file_path.endswith('.doc'):
        return extract_doc_text(file_path)
    else:
        return ""

def ensure_directory_path(directory_path):
    if not directory_path.endswith('/'):
        directory_path += '/'
    return directory_path

def extract_standard_text(standard_dir):
    standard_text = ""
    standard_dir = ensure_directory_path(os.path.splitext(standard_dir)[0])
    if os.path.exists(standard_dir):
        for root, _, files in os.walk(standard_dir):
            for file in files:
                file_path = os.path.join(root, file)
                text = extract_text(file_path)
                standard_text += text + "\n\n"
    else:
        print(f"Standard directory {standard_dir} does not exist.")
    
    return split_into_paragraphs(standard_text)

In [9]:
def parse_tfrecord(example):
    feature_description = {
        'patent': tf.io.FixedLenFeature([], tf.string),
        'standard': tf.io.FixedLenFeature([], tf.string),
        'match': tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(example, feature_description)
    return example['patent'], example['standard'], example['match']

In [10]:
# Load the TFRecord dataset
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(parse_tfrecord)

In [11]:
num_samples = 0
for _ in parsed_dataset:
    num_samples += 1

print(f"Total number of samples in the dataset: {num_samples}")

train_ratio = 0.8
train_size = int(train_ratio * num_samples)
val_size = num_samples - train_size

print(f"Number of samples in the training set: {train_size}")
print(f"Number of samples in the validation set: {val_size}")

Total number of samples in the dataset: 130476
Number of samples in the training set: 104380
Number of samples in the validation set: 26096


2024-08-07 18:37:10.154809: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [12]:
# Split the dataset into training and validation sets
train_dataset = parsed_dataset.take(train_size)
val_dataset = parsed_dataset.skip(train_size)

In [13]:
def chunk_text(text, max_length):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_length):
        chunk = words[i:i + max_length]
        chunk = chunk + [''] * (max_length - len(chunk))  # Pad with empty strings
        chunks.append(' '.join(chunk))
    return chunks 

In [20]:
def dynamic_chunk_text(text, min_chunks=3, max_chunks=10, min_words_per_chunk=50):
    # Check if input is a string or a list
    if isinstance(text, str):
        words = text.split()
    elif isinstance(text, list):
        words = text
    else:
        raise ValueError("Input must be a string or a list of words")
    
    total_words = len(words)
    
    # Determine the optimal number of chunks
    if total_words <= min_chunks * min_words_per_chunk:
        num_chunks = max(1, total_words // min_words_per_chunk)
    elif total_words >= max_chunks * min_words_per_chunk:
        num_chunks = max_chunks
    else:
        num_chunks = total_words // min_words_per_chunk
    
    # Ensure num_chunks is within the specified range
    num_chunks = max(min_chunks, min(num_chunks, max_chunks))

    # Calculate the base chunk size and remainder
    base_chunk_size = total_words // num_chunks
    remainder = total_words % num_chunks
    
    chunks = []
    start = 0
    
    for i in range(num_chunks):
        # Add one extra word to some chunks to distribute the remainder
        end = start + base_chunk_size + (1 if i < remainder else 0)
        chunk = ' '.join(words[start:end]) if isinstance(text, str) else words[start:end]
        chunks.append(chunk)
        start = end
    
    return chunks, len(chunks)

# Example usage
fileread = '/home/vignes/Patent_Files/zip-Patents/US20180014294'
text = extract_patent_text(fileread)
print("Total words:", len(text.split()) if isinstance(text, str) else len(text))
print("First few elements:", text[:5] if isinstance(text, list) else text[:100])

result, chunk_count = dynamic_chunk_text(text, min_chunks=3, max_chunks=20, min_words_per_chunk=100)
print(f"Number of chunks: {chunk_count}")
for i, chunk in enumerate(result, 1):
    print(f"Chunk {i}: {chunk[:100] if isinstance(chunk, str) else chunk[:5]}...")  # Print first 100 chars or 5 elements

Total words: 4
First few elements: ['System and Method for Resource Block-Specific Control Signaling\nAbstract\nA system and method is disclosed for resource block-specific control signaling in a communication system. Communication data is transmitted using a transmission channel comprising a plurality of resource blocks defined by allocating time-frequency slots in a transmission resource. Resource block control information is transmitted in a “feed-forward” manner to a user end (UE) or group of UEs using channels physically mapped into scheduled resource blocks (RBs) for that user or group of users. Embodiments of the invention provide an RB-specific control channel that comprises RB control elements that are embedded within scheduled resource blocks. The invention, therefore, reduces the amount of control information that must be transmitted by common or shared control channels.\nImages (7)\nClassifications', 'H04W72/0406\nView 6 more classifications\nUS20180014294A1\nUnited States\

In [28]:


class SemanticTextChunker:
    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2", 
                 breakpoint_threshold_type: str = "percentile",
                 threshold: float = 0.75):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.breakpoint_threshold_type = breakpoint_threshold_type
        self.threshold = threshold

    def get_embedding(self, text: str) -> np.ndarray:
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()[0]

    def split_text(self, paragraphs: List[str]) -> List[str]:
        embeddings = [self.get_embedding(paragraph) for paragraph in paragraphs]
        
        similarities = cosine_similarity(embeddings)
        differences = 1 - similarities

        if self.breakpoint_threshold_type == "percentile":
            breakpoint_threshold = np.percentile(differences, self.threshold * 100)
        elif self.breakpoint_threshold_type == "standard_deviation":
            breakpoint_threshold = np.mean(differences) + self.threshold * np.std(differences)
        elif self.breakpoint_threshold_type == "interquartile":
            q1, q3 = np.percentile(differences, [25, 75])
            iqr = q3 - q1
            breakpoint_threshold = q3 + self.threshold * iqr
        else:
            raise ValueError("Invalid breakpoint_threshold_type")

        chunks = []
        current_chunk = [paragraphs[0]]

        for i in range(1, len(paragraphs)):
            if differences[i-1][i] > breakpoint_threshold:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = [paragraphs[i]]
            else:
                current_chunk.append(paragraphs[i])

        chunks.append('\n\n'.join(current_chunk))
        return chunks

def extract_patent_text(patent_file):
    patent_text = ""
    json_file = patent_file + ".json"
    if os.path.isfile(json_file):
        try:
            with open(json_file, 'r', encoding='utf-8') as file:
                data = json.load(file)
                if isinstance(data, dict) and 'text' in data:
                    patent_text = data['text']
                elif isinstance(data, str):
                    patent_text = data
        except Exception as e:
            print(f"Error reading {json_file}: {e}")
    else:
        print(f"File not found: {json_file}")
    
    return split_into_paragraphs(patent_text)

def split_into_paragraphs(text):
    # This is a placeholder. Replace with your actual implementation
    return text.split('\n\n')

file_path = '/home/vignes/Patent_Files/zip-Patents/US20180026824'
paragraphs = extract_patent_text(file_path)

chunker = SemanticTextChunker(breakpoint_threshold_type="percentile", threshold=0.75)
chunks = chunker.split_text(paragraphs)

# print(f"Number of chunks: {len(chunks)}")
# for i, chunk in enumerate(chunks, 1):
#     print(f"\nChunk {i} (length: {len(chunk)}):")
#     print(chunk[:100] + "..." + chunk[-100:] if len(chunk) > 200 else chunk)

Number of chunks: 2

Chunk 1 (length: 794):
Adaptive Time Diversity And Spatial Diversity For OFDM
Abstract
An adaptable orthogonal frequency-di...atrix and a ratio of the largest and smallest eigen values of the matrix.
Images (8)
Classifications

Chunk 2 (length: 52440):


H04L27/2646 Arrangements specific to the transmitter only using feedback from receiver for adjusti... failure to pay maintenance fee
Effective date: 20230903
Data provided by IFI CLAIMS Patent Services


In [31]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(chunks)
print(embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[ 2.20199004e-02 -1.76723804e-02 -6.03800155e-02 -4.56724241e-02
  -4.10895906e-02  4.94031981e-02 -5.04542002e-03 -3.46747860e-02
   7.55211851e-03 -1.76970549e-02 -2.93651819e-02  7.88685903e-02
   2.04386488e-02  3.79551910e-02 -6.04382493e-02 -2.83684004e-02
   2.62689646e-02 -1.70277071e-03 -2.04865145e-03 -5.76549815e-03
   7.61474669e-02 -1.29880253e-02 -3.36127467e-02  4.99258488e-02
  -7.77339982e-03 -9.11819562e-02  5.93825802e-02  4.46539093e-03
  -5.48449287e-04 -5.82571607e-04  2.45136232e-03  1.12033710e-01
   9.54284221e-02 -6.61975145e-02 -1.00323804e-01  1.77292014e-03
   3.96532230e-02  2.11837534e-02  1.08097240e-01  1.15370668e-01
  -4.49628122e-02  1.82478204e-02  1.30060194e-02  8.40304494e-02
  -5.77078536e-02 -6.51921704e-02 -3.35158370e-02 -9.96775553e-02
  -2.25151610e-02 -2.40419842e-02  1.13361456e-01  8.66319314e-02
  -6.55503869e-02 -3.79325859e-02 -6.74614161e-02 -4.93604243e-02
  -4.24007513e-02  2.93779969e-02 -1.62065197e-02  3.47534828e-02
  -8.31341

In [33]:
embeddings.shape

(2, 384)

In [34]:
file_path = '/home/vignes/Patent_Files/zip-Patents/US2022004533 A1'
paragraphs = extract_patent_text(file_path)

chunker = SemanticTextChunker(breakpoint_threshold_type="percentile", threshold=0.75)
chunks = chunker.split_text(paragraphs)

In [37]:
print(f"Number of chunks: {len(chunks)}")
for i, chunk in enumerate(chunks, 1):
    print(f"\nChunk {i} (length: {len(chunk)}):")
    print(chunk[:100] + "..." + chunk[-100:] if len(chunk) > 200 else chunk)

Number of chunks: 3

Chunk 1 (length: 897):
Technique for concurrency control
Abstract
A technique for concurrency control of transactions in a ...based on checking whether the access request belongs to the transaction.
Images (11)
Classifications

Chunk 2 (length: 0):


Chunk 3 (length: 64466):
G06F9/466 Transaction processing
View 3 more classifications
US20220004533A1
United States
Download ... 20 0.000
Show all concepts from the description section
Data provided by IFI CLAIMS Patent Services


In [35]:
embeddings = model.encode(chunks)
print(embeddings)

[[-0.02619893 -0.00393007 -0.11366265 ...  0.05996513  0.0092875
   0.01647916]
 [-0.11883835  0.04829851 -0.00254819 ...  0.1264095   0.04654908
  -0.0157173 ]
 [-0.08920644 -0.04745202 -0.03317064 ...  0.04032691 -0.02105517
  -0.0363211 ]]


In [38]:
embeddings.shape

(3, 384)

In [40]:
file_path = '/home/vignes/Patent_Files/Mount_Std/TS 124 385 v18.0.0/'
paragraphs = extract_standard_text(file_path)

chunker = SemanticTextChunker(breakpoint_threshold_type="percentile", threshold=0.75)
chunks = chunker.split_text(paragraphs)

In [41]:
chunks

['\nContents\nForeword\t9\n1\tScope\t10\n2\tReferences\t10\n3\tDefinitions and abbreviations\t11\n3.1\tDefinitions\t11\n3.2\tAbbreviations\t11\n4\tV2X Communication Provisioning MO\t11\n4.1\tOverview\t11\n5\tMO configuration parameters\t17\n5.1\tGeneral\t17\n5.2\tNode: <X>\t17\n5.3\t<X>/Name\t18\n5.4\tConfiguration parameters for V2X provisioning\t18\n5.4.1\t<X>/V2XProvisioning\t18\n5.4.2\t<X>/V2XProvisioning/V2XControlFunctionAddress\t18\n5.4.3\t<X>/V2XProvisioning/ToConRefs\t18\n5.4.4\t<X>/V2XProvisioning/ToConRefs/<X>\t18\n5.4.5\t<X>/V2XProvisioning/ToConRefs/<X>/ConRef\t19\n5.5\tConfiguration parameters for V2X communication over PC5\t19\n5.5.1\t<X>/V2XoverPC5\t19\n5.5.2\t<X>/V2XoverPC5/Expiration\t19\n5.5.3\t<X>/V2XoverPC5/ServedByEUTRAN\t19\n5.5.4\t<X>/V2XoverPC5/ServedByEUTRAN/AuthorizedPLMNs\t19\n5.5.5\t<X>/V2XoverPC5/ServedByEUTRAN/AuthorizedPLMNs/<X>\t20\n5.5.6\t<X>/V2XoverPC5/ServedByEUTRAN/AuthorizedPLMNs/<X>/PLMN\t20\n5.5.7\t<X>/V2XoverPC5/NotServedByEUTRAN\t20\n5.5.8\t<X>

In [42]:
print(f"Number of chunks: {len(chunks)}")
for i, chunk in enumerate(chunks, 1):
    print(f"\nChunk {i} (length: {len(chunk)}):")
    print(chunk[:100] + "..." + chunk[-100:] if len(chunk) > 200 else chunk)

Number of chunks: 36

Chunk 1 (length: 92413):

Contents
Foreword	9
1	Scope	10
2	References	10
3	Definitions and abbreviations	11
3.1	Definitions	1...e PUBLIC "-//OMA//DTD-DM-DDF 1.2//EN" 
"http://www.openmobilealliance.org/tech/DTD/dm_ddf-v1_2.dtd">

Chunk 2 (length: 105):
<MgmtTree>
	<VerDTD>1.2</VerDTD>
	<Man>--The device manufacturer--</Man>
	<Mod>--The device model--</Mod>

Chunk 3 (length: 4390):
	<Node>
		<NodeName/>
		<DFProperties>
			<AccessType>
				<Get/>
			</AccessType>
			<Description>V...r V2X communication.</DFTitle>
						<DFType>
							<DDFName/>
						</DFType>
					</DFProperties>

Chunk 4 (length: 6607):
					<Node>
						<NodeName></NodeName>
						<DFProperties>
							<AccessType>
								<Get/>
				...				<DFType>
									<MIME>text/plain</MIME>
								</DFType>
							</DFProperties>
						</Node>

Chunk 5 (length: 35):
					</Node>
				</Node>
			</Node>

Chunk 6 (length: 388):
			<Node>
				<NodeName>AuthorizedV2XServiceList</NodeName>
				<DFProperties>

In [43]:
embeddings = model.encode(chunks)
print(embeddings)

[[-0.012714   -0.05649872  0.0118188  ...  0.04505105 -0.03653864
  -0.01637089]
 [-0.04761715  0.01381278 -0.01123928 ...  0.0914061   0.01876348
  -0.01372676]
 [-0.06570857  0.01901135 -0.00460909 ...  0.05670219  0.04818695
   0.03425635]
 ...
 [-0.01802369  0.03168852 -0.03477868 ... -0.00452211  0.08130272
   0.10101572]
 [-0.01810626  0.08527588  0.04912461 ...  0.01577308 -0.03452191
   0.03342958]
 [-0.1188384   0.04829875 -0.00254806 ...  0.12640944  0.0465491
  -0.01571724]]


In [45]:
embeddings.shape

(36, 384)