In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.

In [2]:
from util import parse_markdown_to_dataframe
from tqdm import tqdm
from pathlib import Path
import os

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
src_dir = Path(os.getcwd())
data_dir_path = Path(src_dir, "data")
ISO26262_10_path = Path(data_dir_path, 'ISO26262-10.md')
ISO26262_8_path = Path(data_dir_path, 'ISO26262-8.md')
EN50128_path = Path(data_dir_path, 'EN50128.md')

In [5]:
df = parse_markdown_to_dataframe(EN50128_path)

In [6]:
df.head(15)

Unnamed: 0,Number,Heading,Clause
0,8,,The principles applied in developing high inte...
1,1,Scope,
2,1.1,,This European Standard specifies the process a...
3,1.2,,This European Standard is applicable exclusive...
4,1.3,,This European Standard is not relevant for sof...
5,1.4,,This European Standard applies to all safety r...
6,1.5,,This European Standard also addresses the use ...
7,1.6,,Software developed according to any version of...
8,1.7,,This European Standard considers that modern a...
9,1.8,,This European Standard is not intended to addr...


In [7]:
model_names = ['Alibaba-NLP/gte-Qwen2-1.5B-instruct', 'all-mpnet-base-v2', 'Alibaba-NLP/gte-large-en-v1.5']

In [8]:
tqdm.pandas()

In [10]:
from sentence_transformers import SentenceTransformer

In [14]:
model = SentenceTransformer(model_names[1], trust_remote_code=True)

In [12]:
def get_embeddings_batch(texts, model, batch_size=16):
    #model = model.to('cuda')
    return model.encode(texts, batch_size=batch_size, show_progress_bar=True, device='cuda')

In [9]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [13]:
from torch.cuda.amp import autocast

In [15]:
df[f"embed_{model_names[1]}"] = list(get_embeddings_batch(df['Clause'].tolist(), model=model, batch_size=16))

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

In [11]:
import torch

In [19]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 1            |        cudaMalloc retries: 1         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16145 MiB |  20171 MiB |  43853 MiB |  27707 MiB |
|       from large pool |  16143 MiB |  20168 MiB |  43846 MiB |  27702 MiB |
|       from small pool |      2 MiB |      3 MiB |      7 MiB |      4 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  16145 MiB |  20171 MiB |  43853 MiB |  27707 MiB |
|       from large pool |  16143 MiB |  20168 MiB |  43846 MiB |  27702 MiB |
|       from small pool |      2 MiB |      3 MiB |      7 MiB |      4 MiB |
|---------------------------------------------------------------

/bin/bash: line 1: nvidia-smi: command not found


In [14]:
import gc
del variables
gc.collect()

NameError: name 'variables' is not defined

In [13]:
torch.cuda.empty_cache()

In [16]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [20]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [10]:
!ps -p 5442

  PID TTY          TIME CMD


In [3]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [30]:
import torch

# Total GPU memory
total_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)  # Convert to GB

# Allocated GPU memory
allocated_memory = torch.cuda.memory_allocated(0) / (1024 ** 3)  # Convert to GB

# Cached GPU memory
cached_memory = torch.cuda.memory_reserved(0) / (1024 ** 3)  # Convert to GB

print(f"Total Memory: {total_memory:.2f} GB")
print(f"Allocated Memory: {allocated_memory:.2f} GB")
print(f"Cached Memory: {cached_memory:.2f} GB")

Total Memory: 21.98 GB
Allocated Memory: 14.55 GB
Cached Memory: 19.69 GB


In [20]:
torch.cuda.empty_cache()

In [16]:
df.head()

Unnamed: 0,Number,Heading,Clause,embed_Alibaba-NLP/gte-large-en-v1.5,embed_all-mpnet-base-v2
0,8.0,,The principles applied in developing high inte...,"[-0.68813276, -0.590403, 1.2712662, -0.0412018...","[0.0037549057, -0.050997127, 0.0053323926, -0...."
1,1.0,Scope,,"[-0.040462002, -0.2670594, 0.042161457, 0.0144...","[-0.012503407, 0.06143875, -0.006734512, 0.025..."
2,1.1,,This European Standard specifies the process a...,"[-0.4599365, -0.40616006, 0.9185905, 0.3532599...","[-0.040297795, -0.050905235, -0.026976524, 0.0..."
3,1.2,,This European Standard is applicable exclusive...,"[-0.75160563, -0.83683866, -0.33406588, -0.495...","[-0.0038164675, -0.07827046, 0.011192447, 0.00..."
4,1.3,,This European Standard is not relevant for sof...,"[-1.1059895, -0.0582044, -0.59873945, -0.54984...","[-0.031497475, -0.019634753, 0.015145084, -0.0..."


In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Stack embeddings into a numpy array
embeddings = np.vstack(df['embed_Alibaba-NLP/gte-large-en-v1.5'].values)

# Step 2: Compute cosine similarity matrix
similarity_matrix = cosine_similarity(embeddings)

# Step 3: Verify each embedding is closest to itself
results = []
for i in range(len(embeddings)):
    closest_index = np.argmax(similarity_matrix[i])  # Find the index of the highest similarity
    if closest_index == i:
        results.append((df['Clause'][i], 'Pass', similarity_matrix[i][i]))
    else:
        results.append((df['Clause'][i], f'Closest to {df["Clause"][closest_index]}', similarity_matrix[i][closest_index]))

# Step 4: Create a results DataFrame
results_df = pd.DataFrame(results, columns=['Clause', 'Check', 'Similarity'])

# Display the results
print(results_df)


                                                Clause Check  Similarity
0    The principles applied in developing high inte...  Pass         1.0
1                                                       Pass         1.0
2    This European Standard specifies the process a...  Pass         1.0
3    This European Standard is applicable exclusive...  Pass         1.0
4    This European Standard is not relevant for sof...  Pass         1.0
..                                                 ...   ...         ...
708  An ancillary data structure depicting the domi...  Pass         1.0
709  An ancillary data structure depicting the domi...  Pass         1.0
710  An ancillary data structure depicting the domi...  Pass         1.0
711  An ancillary data structure depicting the domi...  Pass         1.0
712  An ancillary data structure depicting the domi...  Pass         1.0

[713 rows x 3 columns]
