In [None]:
import pandas as pd


# Read the borehole table; the code below relies on its 'Hole_ID', 'Depth'
# and 'description' columns.
df = pd.read_csv('testbase.csv')

# Maps 'query_text<N>' to the text for borehole 'BH<N>': one
# "<Depth>m, <description>" line per matching row, in file order, joined
# with newlines. A borehole with no rows maps to the empty string.
query_texts = {}

for hole_number in range(1, 21):
    rows = df.loc[df['Hole_ID'] == f'BH{hole_number}'].copy()
    rows['formatted'] = rows['Depth'].astype(str) + 'm, ' + rows['description']
    query_texts[f'query_text{hole_number}'] = '\n'.join(rows['formatted'].tolist())

# Echo every assembled text under a banner naming its key (insertion order is
# query_text1 .. query_text20).
for key, text in query_texts.items():
    print(f'===== {key} =====')
    print(text)
    print()

===== query_text1 =====
0.0m, loose, gray to yellowish fine sand
7.4m, very soft, dark gray clay, occasional humus
18.7m, loose, gray coarse sand
19.5m, firm, gray to yellowish clay locally laminated, sand
25.0m, soft, dark gray clay, occasional humus
25.7m, firm, gray to yellowish clay locally laminated, sand
28.9m, soft to firm, gray clay
32.4m, medium dense, gray to yellowish coarse sand
34.0m, firm, gray clay
36.5m, weak, grayish yellow, pinkish red, grayish white, completely decomposed granite
37.6m, strong, grayish yellow, pinkish red, grayish white, moderately decomposed granite
38.2m, weak, grayish yellow, pinkish red, grayish white, completely decomposed granite
42.0m, weak, grayish yellow, pinkish red, grayish white, highly decomposed granite
48.32m, strong to very strong, grayish yellow pinkish red, grayish white, moderately decomposed granite

===== query_text2 =====
0.0m, loose, gray to yellowish fine sand
9.5m, loose, gray coarse sand, very soft clay
13.5m, very soft, dar

In [None]:
# Query texts as a plain list ordered query_text1 .. query_text20; the bare
# name on the last line makes the notebook display the list.
blocks = [query_texts[key] for key in map('query_text{}'.format, range(1, 21))]
blocks

['0.0m, loose, gray to yellowish fine sand\n7.4m, very soft, dark gray clay, occasional humus\n18.7m, loose, gray coarse sand\n19.5m, firm, gray to yellowish clay locally laminated, sand\n25.0m, soft, dark gray clay, occasional humus\n25.7m, firm, gray to yellowish clay locally laminated, sand\n28.9m, soft to firm, gray clay\n32.4m, medium dense, gray to yellowish coarse sand\n34.0m, firm, gray clay\n36.5m, weak, grayish yellow, pinkish red, grayish white, completely decomposed granite\n37.6m, strong, grayish yellow, pinkish red, grayish white, moderately decomposed granite\n38.2m, weak, grayish yellow, pinkish red, grayish white, completely decomposed granite\n42.0m, weak, grayish yellow, pinkish red, grayish white, highly decomposed granite\n48.32m, strong to very strong, grayish yellow pinkish red, grayish white, moderately decomposed granite',
 '0.0m, loose, gray to yellowish fine sand\n9.5m, loose, gray coarse sand, very soft clay\n13.5m, very soft, dark gray clay, occasionally, h

In [None]:
import re
import pandas as pd
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch

# Training table: the first CSV column is parsed as the index (index_col=0)
# and reset_index() then turns it back into an ordinary column.
df = (
    pd.read_csv(r"TRAIN.csv", header=0, index_col=0, encoding='utf-8')
    .reset_index()
)

# Tokenizer and encoder are loaded from the same checkpoint name; both are
# read as globals by get_embedding.
model_name = "moka-ai/m3e-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text, window_size=510, stride=255):
    """Embed ``text`` as the average of mean-pooled sliding-window chunks.

    The text is tokenized without special tokens using the module-level
    ``tokenizer``, cut into overlapping windows, and each window is wrapped
    in the tokenizer's CLS/SEP ids and passed through the module-level
    ``model``. A window's vector is the mean of ``last_hidden_state`` over
    every position of that window (CLS and SEP included); the result is the
    unweighted mean of the window vectors.

    Args:
        text: String to embed.
        window_size: Maximum number of content tokens per window (CLS/SEP
            are added on top of this).
        stride: Offset, in tokens, between the starts of consecutive windows.

    Returns:
        A 1-D numpy array of length ``model.config.hidden_size``. When the
        text produces no tokens, an all-zero vector of that length is
        returned instead of calling the model.
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)

    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + window_size]
        chunks.append([tokenizer.cls_token_id] + window + [tokenizer.sep_token_id])
        # Once a window reaches the last token, any later window would only
        # repeat a suffix that is already covered; emitting it would
        # double-weight the tail of the text in the final average.
        if start + window_size >= len(tokens):
            break

    if not chunks:
        return np.zeros(model.config.hidden_size)

    embeddings = []
    for chunk in chunks:
        input_ids = torch.tensor([chunk])  # batch of one sequence
        with torch.no_grad():
            outputs = model(input_ids=input_ids)
        # Mean over the sequence axis, then take the single batch row.
        embeddings.append(outputs.last_hidden_state.mean(dim=1)[0].numpy())

    return np.mean(embeddings, axis=0)

# 'DDL' is "<Description>, <Layer>" for every row; this is the text that is
# embedded and later returned by retrieval.
df['DDL'] = [
    f"{description}, {layer_code}"
    for description, layer_code in zip(df['Description'], df['Layer'])
]
df["embedding"] = df["DDL"].apply(get_embedding)

# Stack the per-row vectors into one (rows, hidden) matrix.
embeddings_array = np.vstack(df["embedding"].tolist())
print("Shape:", embeddings_array.shape)

# Flat L2 index over every training row.
dimension = embeddings_array.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array.astype('float32'))

print("Done")

In [None]:
import pandas as pd
import numpy as np
import faiss


# Layer codes to index, in the order their retrieved samples are concatenated
# further down.
layers = ['FIL', 'MD', 'ALL', 'CDG', 'HDG', 'GRA']

# Maps a layer code to {'data': that layer's rows with a fresh 0-based index,
# 'index': a flat L2 FAISS index over those rows' embeddings}. Vectors are
# added in row order, so position i in the FAISS index is row i of 'data'.
layer_systems = {}

for layer in layers:

    layer_df = df[df['Layer'] == layer].copy().reset_index(drop=True)

    if layer_df.empty:
        # Layers without training rows are skipped; consumers must check
        # membership in layer_systems before using a layer.
        print(f"Error：{layer} No data")
        continue

    if 'embedding' not in layer_df.columns:
        raise ValueError(f" 'embedding' in {layer} is missing")

    # Layer-specific names on purpose: binding these to `index` / `embeddings`
    # would overwrite the notebook-global full-corpus `index` built in the
    # previous cell with whichever layer happened to be processed last.
    layer_embeddings = np.stack(layer_df['embedding'].values).astype('float32')
    layer_index = faiss.IndexFlatL2(layer_embeddings.shape[1])
    layer_index.add(layer_embeddings)

    layer_systems[layer] = {
        'data': layer_df,
        'index': layer_index
    }

def retrieve_layer_samples(layer_system, query_text, sample_ratio=0.05):
    """Return the 'DDL' strings of the layer rows nearest to ``query_text``.

    Args:
        layer_system: Dict with 'data' (a DataFrame holding a 'DDL' column)
            and 'index' (an object whose ``search(query, k)`` returns
            ``(distances, ids)``, ids being positions into 'data').
        query_text: Text to embed with ``get_embedding`` and search for.
        sample_ratio: Fraction of the layer's rows to retrieve.

    Returns:
        List of 'DDL' values in the order the index returned them.
    """
    layer_rows = layer_system['data']
    layer_index = layer_system['index']
    row_count = len(layer_rows)

    # Ask for sample_ratio of the rows, capped at the layer size, but never
    # fewer than one neighbour.
    k = int(row_count * sample_ratio)
    if k > row_count:
        k = row_count
    if k < 1:
        k = 1

    query_vector = get_embedding(query_text).astype('float32').reshape(1, -1)
    _, neighbour_ids = layer_index.search(query_vector, k)

    # Keep only ids that address a real row; anything outside [0, row_count)
    # (e.g. the -1 FAISS uses for "no result") is dropped.
    in_range = []
    for row_id in neighbour_ids[0]:
        if 0 <= row_id < row_count:
            in_range.append(row_id)

    return layer_rows.iloc[in_range]['DDL'].tolist()

# RAG_T[k] is the retrieval context for blocks[k]: the nearest 'DDL' samples
# from every indexed layer, concatenated in `layers` order, one per line.
RAG_T = []
for block in blocks:
    block_results = []

    for layer in layers:
        if layer not in layer_systems:
            # No index was built for this layer, so there is nothing to search.
            continue

        samples = retrieve_layer_samples(
            layer_system=layer_systems[layer],
            query_text=block,
            sample_ratio=0.05
        )

        # Order-preserving de-duplication within the layer; samples are
        # already strings, so no re-formatting is needed.
        block_results.extend(dict.fromkeys(samples))

    RAG_T.append('\n'.join(block_results))

# One blank line before each block's context. The loop above deliberately
# prints nothing, so the output starts directly with the results.
for rag_text in RAG_T:
    print(f"\n{rag_text}")






















0 meter, loose, grayish yellow and gray, fine sand, FIL
0 meter, medium dense, brownish gray, grayish yellow and brownish red, gravelsly coarse sand, with occasional cobbles, FIL
3.5 meter, grayish white and grayish yellow, FIL
0 meter, grayish white and grayish yellow, cobbles & boulders, FIL
7.5 meter, grayish white and grayish yellow, FIL
0 meter, soft, dark gray to gray, mud, MD
9 meter, wet, soft, light yellow to yellow clay, MD
12 meter, wet, soft light gray to gray and light yellow clay, MD
6 meter, soft, gray to dark gray, clay, MD
20.3 meter, medium dense, gray, grayish white, spottled grayish yellow, coarse sand, locally with a little clay , ALL
38.8 meter, dense, grayish yellow, grayish white, coarse sand, ALL
41.5 meter, firm, grayish yellow and grayish white, clay, ALL
11.8 meter, medium dense, grayish white and grayish yellow, silty coarse sand, ALL
36.5 meter, dense, grayish white and grayish yellow, fine sand. , ALL
32.5 meter, medium dense, grayish