In [1]:
import re
import pandas as pd
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch

df = pd.read_csv(r"DF.csv", header=0, index_col=0, encoding='utf-8').reset_index()

model_name = "moka-ai/m3e-base"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text, window_size=510, stride=255):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i+window_size]
        if len(chunk) > window_size:
            chunk = chunk[:window_size]
        chunks.append([tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id])
    
    embeddings = []
    for chunk in chunks:
        inputs = {"input_ids": torch.tensor([chunk])}
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.config.hidden_size)

# 创建新的列 'DDL'，整合 'Description' 和 'Layer'
df['DDL'] = df.apply(lambda row: f"{row['Description']}, {row['Layer']}", axis=1)

# 为 'DDL' 列生成嵌入
df["embedding"] = df["DDL"].apply(get_embedding)

# 将嵌入转换为 NumPy 数组
embeddings_array = np.vstack(df["embedding"].values)
print("Shape:", embeddings_array.shape)  

# 创建 FAISS 索引
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array.astype('float32'))  

print("Done")
#

Shape: (713, 768)
Done


In [None]:
model_name = "moka-ai/m3e-base"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text, window_size=510, stride=255):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i+window_size]
        if len(chunk) > window_size:
            chunk = chunk[:window_size]
        chunks.append([tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id])
    
    embeddings = []
    for chunk in chunks:
        inputs = {"input_ids": torch.tensor([chunk])}
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.config.hidden_size)

# 创建新的列 'DDL'，整合 'Description' 和 'Layer'
df['DDL'] = df.apply(lambda row: f"{row['Description']}, {row['Layer']}", axis=1)

# 为 'DDL' 列生成嵌入
df["embedding"] = df["DDL"].apply(get_embedding)

# 将嵌入转换为 NumPy 数组
embeddings_array = np.vstack(df["embedding"].values)
print("Shape:", embeddings_array.shape)  

# 创建 FAISS 索引
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array.astype('float32'))  

print("Done")
#
layers = ['FIL', 'MD', 'ALL', 'CDG', 'HDG', 'GRA']

# 创建六个空的 DataFrame
df_dict = {layer: pd.DataFrame() for layer in layers}

# 将每个 layer 的行分别保存到对应的 DataFrame 中
for layer in layers:
    df_dict[layer] = df[df['Layer'] == layer]

def create_rag_text(df, query_text, sample_ratio=0.05):
    if df.empty:
        return []
    
    sample_size = max(int(len(df) * sample_ratio), 1)
    sample_size = min(sample_size, len(df))
    
    query_embedding = get_embedding(query_text)
    query_embedding = np.array([query_embedding], dtype='float32')
    
    distances, indices = index.search(query_embedding, sample_size)
    sampled_indices = indices[0]
    
    valid_indices = [idx for idx in sampled_indices if idx < len(df)]
    if not valid_indices:
        return []
    
    sampled_df = df.iloc[valid_indices]
    return sampled_df['DDL'].tolist()


RAG_T=[]
for block in blocks:
# block=blocks[0]
    all_samples = []
    for layer in layers:
        samples = create_rag_text(df_dict[layer], query_text=block)
        all_samples.extend(samples)

    # 去重并整合成 RAG_text
    rag_text = '\n'.join(list(dict.fromkeys(all_samples)))
    RAG_T.append(rag_text)
    print(rag_text)
    print('===============')

In [None]:
query_text1 ="""
0m,loose, grey to yellowish fine sand
7.4m,very soft, dark grey clay, with occasional humus
18.7m,loose, grey coarse sand
19.5m,firm, grey to yellowish clay locally laminated with sand
25m,soft, dark grey clay, with occasional humus
25.7m,firm, grey to yellowish clay locally laminated with sand
28.9m,soft to firm, grey clay
32.4m,medium dense, grey to yellowish coarse sand
34m,firm, grey clay
36.5m,weak, grayish yellow, pinkish red and grayish white, completely decomposed granite
37.6m,strong, grayish yellow, pinkish red and grayish white, moderately decomposed granite
38.2m,weak, grayish yellow, pinkish red and grayish white, completely decomposed granite
42m,weak, grayish yellow, pinkish red and grayish white, highly decomposed granite
48.32m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text2 = """
0m,loose, grey to yellowish fine sand
9.5m,loose, grey coarse sand with very soft clay
13.5m,very soft, dark grey clay, occasionally with humus, fine sand and shells
16.8m,firm to stiff, grey to yellowish clay locally laminated with sand
20.5m,very soft, dark grey clay, occasionally with organic materials
23.2m,strong, grayish yellow, pinkish red and grayish white, moderately decomposed granite
24.57m,stiff to very stiff, grayish yellow residual soil
28.72m,strong to very strong, grayish yellow, pinkish red and grayish white, moderately decomposed granite
"""
query_text3 = """
0m,loose, grey to yellowish fine sand, with cobbles
9.5m,loose, greyish yellow coarse sand with shell fragments
15.4m,soft, dark grey clay, occasionally with humus and sand
18m,stiff, greyish yellow clay, locally laminated with sand
21m,very soft, dark grey clay, occasionally with humus
25m,stiff to very stiff, grayish yellow sandy clay 
28.5m,weak, grayish yellow, pinkish red and grayish white, completely decomposed granite
36.37m,strong, grayish yellow, pinkish red and grayish white, highly decomposed granite
44.7m,strong to very strong, grayish yellow, pinkish red and grayish white, moderately decomposed granite
"""
query_text4 = """
0m,cobbles with concrete and construction waste
9.5m,loose, grey to yellowish grey fine sand
11.5m,soft to firm, dark grey clay, with organic materials and shell fragments
15m,loose, grey to yellowish grey medium sand
17m,soft, dark grey clay, occasionally with organic materials and fine sand
17.7m,stiff to very stiff, grayish yellow residual soil
20.66m,moderately strong to strong, greyish yellow, yellowish pink, moderately decomposed medium to coarse grained granite
"""
query_text5 = """
0m,boulders involved with sand, granite
7.8m,loose, grey fine sand, with shell fragments, occasionally with very soft clay
10.2m,very soft, dark grey clay, occasionally with organic materials and shell fragments
13.5m,loose to medium dense, grey to yellowish grey coarse sand
17.6m,firm, grey to yellowish grey clay occasionally with sand
20.5m,medium dense, grey to yellowish grey coarse clayey sand
21.5m,firm, grey to yellowish grey coarse sandy clay
24.2m,firm, grey clay occasionally with fine sand and organic materials
26.5m,loose, grey to yellowish grey coarse clayey sand
27.6m,medium dense, grey to yellowish grey coarse sand
31m,firm, grey to yellowish grey coarse sandy clay
32.5m,stiff to very stiff, grayish yellow residual soil
36.5m,extremely weak, grayish yellow pinkish red and grayish white, completely decomposed granite
38.5m,extremely weak, grayish yellow pinkish red and grayish white, highly decomposed granite
42.7m,medium strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text6 = """
0m,loose, grey to yellowish fine sand
7.5m,very soft, dark grey clay, occasionally with shell fragments and organic materials
16m,loose, grey coarse sand with around 15% very soft clay
18.3m,soft to stiff, grey clay locally laminated with sand
20.6m,very soft, dark grey clay, occasionally with organic materials
29m,firm to stiff, grey clay
35m,weak, grayish yellow pinkish red and grayish white, completely decomposed granite
37m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
40.3m,strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
42.14m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
43.54m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text7 = """
0m,loose, occasionally coarse dense, brown medium backfill sand
2.8m,very soft, dark grey clay, occasionally with organic materials and shell fragments
15.6m,firm, grey clay, locally laminated with sand
22.5m,loose, grey coarse sand
26.5m,soft, grey clay locally laminated with sand
29.8m,firm to stiff, grey clay with sand
32m,stiff to very stiff, grayish yellow residual soil
36m,weak, grayish yellow pinkish red and grayish white, completely decomposed granite
42m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
56.55m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text8 = """
0m,loose, grey to yellowish fine sand
7.6m,very soft, dark grey clay, with occasional humus
8.5m,loose, grey coarse sand
12.9m,weak, grayish yellow pinkish red and grayish white, completely decomposed granite
18m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
31.98m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text9 = """
0m,loose, grey to yellowish fine sand
5.8m,very soft, dark grey clay, with occasional humus
8.6m,loose, grey coarse sand with very soft clay
12.5m,very soft, dark grey clay, with occasional humus
16.5m,loose, grey coarse sand occasionally with clay
20.5m,stiff to very stiff, grayish yellow residual soil
25m,weak, grayish yellow pinkish red and grayish white, completely decomposed granite
29m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
35.05m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text10 = """
0m,medium dense, greyish yellow, coarse sand, locally with some gravel
7.2m,loose to medium dense, greyish yellow, fine sand
9.8m,very soft, grey, mud, with shell fragments and coarse sand
22.1m,firm, greyish yellow, greyish white and pinkish red, clay, mottled with coarse sand
23.9m,soft, grey, mud, with shell fragments and humic matter
27.6m,firm, grey, clay, with shell fragments and medium sand
31.7m,stiff, greyish white and greyish yellow, clay, mottled with fine sand
36.9m,firm to stiff, grey, clay, mottled with humic matter
41.5m,medium dense, lightly grey, coarse sand, mottled with clay and medium sand
43.2m,greyish yellow and greyish white, residual soil, the core recovered as medium dense silty, slakes readily in water
"""
query_text11 = """
0m,greyish white and greyish yellow, cobbles & boulders
18.4m,loose, grey, black, medium sand, mottled with some mud and shell fragments
18.9m,firm, greyish yellow, clay, locally with some coarse sand
21.9m,medium dense, greyish yellow and grey, coarse sand, with some clay
22.9m,greyish yellow and greyish white, residual soil, the core recovered as medium dense silty, slakes readily in water
26.8m,very weak, greyish yellow and greyish white, brownish red, completely decomposed, coarse-grained granite
29m,weak, greyish yellow and greyish white, brownish red, highly decomposed, coarse-grained granite
33.35m,moderately strong, greyish yellow and pinkish red spotted with single black biotite crystals and occasional clusters of small biotite flakes, inequigranular, moderately decomposed, coarse-grained granite
"""
query_text12 = """
0m,loose, greyish yellow fine sand
4m,very soft, dark grey clay, occasionally with organic materials
10m,loose, grey coarse sand with shell fragments
17.5m,firm, grey clay locally laminated with sand
22.2m,soft to firm, greyish yellow clay
30.3m,strong to very strong, grayish yellow, pinkish red and grayish white, moderately decomposed granite
31.35m,stiff to very stiff, grayish yellow residual soil
35m,weak, grayish yellow, pinkish red and grayish white, completely decomposed granite
41.9m,strong to very strong, grayish yellow, pinkish red and grayish white, moderately decomposed granite
"""
query_text13 = """
0m,loose, yellowish brown, sandy gravel silty/clay with many subangular to subrounded cobbles and occasional rounded boulders, with many domestic refuse, pieces of concrete, masonry, brick
3m,loose, grayish yellow, gravelly sand with some shell fragments
6.2m,loose to medium dense, grayish yellow, fine sand
10.5m,loose to medium dense, grayish yellow, gravelly sand
15.5m,medium dense to dense, gray, locally with grayish yellow, coarse sand
21m,dense, gray, medium sand
23m,weak, grayish yellow, brownish red spotted with grayish white, highly decomposed, coarse-grained granite
23.8m,strong, grayish white spotted with single black biotite crystals and occasional clusters of small biotite flakes, inequigranular, moderately decomposed, coarse-grained granite
25.05m,medium dense, grayish yellow, silty, slakes readily in water, residual soil
33m,extremely weak, grayish yellow, brownish red, spotted with grayish white, completely decomposed, coarse-grained granite
36.8m,weak, grayish yellow, brownish red spotted with grayish white, highly decomposed, coarse-grained granite
65.2m,strong, grayish white spotted with single black biotite crystals and occasional clusters of small biotite flakes, inequigranular, moderately decomposed, coarse-grained granite
"""
query_text14 = """
0m,loose, greyish yellow, coarse sand with little fine gravel
5.5m,soft, grey, clay with little shell fragments and fine sand
7m,loose, grey Fine sand with some shell fragment and little clay
9.1m,soft to firm, grey, clay with little shell fragments and fine sand
17.3m,stiff, greyish yellow and brownish red spotted greyish white, silty sand, with a little fine sand
19.1m,medium dense, greyish yellow, clayey sand
23.1m,stiff, greyish white, greyish yellow and brownish red clay
27m,stiff to very stiff, greyish yellow and brownish red spotted grayish white, clay
32.5m,dense, greyish yellow, clayey medium to coarse sand
"""
query_text15 = """
0m,loose, greyish yellow and brownish red, very silty fine to coarse sand with occasional gravel
4.9m,very soft, grey and brownish grey, silty clay with occasional fine sand and shell fragments
8m,very loose to lose, grey, clayey silty coarse sand with some shell fragments
10.5m,medium dense, orangish brown, silty gravelly coarse sand
13.5m,firm, reddish brown, very sandy silty clay
14.5m,medium dense, greyish yellow spotted brownish red, gravelly coarse sand, with occasional clay
16.8m,firm to stiff, grey and light grey, greyish yellow, silty clay with occasional fine to medium sand
19m,medium dense to dense, orangish brown, gravelly coarse sand
23m,extremely weak, reddish brown and greyish white, completely decomposed coarse grained granite
34m,extremely weak, reddish brown, brownish yellow and greyish white, brownish red, completely decomposed coarse-grained granite
40.6m,weak, brownish yellow and greyish white, highly decomposed coarse-grained granite
41.6m,moderately strong to strong, brownish white, greyish brown spotted pinkish red, moderately to slightly decomposed coarse-grained granite
"""
query_text16 = """
0m,cobbles with concrete and construction waste
9.5m,loose, grey to yellowish grey fine sand
11.5m,soft to firm, dark grey clay, with organic materials and shell fragments
15m,loose, grey to yellowish grey medium sand
17m,soft, dark grey clay, occasionally with organic materials and fine sand
17.7m,stiff to very stiff, grayish yellow residual soil
20.66m,moderately strong to strong, greyish yellow, yellowish pink, moderately decomposed medium to coarse grained granite
"""
query_text17 = """
0m,loose to medium dense, grayish yellow, medium to coarse sand, occasionally with gravels and cobbles
4.7m,firm, gray, coarse sand with some mud
7m,very soft, gray, mud, with a little fine sand
12.1m,very loose, gray, medium sand, with some shell fragments and clay
14.2m,medium dense, grayish yellow and brownish red, silty 
17m,strong, greyish white and pink mottled greyish green, moderately decomposed medium to coarse grained granite
17.45m,extremely weak, greyish yellow and grayish white mottled pink, completely decomposed granite
21.9m,strong, greyish white and pink mottled greyish green, moderately decomposed medium to coarse grained granite
"""
query_text18 = """
0m,loose, grey to yellowish fine sand
7.5m,very soft, dark grey clay, occasionally with shell fragments and organic materials
16.5m,loose, grey coarse sand with around 15% very soft clay
18.7m,soft to stiff, grey clay locally laminated with sand
21m,very soft, dark grey clay, occasionally with organic materials
29.2m,firm to stiff, grey clay
35m,weak, grayish yellow pinkish red and grayish white, completely decomposed granite
40.2m,strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text19 = """
0m,loose, grey to yellowish fine sand
4.5m,very soft, dark grey clay, occasionally with organic materials
13m,loose, grey coarse sand
17.5m,firm, dark grey clay, occasionally with organic materials
21m,medium dense, grey to yellowish coarse sand
23.5m,soft, dark grey clay, occasionally with organic materials
31m,firm, grey to yellowish clay locally laminated with sand
33.5m,weak, grayish yellow pinkish red and grayish white, completely decomposed granite
38.7m,strong grayish yellow pinkish red and grayish white, moderately decomposed granite occasional with highly decomposed granite
40.2m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
42.62m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite
"""
query_text20 = """
0m,loose, greyish yellow fine sand
7.8m,very soft, dark grey clay, occasionally with humus
9.8m,loose, grey coarse sand
17.3m,firm, grey clay locally laminated with sand
20m,firm, greyish yellow clay
22.8m,soft, dark grey clay, occasionally with humus
28m,stiff to very stiff, grayish yellow sandy clay
31m,weak, grayish yellow pinkish red and grayish white, completely decomposed granite
34.5m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
36.86m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite with highly decomposed
37.86m,weak, grayish yellow pinkish red and grayish white, highly decomposed granite
40.5m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite 
"""
# 使用正则表达式分割文本块
blocks = [query_text1.replace('\n', ' '),query_text2.replace('\n', ' ')]
blocks

[' 0m,loose, grey to yellowish fine sand 7.4m,very soft, dark grey clay, with occasional humus 18.7m,loose, grey coarse sand 19.5m,firm, grey to yellowish clay locally laminated with sand 25m,soft, dark grey clay, with occasional humus 25.7m,firm, grey to yellowish clay locally laminated with sand 28.9m,soft to firm, grey clay 32.4m,medium dense, grey to yellowish coarse sand 34m,firm, grey clay 36.5m,weak, grayish yellow, pinkish red and grayish white, completely decomposed granite 37.6m,strong, grayish yellow, pinkish red and grayish white, moderately decomposed granite 38.2m,weak, grayish yellow, pinkish red and grayish white, completely decomposed granite 42m,weak, grayish yellow, pinkish red and grayish white, highly decomposed granite 48.32m,strong to very strong, grayish yellow pinkish red and grayish white, moderately decomposed granite ',
 ' 0m,loose, grey to yellowish fine sand 9.5m,loose, grey coarse sand with very soft clay 13.5m,very soft, dark grey clay, occasionally with

In [None]:
# 定义六种 layer 值
layers = ['FIL', 'MD', 'ALL', 'CDG', 'HDG', 'GRA']

# 创建六个空的 DataFrame
df_dict = {layer: pd.DataFrame() for layer in layers}

# 将每个 layer 的行分别保存到对应的 DataFrame 中
for layer in layers:
    df_dict[layer] = df[df['Layer'] == layer]

def create_rag_text(df, query_text, sample_ratio=0.05):
    if df.empty:
        return []
    
    sample_size = max(int(len(df) * sample_ratio), 1)
    sample_size = min(sample_size, len(df))
    
    query_embedding = get_embedding(query_text)
    query_embedding = np.array([query_embedding], dtype='float32')
    
    distances, indices = index.search(query_embedding, sample_size)
    sampled_indices = indices[0]
    
    valid_indices = [idx for idx in sampled_indices if idx < len(df)]
    if not valid_indices:
        return []
    
    sampled_df = df.iloc[valid_indices]
    return sampled_df['DDL'].tolist()


RAG_T=[]
for block in blocks:
# block=blocks[0]
    all_samples = []
    for layer in layers:
        samples = create_rag_text(df_dict[layer], query_text=block)
        all_samples.extend(samples)

    # 去重并整合成 RAG_text
    rag_text = '\n'.join(list(dict.fromkeys(all_samples)))
    RAG_T.append(rag_text)
    print(rag_text)
    print('===============')

0 m, medium dense, grayish yellow, fine sand. , FIL
3.1 m, boulders with sand , FIL
3 m, loose to medium dense, greyish yellow, fine sand, FIL
6 m, soft, grey to dark grey, clay, MD
2.5 m, soft, dark grey, mud mixed some with sand and shell, MD
21.8 m, medium dense, greyish yellow, silty coarse sand with fine gravel , ALL
26.5 m, dense, brownish yellow, gravelly coarse sand, with occasional clay, ALL
25 m, firm to stiff, greyish yellow and brownish yellow, very sandy silty clay, sand is medium to coarse, ALL
17 m, dense, greyish yellow and light greyish white, silty gravelly coarse sand, locally with sandy clay, ALL
17.8 m, medium dense, greyish yellow and greyish white, silty coarse sand with occasional gravel, ALL
40.7 m, stiff, grayish white, clay, ALL
34.6 m, loose to medium dense, grayish yellow and grayish white, silty. , CDG
36.5 m, extremely weak, brownish red and greyish yellow, completely decomposed granite, CDG
34 m, dense to very dense, brownish yellow mottled red and cream