In [40]:
import pandas as pd

In [41]:
df = pd.read_csv("rfq.csv")
df.head()

Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,weight_min,weight_max,inner_diameter_min,inner_diameter_max,outer_diameter_min,outer_diameter_max,yield_strength_min,yield_strength_max,tensile_strength_min,tensile_strength_max
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,15000.0,25000.0,610.0,610.0,,,,,760.0,810.0
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,,,,,,,,,,
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,,,,,,,,,,
3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,...,53800.0,53800.0,,,60.3,,,,,
4,11cffc57-44be-4d79-bfd5-97482be566d3,S235,,,,,,Round Tubes,1.5,1.5,...,14500.0,14500.0,,,48.3,,,,,


In [42]:
df1 = pd.read_csv("reference_properties.tsv", sep="\t")
df1.head()

Unnamed: 0,Grade/Material,UNS_No,Steel_No,Standards,Carbon (C),Manganese (Mn),Silicon (Si),Sulfur (S),Phosphorus (P),Chromium (Cr),...,Reduction of area (Z%),"Hardness (HB, HV, HRC)",Impact toughness (Charpy V-notch),Fatigue limit,Creep resistance,Source_Pages,Application,Category,Nb + V + Ti (Others),Coating
0,S235JR,,,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
1,S275JR,,,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
2,S355JR,,,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
3,S420M,,,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,≤0.30,...,,,27J at -20°C,,,Standard Specifications,Thermomechanically rolled steels,High Strength Steel,,
4,S460M,,,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,≤0.30,...,,,27J at -20°C,,,Standard Specifications,Thermomechanically rolled steels,High Strength Steel,,


# Standardize the grade names by removing suffixes and normalizing the case

In [43]:
import re

def normalize_grade(grade):
    # Convert to uppercase
    grade = str(grade).upper()  # Ensure grade is a string

    # Remove common suffixes (e.g., JR, M, MC, GD)
    grade = re.sub(r'(JR|M|MC|GD)$', '', grade)
    
    return grade

# Normalize the grades in both reference and RFQ datasets
df1['normalized_grade'] = df1['Grade/Material'].apply(normalize_grade)
df['normalized_grade'] = df['grade'].apply(normalize_grade)

# Preview the normalized grades
df1[['Grade/Material', 'normalized_grade']].head(), df[['grade', 'normalized_grade']].head()

(  Grade/Material normalized_grade
 0         S235JR             S235
 1         S275JR             S275
 2         S355JR             S355
 3          S420M             S420
 4          S460M             S460,
     grade normalized_grade
 0  S700MC             S700
 1  S250GD             S250
 2   DX51D            DX51D
 3    S235             S235
 4    S235             S235)

# Function to parse range strings and return min, max, and optionally mid values

In [44]:
import re
import numpy as np


def parse_range(range_str):
    if pd.isna(range_str):
        return np.nan, np.nan, np.nan
    range_str = str(range_str).replace(' ', '')  # Remove spaces
    # Handle ≤ and ≥ separately
    if '≤' in range_str:
        max_value = float(re.sub(r'≤', '', range_str).strip())
        return np.nan, max_value, np.nan
    elif '≥' in range_str:
        min_value = float(re.sub(r'≥', '', range_str).strip())
        return min_value, np.nan, np.nan
    elif '–' in range_str or '-' in range_str:
        # Support both en dash and hyphen
        parts = re.split(r'[–-]', range_str)
        if len(parts) == 2:
            min_value, max_value = map(float, parts)
            mid_value = (min_value + max_value) / 2
            return min_value, max_value, mid_value
        else:
            return np.nan, np.nan, np.nan
    else:
        try:
            value = float(range_str.strip())
            return value, value, value
        except ValueError:
            return np.nan, np.nan, np.nan

# Example application on a column in the reference_properties_df to parse values
df1['Carbon_min'], df1['Carbon_max'], df1['Carbon_mid'] = zip(*df1['Carbon (C)'].apply(parse_range))

# Display the results to ensure proper parsing
df1[['Carbon (C)', 'Carbon_min', 'Carbon_max', 'Carbon_mid']].head()

Unnamed: 0,Carbon (C),Carbon_min,Carbon_max,Carbon_mid
0,≤0.17,,0.17,
1,≤0.21,,0.21,
2,≤0.24,,0.24,
3,≤0.20,,0.2,
4,≤0.20,,0.2,


# Normalize, Parse Range Strings, Join RFQs with Reference and Handling Missing values

In [45]:
# Join RFQs with reference properties on normalized grade
joined_df = df.merge(df1, how='left', left_on='normalized_grade', right_on='normalized_grade', suffixes=('_rfq', '_ref'))

# Flag RFQs whose grade was not found in reference
joined_df['grade_missing_in_reference'] = joined_df['Grade/Material'].isna()

# filling missing Carbon_mid with reference mean
carbon_mean = df1['Carbon_mid'].mean()
joined_df['Carbon_mid'] = joined_df['Carbon_mid'].fillna(carbon_mean)

# Option 3: Flag missing values (already flagged above)

# Preview result
joined_df.head()

Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,Creep resistance,Source_Pages,Application,Category,Nb + V + Ti (Others),Coating,Carbon_min,Carbon_max,Carbon_mid,grade_missing_in_reference
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,,Standard Specifications,"Cold forming, automotive, high strength applic...",Microalloyed Steel,≤0.22,,,0.12,0.379186,False
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,,Standard Specifications,Structural galvanized steel,Galvanized Steel,,Hot-dip galvanized,,0.25,0.379186,False
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,,Standard Specifications,Galvanized steel for forming,Galvanized Steel,,Hot-dip galvanized,,0.12,0.379186,False
3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,...,,Standard Specifications,General structural steels,Structural Steel,,,,0.17,0.379186,False
4,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,...,,Standard Specifications,General structural steel,Structural Steel,,,,0.17,0.379186,False


# Represent each dimension as an interval (min, max)

In [46]:

# For singletons, set min = max = value

def get_interval(row, col_min, col_max):
    min_val = row[col_min]
    max_val = row[col_max]
    if pd.isna(min_val) and not pd.isna(max_val):
        min_val = max_val
    if pd.isna(max_val) and not pd.isna(min_val):
        max_val = min_val
    return min_val, max_val

# Example for Carbon
joined_df['Carbon_interval'] = joined_df.apply(lambda row: get_interval(row, 'Carbon_min', 'Carbon_max'), axis=1)

# Overlap metric: Intersection over Union (IoU)
def interval_iou(interval1, interval2):
    a_min, a_max = interval1
    b_min, b_max = interval2
    if pd.isna(a_min) or pd.isna(a_max) or pd.isna(b_min) or pd.isna(b_max):
        return np.nan
    inter_min = max(a_min, b_min)
    inter_max = min(a_max, b_max)
    intersection = max(0, inter_max - inter_min)
    union = max(a_max, b_max) - min(a_min, b_min)
    if union == 0:
        return 0
    return intersection / union

# Example usage:
# joined_df['Carbon_iou'] = joined_df.apply(lambda row: interval_iou(row['Carbon_interval'], (ref_min, ref_max)), axis=1)

# Define similarity as exact match (1/0) for categorical columns

In [47]:

def exact_match_similarity(row, col1, col2):
    return int(row[col1] == row[col2])

# Use correct column names from your DataFrame
joined_df['coating_similarity'] = joined_df.apply(lambda row: exact_match_similarity(row, 'coating', 'Coating'), axis=1)
joined_df['finish_similarity'] = joined_df.apply(lambda row: exact_match_similarity(row, 'finish', 'Finish'), axis=1) if 'Finish' in joined_df.columns else np.nan
joined_df['form_similarity'] = joined_df.apply(lambda row: exact_match_similarity(row, 'form', 'Form'), axis=1) if 'Form' in joined_df.columns else np.nan
joined_df['surface_type_similarity'] = joined_df.apply(lambda row: exact_match_similarity(row, 'surface_type', 'Surface_type'), axis=1) if 'Surface_type' in joined_df.columns else np.nan

In [48]:
# Use numeric midpoints of ranges for similarity and modeling
# Example for Carbon (already calculated as 'Carbon_mid')

# You can ignore very sparse features (columns with mostly NaN values)
sparse_threshold = 0.9  # Ignore columns with >90% missing values
sparse_features = [col for col in joined_df.columns if joined_df[col].isna().mean() > sparse_threshold]
print("Ignoring sparse features:", sparse_features)

# Select numeric midpoint features (example: Carbon_mid, Manganese_mid, etc.)
midpoint_features = [col for col in joined_df.columns if col.endswith('_mid') and col not in sparse_features]
print("Using midpoint features:", midpoint_features)

# Example: create a DataFrame with only midpoint features for further analysis
midpoints_df = joined_df[midpoint_features].copy()

# You can now use midpoints_df for similarity, clustering, or modeling
midpoints_df.head()

Ignoring sparse features: ['grade_suffix', 'height_max', 'outer_diameter_min', 'outer_diameter_max', 'yield_strength_min', 'yield_strength_max', 'tensile_strength_min', 'tensile_strength_max', 'UNS_No', 'Steel_No', 'Chromium (Cr)', 'Nickel (Ni)', 'Molybdenum (Mo)', 'Tungsten (W)', 'Cobalt (Co)', 'Copper (Cu)', 'Boron (B)', 'Reduction of area (Z%)', 'Hardness (HB, HV, HRC)', 'Fatigue limit', 'Creep resistance', 'Nb + V + Ti (Others)', 'Carbon_min', 'finish_similarity', 'form_similarity', 'surface_type_similarity']
Using midpoint features: ['Carbon_mid']


Unnamed: 0,Carbon_mid
0,0.379186
1,0.379186
2,0.379186
3,0.379186
4,0.379186


# Compute weighted average similarity between two RFQs

In [49]:


# Choose two RFQs by their index or id
rfq1 = joined_df.iloc[0]  # First RFQ
rfq2 = joined_df.iloc[1]  # Second RFQ

# Example weights (adjust as needed)
w_dim = 0.5      # weight for dimension overlap (e.g., cosine similarity)
w_cat = 0.3      # weight for categorical matches (e.g., Jaccard similarity)
w_grade = 0.2    # weight for grade similarity (exact match)

# Compute individual similarities
cosine_sim = weighted_cosine_similarity(rfq1, rfq2, {col: 1 for col in midpoint_features})
cat_cols = ['coating_similarity', 'finish_similarity', 'form_similarity', 'surface_type_similarity']
jaccard_sim = jaccard_similarity(rfq1, rfq2, cat_cols)
grade_sim = int(rfq1['normalized_grade'] == rfq2['normalized_grade'])

# Weighted average similarity
weighted_similarity = w_dim * cosine_sim + w_cat * jaccard_sim + w_grade * grade_sim

print(f"Weighted similarity between RFQ {rfq1['id']} and RFQ {rfq2['id']}: {weighted_similarity:.4f}")

Weighted similarity between RFQ 8aff426d-b8c0-43aa-ad26-835ef4de6129 and RFQ 37e624be-b125-464f-85b6-1838530193ef: 0.5000


# Find top 3 most similar RFQs per line (excluding self and exact categorical matches)

In [50]:


def aggregate_similarity(row1, row2):
    w_dim = 0.5
    w_cat = 0.3
    w_grade = 0.2
    cosine_sim = weighted_cosine_similarity(row1, row2, {col: 1 for col in midpoint_features})
    cat_cols = ['coating_similarity', 'finish_similarity', 'form_similarity', 'surface_type_similarity']
    jaccard_sim = jaccard_similarity(row1, row2, cat_cols)
    grade_sim = int(row1['normalized_grade'] == row2['normalized_grade'])
    return w_dim * cosine_sim + w_cat * jaccard_sim + w_grade * grade_sim

def top3_similar_rfqs(df):
    results = []
    for idx, row in df.iterrows():
        similarities = []
        for idx2, row2 in df.iterrows():
            if idx == idx2 or row['id'] == row2['id']:
                continue
            # Exclude exact match on all categorical features
            if (
                row['coating'] == row2['coating'] and
                row['finish'] == row2['finish'] and
                row['form'] == row2['form'] and
                row['surface_type'] == row2['surface_type']
            ):
                continue
            sim = aggregate_similarity(row, row2)
            similarities.append((row2['id'], sim))
        top3 = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
        results.append({
            'rfq_id': row['id'],
            'top1_id': top3[0][0] if len(top3) > 0 else None,
            'top1_score': top3[0][1] if len(top3) > 0 else None,
            'top2_id': top3[1][0] if len(top3) > 1 else None,
            'top2_score': top3[1][1] if len(top3) > 1 else None,
            'top3_id': top3[2][0] if len(top3) > 2 else None,
            'top3_score': top3[2][1] if len(top3) > 2 else None,
        })
    return pd.DataFrame(results)

top3_df = top3_similar_rfqs(joined_df)
print(top3_df.head())

                                 rfq_id                               top1_id  \
0  8aff426d-b8c0-43aa-ad26-835ef4de6129  2624655e-ea07-468a-8da8-2e39c9d1e7f0   
1  37e624be-b125-464f-85b6-1838530193ef  25c46875-dbeb-4ef4-ad33-7aff8384fb2a   
2  b8257184-6307-46ab-b06e-d979336d1263  973d80a1-f1b8-461b-bb25-7d8852968b1c   
3  63140d1f-dda8-40fe-8931-bcaba65d5772  11cffc57-44be-4d79-bfd5-97482be566d3   
4  63140d1f-dda8-40fe-8931-bcaba65d5772  11cffc57-44be-4d79-bfd5-97482be566d3   

   top1_score                               top2_id  top2_score  \
0         0.7  b2bc544a-219d-4899-9fa4-191e3f475649         0.7   
1         0.7  99bbb98d-d06d-4633-a68c-9be3bb9f6011         0.7   
2         0.7  8fe87807-dae8-4341-94fd-2ab91f176464         0.7   
3         0.7  11cffc57-44be-4d79-bfd5-97482be566d3         0.7   
4         0.7  11cffc57-44be-4d79-bfd5-97482be566d3         0.7   

                                top3_id  top3_score  
0  267e5bc4-897a-4fc1-97d2-26a45829ba19         0.7  
1 

# Output top-3 most similar RFQs per line as CSV with columns: rfq_id, match_id, similarity_score

In [51]:


def top3_similar_rfqs_flat(df):
    rows = []
    for idx, row in df.iterrows():
        similarities = []
        for idx2, row2 in df.iterrows():
            if idx == idx2 or row['id'] == row2['id']:
                continue
            # Exclude exact match on all categorical features
            if (
                row['coating'] == row2['coating'] and
                row['finish'] == row2['finish'] and
                row['form'] == row2['form'] and
                row['surface_type'] == row2['surface_type']
            ):
                continue
            sim = aggregate_similarity(row, row2)
            similarities.append((row2['id'], sim))
        top3 = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
        for match_id, score in top3:
            rows.append({'rfq_id': row['id'], 'match_id': match_id, 'similarity_score': score})
    return pd.DataFrame(rows)

top3_flat_df = top3_similar_rfqs_flat(joined_df)
top3_flat_df.to_csv("rfq_top3_matches.csv", index=False)
print("Top-3 similar RFQs per line saved to rfq_top3_matches.csv")

Top-3 similar RFQs per line saved to rfq_top3_matches.csv


# Ablation Analysis

In [52]:
# Change weights and observe changes in top-3 matches

def aggregate_similarity(row1, row2):
    w_dim = 0.7
    w_cat = 0.2
    w_grade = 0.1
    cosine_sim = weighted_cosine_similarity(row1, row2, {col: 1 for col in midpoint_features})
    cat_cols = ['coating_similarity', 'finish_similarity', 'form_similarity', 'surface_type_similarity']
    jaccard_sim = jaccard_similarity(row1, row2, cat_cols)
    grade_sim = int(row1['normalized_grade'] == row2['normalized_grade'])
    return w_dim * cosine_sim + w_cat * jaccard_sim + w_grade * grade_sim

def top3_similar_rfqs(df):
    results = []
    for idx, row in df.iterrows():
        similarities = []
        for idx2, row2 in df.iterrows():
            if idx == idx2 or row['id'] == row2['id']:
                continue
            # Exclude exact match on all categorical features
            if (
                row['coating'] == row2['coating'] and
                row['finish'] == row2['finish'] and
                row['form'] == row2['form'] and
                row['surface_type'] == row2['surface_type']
            ):
                continue
            sim = aggregate_similarity(row, row2)
            similarities.append((row2['id'], sim))
        top3 = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
        results.append({
            'rfq_id': row['id'],
            'top1_id': top3[0][0] if len(top3) > 0 else None,
            'top1_score': top3[0][1] if len(top3) > 0 else None,
            'top2_id': top3[1][0] if len(top3) > 1 else None,
            'top2_score': top3[1][1] if len(top3) > 1 else None,
            'top3_id': top3[2][0] if len(top3) > 2 else None,
            'top3_score': top3[2][1] if len(top3) > 2 else None,
        })
    return pd.DataFrame(results)

top3_df = top3_similar_rfqs(joined_df)
print(top3_df.head())

                                 rfq_id                               top1_id  \
0  8aff426d-b8c0-43aa-ad26-835ef4de6129  2624655e-ea07-468a-8da8-2e39c9d1e7f0   
1  37e624be-b125-464f-85b6-1838530193ef  25c46875-dbeb-4ef4-ad33-7aff8384fb2a   
2  b8257184-6307-46ab-b06e-d979336d1263  973d80a1-f1b8-461b-bb25-7d8852968b1c   
3  63140d1f-dda8-40fe-8931-bcaba65d5772  11cffc57-44be-4d79-bfd5-97482be566d3   
4  63140d1f-dda8-40fe-8931-bcaba65d5772  11cffc57-44be-4d79-bfd5-97482be566d3   

   top1_score                               top2_id  top2_score  \
0         0.8  b2bc544a-219d-4899-9fa4-191e3f475649         0.8   
1         0.8  99bbb98d-d06d-4633-a68c-9be3bb9f6011         0.8   
2         0.8  8fe87807-dae8-4341-94fd-2ab91f176464         0.8   
3         0.8  11cffc57-44be-4d79-bfd5-97482be566d3         0.8   
4         0.8  11cffc57-44be-4d79-bfd5-97482be566d3         0.8   

                                top3_id  top3_score  
0  267e5bc4-897a-4fc1-97d2-26a45829ba19         0.8  
1 

# Trying IoU similarity to compare score

In [53]:
# Use IoU (Intersection over Union) for numeric similarity instead of cosine similarity

def aggregate_similarity_iou(row1, row2):
    w_dim = 0.5
    w_cat = 0.3
    w_grade = 0.2
    # Calculate IoU for each numeric interval feature and average
    iou_scores = []
    for feature in midpoint_features:
        min_col = feature.replace('_mid', '_min')
        max_col = feature.replace('_mid', '_max')
        interval1 = (row1[min_col], row1[max_col])
        interval2 = (row2[min_col], row2[max_col])
        iou = interval_iou(interval1, interval2)
        if not pd.isna(iou):
            iou_scores.append(iou)
    iou_sim = np.mean(iou_scores) if iou_scores else 0
    # Categorical similarity (Jaccard)
    cat_cols = ['coating_similarity', 'finish_similarity', 'form_similarity', 'surface_type_similarity']
    jaccard_sim = jaccard_similarity(row1, row2, cat_cols)
    # Grade similarity (exact match)
    grade_sim = int(row1['normalized_grade'] == row2['normalized_grade'])
    return w_dim * iou_sim + w_cat * jaccard_sim + w_grade * grade_sim

def top3_similar_rfqs_iou(df):
    rows = []
    for idx, row in df.iterrows():
        similarities = []
        for idx2, row2 in df.iterrows():
            if idx == idx2 or row['id'] == row2['id']:
                continue
            # Exclude exact match on all categorical features
            if (
                row['coating'] == row2['coating'] and
                row['finish'] == row2['finish'] and
                row['form'] == row2['form'] and
                row['surface_type'] == row2['surface_type']
            ):
                continue
            sim = aggregate_similarity_iou(row, row2)
            similarities.append((row2['id'], sim))
        top3 = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
        for match_id, score in top3:
            rows.append({'rfq_id': row['id'], 'match_id': match_id, 'similarity_score': score})
    return pd.DataFrame(rows)

top3_flat_df_iou = top3_similar_rfqs_iou(joined_df)
print(top3_flat_df_iou.head())

                                 rfq_id                              match_id  \
0  8aff426d-b8c0-43aa-ad26-835ef4de6129  2624655e-ea07-468a-8da8-2e39c9d1e7f0   
1  8aff426d-b8c0-43aa-ad26-835ef4de6129  b2bc544a-219d-4899-9fa4-191e3f475649   
2  8aff426d-b8c0-43aa-ad26-835ef4de6129  267e5bc4-897a-4fc1-97d2-26a45829ba19   
3  37e624be-b125-464f-85b6-1838530193ef  25c46875-dbeb-4ef4-ad33-7aff8384fb2a   
4  37e624be-b125-464f-85b6-1838530193ef  99bbb98d-d06d-4633-a68c-9be3bb9f6011   

   similarity_score  
0               0.2  
1               0.2  
2               0.2  
3               0.2  
4               0.2  


# Clustering

In [54]:
# Group RFQs into families based on top-3 IoU similarity matches

# Step 1: Build a mapping from each RFQ to its top-3 matches
from collections import defaultdict

family_map = defaultdict(set)
for _, row in top3_flat_df_iou.iterrows():
    family_map[row['rfq_id']].add(row['rfq_id'])
    family_map[row['rfq_id']].add(row['match_id'])

# Step 2: Merge overlapping sets to form families
def merge_families(family_map):
    families = []
    seen = set()
    for rfq, members in family_map.items():
        if rfq in seen:
            continue
        family = set(members)
        # Expand family by adding all matches of members
        to_check = set(members)
        while to_check:
            member = to_check.pop()
            if member in family_map:
                new_members = family_map[member] - family
                family.update(new_members)
                to_check.update(new_members)
        families.append(family)
        seen.update(family)
    return families

families = merge_families(family_map)

# Step 3: Print summary interpretation
print(f"Found {len(families)} RFQ families.")
for i, fam in enumerate(families[:5]):  # Show first 5 families
    print(f"Family {i+1}: {len(fam)} RFQs - IDs: {sorted(list(fam))}")

print("Interpretation: Each family groups RFQs that are highly similar (top-3 by IoU score). RFQs in the same family share similar chemical, categorical, and grade properties, indicating they could be processed or quoted similarly.")

Found 819 RFQ families.
Family 1: 4 RFQs - IDs: ['2624655e-ea07-468a-8da8-2e39c9d1e7f0', '267e5bc4-897a-4fc1-97d2-26a45829ba19', '8aff426d-b8c0-43aa-ad26-835ef4de6129', 'b2bc544a-219d-4899-9fa4-191e3f475649']
Family 2: 4 RFQs - IDs: ['25c46875-dbeb-4ef4-ad33-7aff8384fb2a', '2b9db833-2ec8-427e-90f9-3e78078c466e', '37e624be-b125-464f-85b6-1838530193ef', '99bbb98d-d06d-4633-a68c-9be3bb9f6011']
Family 3: 4 RFQs - IDs: ['8fe87807-dae8-4341-94fd-2ab91f176464', '973d80a1-f1b8-461b-bb25-7d8852968b1c', 'b8257184-6307-46ab-b06e-d979336d1263', 'c18a7f01-1cb6-4987-a2cc-93251d3719b9']
Family 4: 3 RFQs - IDs: ['11cffc57-44be-4d79-bfd5-97482be566d3', '63140d1f-dda8-40fe-8931-bcaba65d5772', '75fae2b7-8107-4ce2-a9d5-1189b4cd3b21']
Family 5: 5 RFQs - IDs: ['0777b4fd-292a-439b-848f-d772ee28c3b8', '8fe87807-dae8-4341-94fd-2ab91f176464', '973d80a1-f1b8-461b-bb25-7d8852968b1c', 'b8257184-6307-46ab-b06e-d979336d1263', 'c18a7f01-1cb6-4987-a2cc-93251d3719b9']
Interpretation: Each family groups RFQs that are hi