In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [2]:
rfq_path = "rfq.csv"
ref_path = "reference_properties.tsv"

rfq = pd.read_csv(rfq_path)
reference = pd.read_csv(ref_path, sep="\t")

rfq_info = {
    "shape": rfq.shape,
    "columns": rfq.columns.tolist(),
    "sample": rfq.head(5).to_dict(orient="records")
}

ref_info = {
    "shape": reference.shape,
    "columns": reference.columns.tolist(),
    "sample": reference.head(5).to_dict(orient="records")
}

rfq_info


{'shape': (1000, 25),
 'columns': ['id',
  'grade',
  'grade_suffix',
  'coating',
  'finish',
  'surface_type',
  'surface_protection',
  'form',
  'thickness_min',
  'thickness_max',
  'width_min',
  'width_max',
  'length_min',
  'height_min',
  'height_max',
  'weight_min',
  'weight_max',
  'inner_diameter_min',
  'inner_diameter_max',
  'outer_diameter_min',
  'outer_diameter_max',
  'yield_strength_min',
  'yield_strength_max',
  'tensile_strength_min',
  'tensile_strength_max'],
 'sample': [{'id': '8aff426d-b8c0-43aa-ad26-835ef4de6129',
   'grade': 'S700MC',
   'grade_suffix': nan,
   'coating': nan,
   'finish': 'Oiled',
   'surface_type': nan,
   'surface_protection': nan,
   'form': 'Coils',
   'thickness_min': 6.0,
   'thickness_max': 6.0,
   'width_min': 600.0,
   'width_max': 1520.0,
   'length_min': nan,
   'height_min': nan,
   'height_max': nan,
   'weight_min': 15000.0,
   'weight_max': 25000.0,
   'inner_diameter_min': 610.0,
   'inner_diameter_max': 610.0,
   'outer

In [3]:
ref_info

{'shape': (175, 34),
 'columns': ['Grade/Material',
  'UNS_No',
  'Steel_No',
  'Standards',
  'Carbon (C)',
  'Manganese (Mn)',
  'Silicon (Si)',
  'Sulfur (S)',
  'Phosphorus (P)',
  'Chromium (Cr)',
  'Nickel (Ni)',
  'Molybdenum (Mo)',
  'Vanadium (V)',
  'Tungsten (W)',
  'Cobalt (Co)',
  'Copper (Cu)',
  'Aluminum (Al)',
  'Titanium (Ti)',
  'Niobium (Nb)',
  'Boron (B)',
  'Nitrogen (N)',
  'Tensile strength (Rm)',
  'Yield strength (Re or Rp0.2)',
  'Elongation (A%)',
  'Reduction of area (Z%)',
  'Hardness (HB, HV, HRC)',
  'Impact toughness (Charpy V-notch)',
  'Fatigue limit',
  'Creep resistance',
  'Source_Pages',
  'Application',
  'Category',
  'Nb + V + Ti (Others)',
  'Coating'],
 'sample': [{'Grade/Material': 'S235JR',
   'UNS_No': nan,
   'Steel_No': nan,
   'Standards': 'EN 10025-2:2019',
   'Carbon (C)': '≤0.17',
   'Manganese (Mn)': '≤1.40',
   'Silicon (Si)': '≤0.40',
   'Sulfur (S)': '≤0.035',
   'Phosphorus (P)': '≤0.035',
   'Chromium (Cr)': nan,
   'Nickel (N

### Task B.1:

In [4]:
def normalize_grade(grade, suffix=None):
    """
    Normalize grade keys by:
    - Uppercasing
    - Stripping spaces
    - Appending suffix if present
    """
    if pd.isna(grade):
        return None
    grade_norm = str(grade).strip().upper()
    if suffix and not pd.isna(suffix):
        grade_norm += str(suffix).strip().upper()
    return grade_norm

rfq["grade_normalized"] = rfq.apply(lambda row: normalize_grade(row["grade"], row["grade_suffix"]), axis=1)
reference["grade_normalized"] = reference["Grade/Material"].astype(str).str.strip().str.upper()


def parse_range(value):
    """
    Parse string ranges/inequalities into numeric (min, max).
    Handles cases like:
    - "360-510 MPa" -> (360, 510)
    - "≥235 MPa"   -> (235, np.inf)
    - "≤0.17"      -> (-np.inf, 0.17)
    - "≤0.035%"    -> (-np.inf, 0.035)
    - "0.17"       -> (0.17, 0.17)
    """
    if pd.isna(value):
        return (np.nan, np.nan)
    
    text = str(value).replace(",", ".")  # unify decimals
    numbers = re.findall(r"[-+]?\d*\.?\d+", text)
    
    if "-" in text and len(numbers) == 2:  # range like "360-510"
        return (float(numbers[0]), float(numbers[1]))
    elif "≥" in text and numbers:
        return (float(numbers[0]), np.inf)
    elif "≤" in text and numbers:
        return (-np.inf, float(numbers[0]))
    elif numbers:
        return (float(numbers[0]), float(numbers[0]))
    else:
        return (np.nan, np.nan)


# Apply parsing to selected mechanical columns
for col in ["Tensile strength (Rm)", "Yield strength (Re or Rp0.2)", "Carbon (C)", "Manganese (Mn)", "Silicon (Si)", "Sulfur (S)", "Phosphorus (P)"]:
    if col in reference.columns:
        reference[[f"{col}_min", f"{col}_max"]] = reference[col].apply(lambda x: pd.Series(parse_range(x)))



rfq_ref_joined = rfq.merge(reference, on="grade_normalized", how="left")

# Flag unmatched
rfq_ref_joined["matched_reference"] = ~rfq_ref_joined["Grade/Material"].isna()

In [5]:
rfq_ref_joined.head(10)

Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,Carbon (C)_max,Manganese (Mn)_min,Manganese (Mn)_max,Silicon (Si)_min,Silicon (Si)_max,Sulfur (S)_min,Sulfur (S)_max,Phosphorus (P)_min,Phosphorus (P)_max,matched_reference
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,0.12,-inf,2.1,-inf,0.6,-inf,0.015,-inf,0.025,True
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,0.25,-inf,1.2,,,-inf,0.045,-inf,0.12,True
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,0.12,-inf,0.6,,,-inf,0.045,-inf,0.12,True
3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,...,0.17,-inf,1.4,-inf,0.4,-inf,0.035,-inf,0.035,True
4,11cffc57-44be-4d79-bfd5-97482be566d3,S235,,,,,,Round Tubes,1.5,1.5,...,0.17,-inf,1.4,-inf,0.4,-inf,0.035,-inf,0.035,True
5,75fae2b7-8107-4ce2-a9d5-1189b4cd3b21,S235,,,,,,Round Tubes,1.5,1.5,...,0.17,-inf,1.4,-inf,0.4,-inf,0.035,-inf,0.035,True
6,973d80a1-f1b8-461b-bb25-7d8852968b1c,DX51D,,Z075,Hot-dip Galvanized (+Z/+GI),,,Rectangular Tubes,2.0,2.0,...,0.12,-inf,0.6,,,-inf,0.045,-inf,0.12,True
7,8fe87807-dae8-4341-94fd-2ab91f176464,DX51D,,Z080,Hot-dip Galvanized (+Z/+GI),,,Rectangular Tubes,2.0,2.0,...,0.12,-inf,0.6,,,-inf,0.045,-inf,0.12,True
8,c18a7f01-1cb6-4987-a2cc-93251d3719b9,DX51D,,,,,,Round Tubes,2.5,2.5,...,0.12,-inf,0.6,,,-inf,0.045,-inf,0.12,True
9,0777b4fd-292a-439b-848f-d772ee28c3b8,DX51D,,,,,,Equal Angles,1.5,1.5,...,0.12,-inf,0.6,,,-inf,0.045,-inf,0.12,True


In [6]:
rfq_ref_joined_info = {
    "shape": rfq_ref_joined.shape,
    "columns": rfq_ref_joined.columns.tolist(),
    "sample": rfq_ref_joined.head(5).to_dict(orient="records")
}
rfq_ref_joined_info

{'shape': (1005, 75),
 'columns': ['id',
  'grade',
  'grade_suffix',
  'coating',
  'finish',
  'surface_type',
  'surface_protection',
  'form',
  'thickness_min',
  'thickness_max',
  'width_min',
  'width_max',
  'length_min',
  'height_min',
  'height_max',
  'weight_min',
  'weight_max',
  'inner_diameter_min',
  'inner_diameter_max',
  'outer_diameter_min',
  'outer_diameter_max',
  'yield_strength_min',
  'yield_strength_max',
  'tensile_strength_min',
  'tensile_strength_max',
  'grade_normalized',
  'Grade/Material',
  'UNS_No',
  'Steel_No',
  'Standards',
  'Carbon (C)',
  'Manganese (Mn)',
  'Silicon (Si)',
  'Sulfur (S)',
  'Phosphorus (P)',
  'Chromium (Cr)',
  'Nickel (Ni)',
  'Molybdenum (Mo)',
  'Vanadium (V)',
  'Tungsten (W)',
  'Cobalt (Co)',
  'Copper (Cu)',
  'Aluminum (Al)',
  'Titanium (Ti)',
  'Niobium (Nb)',
  'Boron (B)',
  'Nitrogen (N)',
  'Tensile strength (Rm)',
  'Yield strength (Re or Rp0.2)',
  'Elongation (A%)',
  'Reduction of area (Z%)',
  'Hardnes

### Task B.2:

In [7]:
#Interval IoU
def interval_iou(r_min, r_max, s_min, s_max):
    if any(pd.isna([r_min, r_max, s_min, s_max])):
        return np.nan
    inter = max(0, min(r_max, s_max) - max(r_min, s_min))
    union = (r_max - r_min) + (s_max - s_min) - inter
    if union <= 0:
        return 0.0
    return inter / union

dimension_pairs = [
    ("thickness_min", "thickness_max"),
    ("width_min", "width_max"),
    ("length_min", "length_min"),
    ("height_min", "height_max"),
    ("weight_min", "weight_max"),
    ("inner_diameter_min", "inner_diameter_max"),
    ("outer_diameter_min", "outer_diameter_max"),
    ("yield_strength_min", "yield_strength_max"),
    ("tensile_strength_min", "tensile_strength_max")
]

for dmin, dmax in dimension_pairs:
    rfq_ref_joined[f"{dmin.split('_')[0]}_interval_iou"] = rfq_ref_joined.apply(
        lambda row: interval_iou(row[dmin], row[dmax], row[dmin], row[dmax]), axis=1
    )

#Categorical matches
categorical_cols = ["coating", "finish", "form", "surface_type", "surface_protection"]
for col in categorical_cols:
    rfq_ref_joined[f"{col}_match"] = (rfq_ref_joined[f"{col}"].notna()).astype(int)

#Grade property midpoints
def midpoint(row):
    if pd.isna(row[0]) or pd.isna(row[1]):
        return np.nan
    if row[0] == -np.inf and row[1] != np.inf:
        return row[1]
    if row[1] == np.inf and row[0] != -np.inf:
        return row[0]
    if row[0] == -np.inf and row[1] == np.inf:
        return np.nan
    return (row[0] + row[1]) / 2

midpoint_features = []
for col in ["Tensile strength (Rm)", "Yield strength (Re or Rp0.2)", "Carbon (C)", "Manganese (Mn)", "Silicon (Si)", "Sulfur (S)", "Phosphorus (P)"]:
    if f"{col}_min" in rfq_ref_joined.columns and f"{col}_max" in rfq_ref_joined.columns:
        mid_col = col.replace(" ", "_").replace("(", "").replace(")", "").replace("%", "pct").replace("/", "_").replace(".", "").replace(",", "")
        rfq_ref_joined[f"{mid_col}_mid"] = rfq_ref_joined[[f"{col}_min", f"{col}_max"]].apply(midpoint, axis=1)
        midpoint_features.append(f"{mid_col}_mid")

sparsity = rfq_ref_joined[midpoint_features].isna().mean()
dense_features = sparsity[sparsity < 0.8].index.tolist()

engineered = rfq_ref_joined[
    ["id", "grade_normalized", "matched_reference"]
    + [f"{dmin.split('_')[0]}_interval_iou" for dmin, dmax in dimension_pairs]
    + [f"{col}_match" for col in categorical_cols]
    + dense_features
]

  if pd.isna(row[0]) or pd.isna(row[1]):
  if row[0] == -np.inf and row[1] != np.inf:
  if row[1] == np.inf and row[0] != -np.inf:
  if row[0] == -np.inf and row[1] == np.inf:
  return (row[0] + row[1]) / 2
  if pd.isna(row[0]) or pd.isna(row[1]):
  if row[0] == -np.inf and row[1] != np.inf:
  if row[1] == np.inf and row[0] != -np.inf:
  return row[0]
  return row[1]
  if row[0] == -np.inf and row[1] == np.inf:
  return (row[0] + row[1]) / 2
  if pd.isna(row[0]) or pd.isna(row[1]):
  if row[0] == -np.inf and row[1] != np.inf:
  return row[1]
  if row[1] == np.inf and row[0] != -np.inf:
  if row[0] == -np.inf and row[1] == np.inf:
  return (row[0] + row[1]) / 2
  if pd.isna(row[0]) or pd.isna(row[1]):
  if row[0] == -np.inf and row[1] != np.inf:
  return row[1]
  if row[1] == np.inf and row[0] != -np.inf:
  if row[0] == -np.inf and row[1] == np.inf:
  return (row[0] + row[1]) / 2
  if pd.isna(row[0]) or pd.isna(row[1]):
  if row[0] == -np.inf and row[1] != np.inf:
  return row[1]
  if r

In [8]:
engineered.columns

Index(['id', 'grade_normalized', 'matched_reference', 'thickness_interval_iou',
       'width_interval_iou', 'length_interval_iou', 'height_interval_iou',
       'weight_interval_iou', 'inner_interval_iou', 'outer_interval_iou',
       'yield_interval_iou', 'tensile_interval_iou', 'coating_match',
       'finish_match', 'form_match', 'surface_type_match',
       'surface_protection_match', 'Tensile_strength_Rm_mid',
       'Yield_strength_Re_or_Rp02_mid', 'Carbon_C_mid', 'Manganese_Mn_mid',
       'Silicon_Si_mid', 'Sulfur_S_mid', 'Phosphorus_P_mid'],
      dtype='object')

In [9]:
engineered = engineered.fillna(0)

### Task B.3 (along with additional ablation study)

In [10]:
# Select feature groups
dim_features   = [c for c in engineered.columns if c.endswith("_interval_iou")]
cat_features   = [c for c in engineered.columns if c.endswith("_match")]
grade_features = [c for c in engineered.columns if c.endswith("_mid")]

def compute_similarity(df, dim_w=0.4, cat_w=0.3, grade_w=0.3):
    """Return weighted similarity matrix for RFQs."""
    df_num = df.fillna(0)  # to avoid NaN issues

    dim_sim   = cosine_similarity(df_num[dim_features]) if dim_features else np.zeros((len(df), len(df)))
    cat_sim   = cosine_similarity(df_num[cat_features]) if cat_features else np.zeros((len(df), len(df)))
    grade_sim = cosine_similarity(df_num[grade_features]) if grade_features else np.zeros((len(df), len(df)))

    sim_matrix = dim_w * dim_sim + cat_w * cat_sim + grade_w * grade_sim
    return sim_matrix

def top3_matches(df, sim_matrix, id_col="id"):
    """Extract top-3 matches per RFQ given a similarity matrix."""
    ids = df[id_col].values
    results = []

    for i, rfq_id in enumerate(ids):
        sim_scores = [(j, sim_matrix[i, j]) for j in range(len(ids)) if j != i]
        top3 = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:3]
        for j, s in top3:
            results.append({"rfq_id": rfq_id, "match_id": ids[j], "similarity_score": s})

    return pd.DataFrame(results)

# Example: full similarity
sim_matrix = compute_similarity(engineered, dim_w=0.4, cat_w=0.3, grade_w=0.3)
top3_df = top3_matches(engineered, sim_matrix)
top3_df.to_csv('top3.csv')

In [11]:
top3_df.head()

Unnamed: 0,rfq_id,match_id,similarity_score
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,a462a4cb-bbaa-4417-b876-4b8606c6f8db,0.946167
1,8aff426d-b8c0-43aa-ad26-835ef4de6129,2624655e-ea07-468a-8da8-2e39c9d1e7f0,0.926599
2,8aff426d-b8c0-43aa-ad26-835ef4de6129,b30cef8b-4065-4272-89b5-57b1fd91c50d,0.926591
3,37e624be-b125-464f-85b6-1838530193ef,99bbb98d-d06d-4633-a68c-9be3bb9f6011,0.6
4,37e624be-b125-464f-85b6-1838530193ef,973d80a1-f1b8-461b-bb25-7d8852968b1c,0.59988


In [12]:
top3_df[top3_df['rfq_id']=="a462a4cb-bbaa-4417-b876-4b8606c6f8db"]

Unnamed: 0,rfq_id,match_id,similarity_score
66,a462a4cb-bbaa-4417-b876-4b8606c6f8db,3b00dedd-fcb8-4d71-9ffb-40b92b6cdfe7,0.946196
67,a462a4cb-bbaa-4417-b876-4b8606c6f8db,8aff426d-b8c0-43aa-ad26-835ef4de6129,0.946167
68,a462a4cb-bbaa-4417-b876-4b8606c6f8db,ca1ccf90-ef97-446d-b9ba-214844ffc371,0.944949


### From the outputs above, it is verified that the similarity score remains the same if an element from the 'match_id' column is given as input to the 'rfq_id' column. 
### The rfq_id 8aff426d-b8c0-43aa-ad26-835ef4de6129 has its top match a462a4cb-bbaa-4417-b876-4b8606c6f8db with a similarity score of 0.9461. The same is the case when a462a4cb-bbaa-4417-b876-4b8606c6f8db is given as the rfq_id.

In [13]:
# Only dimensions
sim_dim = compute_similarity(engineered, dim_w=1.0, cat_w=0.0, grade_w=0.0)
top3_dim = top3_matches(engineered, sim_dim)

# Only categorical
sim_cat = compute_similarity(engineered, dim_w=0.0, cat_w=1.0, grade_w=0.0)
top3_cat = top3_matches(engineered, sim_cat)

# Only grade
sim_grade = compute_similarity(engineered, dim_w=0.0, cat_w=0.0, grade_w=1.0)
top3_grade = top3_matches(engineered, sim_grade)

In [None]:
def compare_top3(baseline, variant):
    """
    Compare two top3 DataFrames (baseline vs variant).
    Returns dict with rank correlation, and score difference.
    """
    merged = pd.merge(
        baseline, variant,
        on=["rfq_id", "match_id"],
        suffixes=("_base", "_var")
    )
    
    
    #Spearman rank correlation (if enough pairs)
    if len(merged) > 1:
        rho, _ = spearmanr(merged["similarity_score_base"], merged["similarity_score_var"])
    else:
        rho = np.nan
    
    #Average absolute score difference
    avg_diff = (merged["similarity_score_base"] - merged["similarity_score_var"]).abs().mean()
    
    return {
        "spearman_rho": rho,
        "avg_score_diff": avg_diff
    }

#Run ablation comparisons
results = {}
for name, df_var in {
    "Dim_only": top3_dim,
    "Cat_only": top3_cat,
    "Grade_only": top3_grade,
}.items():
    results[name] = compare_top3(top3_df, df_var)

ablation_summary = pd.DataFrame(results).T.reset_index().rename(columns={"index": "variant"})

print(ablation_summary)


      variant  spearman_rho  avg_score_diff
0    Dim_only      0.488354        0.075341
1    Cat_only      0.258989        0.417785
2  Grade_only      0.530525        0.334827
