In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import pickle

# Load the dataset and shorten its verbose CSV headers in one step.
# `errors="raise"` makes a header that is missing from the CSV fail right here
# with a KeyError instead of being skipped silently and surfacing later as a
# missing-column error. Assigning the result (rather than `inplace=True`)
# keeps the cell idempotent on re-run.
df = pd.read_csv("Diamond Price Prediction.csv").rename(
    columns={
        "Carat(Weight of Daimond)": "carat",
        "Cut(Quality)": "cut",
        "Color": "color",
        "Clarity": "clarity",
        "Depth": "depth",
        "Table": "table",
        "X(length)": "x",
        "Y(width)": "y",
        "Z(Depth)": "z",
        "Price(in US dollars)": "price",
    },
    errors="raise",
)

# Show the resulting column names as the cell output
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [2]:
# Column groups, by the kind of preprocessing each one needs
categorical_cols = ["cut", "color", "clarity"]
numerical_cols = ["carat", "depth", "table", "x", "y", "z"]

# Target is the price column; features are every other column
y = df["price"]
X = df.drop(columns=["price"])

# Per-column preprocessing: one-hot encode the categorical columns
# (categories unseen during fit are ignored) and standardize the numeric ones
categorical_step = ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
numerical_step = ("scale", StandardScaler(), numerical_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numerical_step])

# Chain preprocessing and a seeded random forest into a single estimator
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows, then score the fitted pipeline on the held-out rows
score = pipeline.fit(X_train, y_train).score(X_test, y_test)
print(f"✅ Model R² Score on Test Data: {score:.4f}")

# Persist the fitted pipeline (preprocessing + model) to disk
with open("pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

✅ Model R² Score on Test Data: 0.9809
