#  Diamonds price prediction

## 1 load_data

In [7]:
import kagglehub
import os



# Download latest version
path = kagglehub.dataset_download("ulrikthygepedersen/diamonds")



# Print the path to the downloaded dataset files
print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\c\.cache\kagglehub\datasets\ulrikthygepedersen\diamonds\versions\1


In [8]:
import pandas as pd
data=pd.read_csv(os.path.join(path,"diamonds.csv"))

In [9]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,0.23,b'Ideal',b'E',b'SI2',61.5,55.0,326.0,3.95,3.98,2.43
1,0.21,b'Premium',b'E',b'SI1',59.8,61.0,326.0,3.89,3.84,2.31
2,0.23,b'Good',b'E',b'VS1',56.9,65.0,327.0,4.05,4.07,2.31
3,0.29,b'Premium',b'I',b'VS2',62.4,58.0,334.0,4.2,4.23,2.63
4,0.31,b'Good',b'J',b'SI2',63.3,58.0,335.0,4.34,4.35,2.75


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  float64
 7   'x'      53940 non-null  float64
 8   'y'      53940 non-null  float64
 9   'z'      53940 non-null  float64
dtypes: float64(7), object(3)
memory usage: 4.1+ MB


In [11]:
data.describe()

Unnamed: 0,carat,depth,table,price,'x','y','z'
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [12]:
data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
'x'        0
'y'        0
'z'        0
dtype: int64

## 2 clean_ data

In [13]:
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

In [14]:
import numpy as np
#sklearn.model
from sklearn.model_selection import train_test_split, GridSearchCV
#pre
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
#plot
import seaborn as sns
import matplotlib.pyplot as plt
#
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVR


In [15]:
numerical_cols = numerical_cols.tolist()
numerical_cols.remove("price")
X = data.drop('price', axis=1)
y = data['price']

# Define transformers for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 3 train model

In [17]:
models = [("Linear Regression", LinearRegression()),
          ("Decision Tree", DecisionTreeRegressor()),
          ("Random Forest", RandomForestRegressor()),
          ("Support Vector Regression", SVR())]


In [21]:
results = {}
for model_name, model in models:
    # Create a pipeline for each model
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),('regressor', model)])

    # Train model
    model_pipeline.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    results[model_name] = rmse
    print(f"{model_name} RMSE: {rmse}")


Linear Regression RMSE: 1135.1979088220137
Decision Tree RMSE: 741.5402197466121
Random Forest RMSE: 551.0875222866462


: 

: 

## 4 track_with_MLFlow

In [17]:
print(results)

{'Linear Regression': 1135.1979088220137, 'Decision Tree': 736.7439121834765}


In [18]:
import mlflow

In [3]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000/'


In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from typing import List
from scipy.sparse import csr_matrix

In [5]:
def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    dv: DictVectorizer = None,
    with_target: bool = True,
) -> dict:

    if categorical_cols is None:
        categorical_cols = ["cut", "color", "clarity"]
    dicts = df[categorical_cols].to_dict(orient="records")

    y = None
    if with_target:
        if dv is None:
            dv = DictVectorizer()
            dv.fit(dicts)
            
        y = df["price"].values

    x = dv.transform(dicts)
    return x, y, dv


#X_train, y_train, dv = extract_x_y(test_df)

In [6]:
import pickle

def load_pickle(path: str):
    with open(path, "rb") as f:
        loaded_obj = pickle.load(f)
    return loaded_obj


def predict_updated(input_path: str, model: LinearRegression):
    input_data = load_pickle(input_path)
    return model.predict(input_data)

In [19]:


# Create a new MLflow Experiment
mlflow.set_experiment("MLflow_track_diamonds")
from mlflow.models import infer_signature



for model_name, model in models:
# Start an MLflow run
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),('regressor', model)])

    # Train model
    model_pipeline.fit(X_train, y_train)

    y_pred = model_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    with mlflow.start_run():

    # Log the loss metric
        mlflow.log_metric("rmse", rmse)

    # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", f"{model_name} for diamonds data")
        
    # Infer the model signature
        
        signature = infer_signature(X_train, model_pipeline.predict(X_train))

    # Log the model
        model_info = mlflow.sklearn.log_model(
        sk_model=model_name,
        artifact_path=f"{model_name}_model",
        signature=signature,
        input_example=X_train,
        registered_model_name="tracking-diamonds",
        )

2025/01/06 17:28:29 INFO mlflow.tracking.fluent: Experiment with name 'MLflow_track_diamonds' does not exist. Creating a new experiment.
Successfully registered model 'tracking-diamonds'.
2025/01/06 17:28:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-diamonds, version 1
Created version '1' of model 'tracking-diamonds'.
Registered model 'tracking-diamonds' already exists. Creating a new version of this model...
2025/01/06 17:28:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-diamonds, version 2
Created version '2' of model 'tracking-diamonds'.
Registered model 'tracking-diamonds' already exists. Creating a new version of this model...
2025/01/06 17:29:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-diamonds, version 3
Created version 

## 6 model deployement


### 6_1 Web Service

#### * use FASTAPI package our model into web service

In [39]:
import os
print(f"Current working directory: {os.getcwd()}")

Current working directory: c:\Users\c\10.9\Final_project


In [72]:
import pickle


from typing import Any

def save_pickle(path: str, obj: Any):
    """Saves the given object to a pickle file."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_pickle(path: str):
    """Loads a pickle object from the specified file."""
    with open(path, "rb") as f:
        return pickle.load(f)
save_pickle('web_service/local_models/dv__v0.0.1.pkl', dv)  # Save the DictVectorizer
save_pickle('web_service/local_models/model__v0.0.1.pkl', model)

In [68]:
# Load the model
loaded_model = load_pickle('web_service/local_models/model__v0.0.1.pkl')

# Inspect the type of the model
print(type(loaded_model))  # Example: <class 'sklearn.ensemble._forest.RandomForestRegressor'>

# Access model parameters (if it is a scikit-learn model)
if hasattr(loaded_model, 'get_params'):
    print(loaded_model.get_params())


<class 'sklearn.ensemble._forest.RandomForestRegressor'>
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [45]:
train_df=(X_train,y_train)
test_df=(X_test,y_test)

In [80]:
train_df=data[:40000]
test_df=data[40000:]

In [51]:

def encode_categorical_cols(
    df: pd.DataFrame, categorical_cols: List[str] = None
) -> pd.DataFrame:
    if categorical_cols is None:
        categorical_cols = ["cut", "color", "clarity"]
    df[categorical_cols] = df[categorical_cols].fillna(-1).astype("int")
    df[categorical_cols] = df[categorical_cols].astype("str")
    return df




In [97]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_cols(df, categorical_cols=None):
    if categorical_cols is None:
        categorical_cols = ["cut", "color", "clarity"]
    
    for col in categorical_cols:
        df[col] = df[col].fillna("Unknown")  # Handle missing values
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col].astype("str"))
    
    return df


In [57]:
from scipy.sparse import csr_matrix, csc_matrix


In [100]:
mlflow_experiment_path = f"/mlflow/diamond_predict"
mlflow.set_experiment(mlflow_experiment_path)

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("Level", "Development")
    mlflow.set_tag("Team", "Data Science")

    # Load data
    
  
    mlflow.log_param("train_set_size", train_df.shape[0])
    mlflow.log_param("test_set_size", test_df.shape[0])

    
   

# Ensure X_train is dense
    if isinstance(X_train, (csr_matrix, csc_matrix)):
        X_train = X_train.toarray()

# Create and train the model
    model = RandomForestRegressor()
    model.fit(X_train, y_train)

# Predict on the training set (or a test set)


   

    # Encode categorical columns
    train_df = encode_categorical_cols(train_df)

    # Extract X and y
    X_train, y_train, _ = extract_x_y(train_df)
    if isinstance(X_train, (csr_matrix, csc_matrix)):
       X_train = X_train.toarray()

# Create and train the model
    model = RandomForestRegressor()
    model.fit(X_train, y_train)


    # Train model
 

    # Evaluate model
    y_pred = model.predict(X_train)
    mse = mean_squared_error(y_train, y_pred)
    rmse = np.sqrt(mse)
    
    mlflow.log_metric("train_me", rmse)

    # Evaluate model on test set
    
    test_df = encode_categorical_cols(test_df)
    # Train data
    X_train, y_train, dv = extract_x_y(train_df)

# Test data using the same dv
   

    X_test, y_test, dv = extract_x_y(test_df, dv=dv)
    #dmatrix_data = xgb.DMatrix(X_test)
    #y_pred_test = model_xgboost.predict(dmatrix_data)
    y_pred_test = model.predict(X_test)
    test_me = mean_squared_error(y_test, y_pred_test)
    rmse = np.sqrt(test_me)
    mlflow.log_metric("test_me", rmse)

    # Log your model
    mlflow.sklearn.log_model(model, "models")

    # Register your model as the production model
    mlflow.register_model(f"runs:/{run_id}/models", "RandomForestRegressor_test")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("Unknown")  # Handle missing values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = label_encoder.fit_transform(df[col].astype("str"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna("Unknown")  # Handle missing values
A value is trying to b

In [70]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# List registered models
for model in client.search_registered_models():
    print(model)

# List all versions for the specific model
model_name = "RandomForestRegressor_test"
model_versions = client.get_registered_model(name=model_name)
for version in model_versions.latest_versions:
    print(f"Version: {version.version}, Stage: {version.current_stage}")


<RegisteredModel: aliases={}, creation_timestamp=1736132869984, description='', last_updated_timestamp=1736133658367, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1736132870014, current_stage='None', description='', last_updated_timestamp=1736132870014, name='RandomForestRegressor_test', run_id='3a9e741bbe104fa4b91640fdd5e943ba', run_link='', source='mlflow-artifacts:/958600936449629552/3a9e741bbe104fa4b91640fdd5e943ba/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>,
 <ModelVersion: aliases=[], creation_timestamp=1736133029433, current_stage='Production', description='', last_updated_timestamp=1736133658367, name='RandomForestRegressor_test', run_id='ec311e061e204c28a60c9aca47cfa407', run_link='', source='mlflow-artifacts:/958600936449629552/ec311e061e204c28a60c9aca47cfa407/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>], name='RandomForestRegressor_test', tags={}>
<RegisteredModel: alias

In [69]:
client = MlflowClient()
production_version = 2

client.transition_model_version_stage(
    name="RandomForestRegressor_test", version=production_version, stage="Production"
)

<ModelVersion: aliases=[], creation_timestamp=1736133029433, current_stage='Production', description='', last_updated_timestamp=1736133658367, name='RandomForestRegressor_test', run_id='ec311e061e204c28a60c9aca47cfa407', run_link='', source='mlflow-artifacts:/958600936449629552/ec311e061e204c28a60c9aca47cfa407/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>