In [None]:
#!pip install mlflow

In [3]:
pip show mlflow

[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt #data visualization
import seaborn as sns #data visualization
import missingno as msno
import mlflow
import mlflow.sklearn

# Data Pre Processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Regressor Libraries
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


#Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'mlflow'

In [None]:
# Load the jewelry dataset. header=None tells pandas the file has no header row,
# so the first line is read as data and columns get default integer labels.

df=pd.read_csv("../data/Jewelry_Dataset.csv", header=None)
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Name the columns (the file has no header row) and display floats in fixed-point
# rather than scientific notation. Two decimals are kept so that prices and summary
# statistics are not rounded to whole numbers when displayed.
df.columns = ['Datetime','OrderID','Purchased_ID','Qty_of_SKU','Category_ID','Category_alias','Brand_ID',
              'Price','UserID','Product_gender','Color','Metal','Gem_type']
pd.set_option('display.float_format', '{:.2f}'.format)
df.head(50)

In [None]:
# Parse the Datetime column into pandas datetime values.
df['Datetime'] = pd.to_datetime(df['Datetime'])

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of missing (NaN) values in each column.
df.isna().sum()

In [None]:
# For every column, list its ten most frequent values; NaN is counted as a value
# so missing data shows up in the ranking.
for col, column_values in df.items():
    frequency_table = column_values.value_counts(dropna=False)
    print(f"\nTop values in column: {col}")
    print(frequency_table.head(10))

In [None]:
# Discard rows whose Category_alias is a number instead of a name: a value counts as
# numeric when, after removing at most one '.', only digits remain.
alias_text = df['Category_alias'].map(str)
alias_is_numeric = alias_text.str.replace('.', '', n=1, regex=False).str.isdigit()
df = df[~alias_is_numeric].reset_index(drop=True)

In [None]:
# Price is the numeric regression target. Filling its gaps with the string "unknown"
# would turn the column into object dtype and break every numeric step that follows
# (quantiles, clipping, price brackets, model fitting). Coerce it to numeric and drop
# the rows that have no usable price instead.
df = (
    df.assign(Price=pd.to_numeric(df['Price'], errors='coerce'))
      .dropna(subset=['Price'])
      .reset_index(drop=True)
)

# The remaining gaps are in descriptive columns; give them an explicit "unknown" level.
columns_with_unknown_level = [
    'Category_alias', 'Purchased_ID', 'Gem_type', 'Brand_ID',
    'Metal', 'Color', 'Product_gender',
]
for column in columns_with_unknown_level:
    df[column] = df[column].fillna("unknown")

# The raw data already carries its own placeholder for a missing colour; merge it
# into the same "unknown" level.
df['Color'] = df['Color'].replace('unknown-color', 'unknown')

In [None]:
# Re-check dtypes and non-null counts after the missing values were filled.
df.info()

In [None]:
# Per-column count of values that are still missing after the fill step.
df.isna().sum()

In [None]:
# Discard the timestamp, quantity and identifier columns.
columns_to_discard = ['Datetime', 'Qty_of_SKU','Category_ID','UserID','OrderID','Purchased_ID']
df = df.drop(columns=columns_to_discard)

In [None]:
# Dtypes and non-null counts of the columns that remain after the drop.
df.info()

In [None]:
# Summary statistics using pandas' default column selection.
df.describe()

In [None]:
# Summary statistics restricted to object and boolean columns.
df.describe(include=["object","bool"])

In [None]:
# missingno bar chart of per-column completeness.
msno.bar(df, color='sandybrown')

In [None]:
# Heatmap of the boolean null mask: one cell per (row, column) value.
fig, ax = plt.subplots(figsize=(10, 3))
sns.heatmap(df.isnull(), cbar=True, cmap="Blues_r", ax=ax)

In [None]:

# One histogram per numeric column. The grid is sized from the number of numeric
# columns instead of a fixed 1x2 layout, which raised as soon as a third numeric
# column was present.
num_col = df.select_dtypes(include="number").columns
n_panels = max(len(num_col), 1)  # subplots() needs at least one column
fig, axes = plt.subplots(1, n_panels, figsize=(15, 5), squeeze=False)
for ax, col in zip(axes[0], num_col):
    # NOTE(review): bins=2 is very coarse for judging a distribution — confirm it is intended.
    sns.histplot(df[col], bins=2, color="teal", ax=ax)
    ax.set_title(f"Histplot for {col}")
plt.tight_layout()
plt.show()

In [None]:
# One boxplot per numeric column, on a grid sized from the number of numeric columns
# (the fixed 1x2 layout raised with more than two).
num_col = df.select_dtypes(include="number").columns
n_panels = max(len(num_col), 1)  # subplots() needs at least one column
fig, axes = plt.subplots(1, n_panels, figsize=(15, 5), squeeze=False)
for ax, col in zip(axes[0], num_col):
    # Pass the values as y= explicitly: a positional Series is interpreted differently
    # across seaborn versions, and y= matches the later Price boxplot cell.
    sns.boxplot(y=df[col], color="teal", ax=ax)
    ax.set_title(f"Boxplot for {col}")
plt.tight_layout()
plt.show()

In [None]:
# First and third quartiles of the column(s) that will be treated for outliers.
affected_column = ["Price"]
quartiles = df[affected_column].quantile([0.25, 0.75])
q1 = quartiles.loc[0.25]
q3 = quartiles.loc[0.75]


In [None]:
# Interquartile range per affected column; shown as the cell output.
iqr = q3.sub(q1)

iqr

In [None]:
# Outlier fences: 1.5 interquartile ranges below Q1 and above Q3.
fence_width = 1.5 * iqr
lower_bound = q1 - fence_width
upper_bound = q3 + fence_width


In [None]:
# Cap the affected column(s) at the fences: values outside [lower_bound, upper_bound]
# are replaced by the nearest bound. axis=1 aligns the Series-valued bounds with the
# column labels.
df[affected_column] = df[affected_column].clip(lower=lower_bound, upper=upper_bound, axis=1)

In [None]:
# Vertical boxplot of Price on a 5x5 figure.
fig = plt.figure(figsize=(5, 5))
ax = sns.boxplot(y=df['Price'], color='teal')
ax.set_title("Boxplot for Price")
plt.show()

# Univariate Analysis

In [None]:
# Horizontal bar chart of how many rows fall under each category alias,
# drawn in ascending order of count.
category_counts = df['Category_alias'].value_counts()

ascending_counts = category_counts.sort_values()
ax = ascending_counts.plot.barh(figsize=(10, 6), color='gold', edgecolor='black')

ax.set_xlabel('Count')
ax.set_ylabel('Category Alias')
ax.set_title('Top Category Alias Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of how many rows carry each Brand_ID value,
# drawn in ascending order of count.
Brand_counts = df['Brand_ID'].value_counts()

ascending_counts = Brand_counts.sort_values()
ax = ascending_counts.plot.barh(figsize=(10, 6), color='maroon', edgecolor='black')

ax.set_xlabel('Count')
ax.set_ylabel('Brand Alias')
ax.set_title('Top Brand Alias Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
#Data Visualization
def price_bracket(Price):
    """Return the label of the price bracket that ``Price`` falls into.

    Brackets have inclusive upper limits: up to 100 is low, up to 250 is medium,
    up to 500 is high, and anything above 500 is expensive.

    Missing or non-numeric prices (NaN, None, arbitrary text) are labelled
    "Unknown". Previously NaN fell through every comparison and was reported as
    "Expensive (>500)", and a non-numeric value raised TypeError.

    Parameters
    ----------
    Price : number or value convertible with ``float()``
        The price to classify.

    Returns
    -------
    str
        One of "Low Price(<=100)", "Medium Price(<=250)", "High Price(<=500)",
        "Expensive (>500)" or "Unknown".
    """
    try:
        value = float(Price)
    except (TypeError, ValueError):
        return "Unknown"
    if value != value:  # NaN is the only float that is not equal to itself
        return "Unknown"

    upper_limits = (
        (100, "Low Price(<=100)"),
        (250, "Medium Price(<=250)"),
        (500, "High Price(<=500)"),
    )
    for limit, bracket_label in upper_limits:
        if value <= limit:
            return bracket_label
    return "Expensive (>500)"
# Tag every row with its price bracket, then count the rows in each bracket.
df['price_bracket'] = df['Price'].map(price_bracket)
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='price_bracket', data=df, ax=ax)
ax.set_xlabel('Price Range')
ax.set_ylabel('Count of Price Group')
ax.set_title('Pricing')

In [None]:
# Horizontal bar chart of how many rows carry each gem type,
# drawn in ascending order of count.
Gem_counts = df['Gem_type'].value_counts()

ascending_counts = Gem_counts.sort_values()
ax = ascending_counts.plot.barh(figsize=(10, 6), color='purple', edgecolor='black')

ax.set_xlabel('Count')
ax.set_ylabel('Gem Type Alias')
ax.set_title('Gem Type Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Bivariate Analysis

In [None]:
# Row counts per price bracket, split by metal.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='price_bracket', data=df, hue='Metal', ax=ax)
ax.set_xlabel('Price Group')
ax.set_ylabel('Count of Metals')
ax.set_title('Price of Metals')

In [None]:
# Local import: matplotlib is already a dependency of this notebook (pyplot is
# imported in the imports cell).
from matplotlib.colors import is_color_like

# Distinct values of the Color column.
unique_colors = df['Color'].dropna().unique()

# Draw each colour level in its own colour when the value is something matplotlib can
# interpret as a colour; every other value gets gray. Mapping each value to itself
# unconditionally made countplot fail on any level that is not a colour name.
color_palette = {
    color: color if is_color_like(color) else 'gray'
    for color in unique_colors
}

# Always provide an entry for the explicit 'unknown' level.
color_palette['unknown'] = 'gray'

plt.figure(figsize=(12, 6))
sns.countplot(
    x='price_bracket',
    data=df,
    hue='Color',
    palette=color_palette,
    edgecolor='black'
)

plt.xlabel('Price Group', fontsize=12)
plt.ylabel('Count of Colors', fontsize=12)
plt.title('Color Distribution by Price Bracket', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)
plt.legend(title='Color', title_fontsize=11, fontsize=10, loc='upper right')
plt.tight_layout()
plt.show()


In [None]:
# Row counts per price bracket, split by product gender.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='price_bracket', data=df, hue='Product_gender', ax=ax)
ax.set_xlabel('Price Group')
ax.set_ylabel('Count of Product Gender')
ax.set_title('Price Per Gender')

In [None]:
# Step 1: the 11 most frequent gem types.
top_gems = df['Gem_type'].value_counts().nlargest(11).index

# Keep the top gem types as they are and collapse every other type into 'Other'.
is_top_gem = df['Gem_type'].isin(top_gems)
df['Gem_type_grouped'] = df['Gem_type'].where(is_top_gem, 'Other')

# Step 2: rows that belong to one of the top gem types.
filtered_df = df[is_top_gem]

# Step 3: one distinct hue per top gem type.
unique_colors = sns.color_palette("hls", len(top_gems))
color_dict = {gem: shade for gem, shade in zip(top_gems, unique_colors)}

# Step 4: count rows per price bracket, split by gem type.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=filtered_df, x='price_bracket', hue='Gem_type', palette=color_dict)

ax.set_title('Top 11 Gem Types by Pricing Range')
ax.set_xlabel('Pricing Range')
ax.set_ylabel('Count')
ax.legend(title='Gem Type')
plt.tight_layout()
plt.show()

In [None]:
# Cast every Brand_ID value to str so the column holds a single type.
# NOTE(review): presumably needed because the column mixes raw IDs with the "unknown"
# filler, which the label encoder could not sort — confirm.
df['Brand_ID'] = df['Brand_ID'].astype(str)

In [None]:
# Preview the frame after the Brand_ID cast.
df.head()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Integer-encode the categorical feature columns (these are multi-level categories,
# not binary ones). A separate fitted encoder is stored per column in `encoders`, so
# each column's codes can still be mapped back with
# encoders[column].inverse_transform(...). Refitting one shared encoder discarded
# every mapping except the last.
categorical_features = [
    'Product_gender', 'Category_alias', 'Metal', 'Color', 'Brand_ID', 'Gem_type_grouped',
]
encoders = {}
for column in categorical_features:
    Le = LabelEncoder()  # after the loop, `Le` is the Gem_type_grouped encoder, as before
    df[column] = Le.fit_transform(df[column])
    encoders[column] = Le

# Keep the regression target as a one-column frame before removing it from the features.
label=df[['Price']]


# Remove the target and the columns derived from it or already replaced by an encoded
# version, leaving only model features in df.
df.drop(['Gem_type','price_bracket','Price'], axis=1, inplace=True)

In [None]:
# Preview the encoded feature frame.
df.head()

In [None]:
# Preview the target (Price) frame.
label.head()

In [None]:
# Dtype of each feature column after encoding.
df.dtypes

# Machine Learning

In [None]:
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
# X_* are the feature frames (the "questions"), y_* the matching Price targets (the "answers").
X_train, X_test, y_train, y_test = train_test_split(df, label, test_size=0.2, random_state=42)

In [None]:
# First three rows of the training features.
X_train.head(3)

In [None]:
# First three rows of the training target.
y_train.head(3)

In [None]:
# First three rows of the held-out features.
X_test.head(3)

In [None]:
# First three rows of the held-out target.
y_test.head(3)

In [None]:
# Candidate regressors paired with the display name used in the results.
# The tree-based models are randomised, so they get a fixed seed (the same value used
# for the train/test split) to keep the reported metrics identical between runs.
regressors = [
    (LinearRegression(), "Linear Regression"),
    (RandomForestRegressor(random_state=42), "Random Forest"),
    (DecisionTreeRegressor(random_state=42), "Decision Tree"),
]

In [None]:
# Fit every candidate on the training split and score it on the held-out split.
# Results are collected per model name: MSE and MAE rounded to 2 decimals, R^2 to 4.
mse_list = {}
mae_list = {}
r2_list = {}

# scikit-learn estimators expect a 1-D target. The targets here are one-column
# frames, which triggers a DataConversionWarning (hidden by the global warnings
# filter), so flatten them explicitly.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

for model, name in regressors:
    model.fit(X_train, y_train_1d)
    pred = model.predict(X_test)

    mse = mean_squared_error(y_test_1d, pred)
    mae = mean_absolute_error(y_test_1d, pred)
    r2 = r2_score(y_test_1d, pred)

    mse_list[name] = round(mse, 2)
    mae_list[name] = round(mae, 2)
    r2_list[name] = round(r2, 4)

print("Mean Squared Error (MSE):", mse_list)
print("Mean Absolute Error (MAE):", mae_list)
print("R^2 Score:", r2_list)
