In [96]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import root_mean_squared_error, r2_score
import scipy.sparse as sp
import plotly.express as px
import numpy as np


In [67]:
file_path = '/Users/zahavalowy/Downloads/myntra_products_catalog.csv'  # Replace with the actual file path
data = pd.read_csv(file_path)


In [68]:
data = data.drop(columns=['PrimaryColor'])


In [73]:
#remove outliers
Q1 = data['Price (INR)'].quantile(0.25)
Q3 = data['Price (INR)'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data_cleaned = data[(data['Price (INR)'] >= lower_bound) & (data['Price (INR)'] <= upper_bound)]


In [85]:
ohe = OneHotEncoder()
product_brand_encoded = ohe.fit_transform(data_cleaned[['ProductBrand']])


In [79]:
le = LabelEncoder()
data_cleaned.loc[:, 'Gender'] = le.fit_transform(data_cleaned['Gender'])




In [87]:
X_gender = data_cleaned[['Gender']].values  
X = sp.hstack([product_brand_encoded, X_gender])



(11324, 638)
(11324, 1)


In [88]:
y = data_cleaned['Price (INR)']

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [92]:
param_grid = {'alpha': [0.1, 1, 10, 100]}

# Instantiate the Ridge model and perform grid search
ridge = Ridge()
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters from Grid Search:", grid_search.best_params_)


Best parameters from Grid Search: {'alpha': 1}


In [97]:

best_ridge_model = grid_search.best_estimator_

y_pred_ridge = best_ridge_model.predict(X_test)

rmse_ridge = np.sqrt(root_mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression - Root Mean Squared Error (RMSE):", rmse_ridge)
print("Ridge Regression - R² Score:", r2_ridge)


Ridge Regression - Root Mean Squared Error (RMSE): 20.01937703871366
Ridge Regression - R² Score: 0.4604608025300758


In [99]:
fig = px.scatter(data_cleaned, x="ProductBrand", y="Price (INR)",
                 title="ProductBrand vs Price",
                 labels={"ProductBrand": "Product Brand", "Price (INR)": "Price in INR"})
fig.show()

brand_avg_price = data_cleaned.groupby('ProductBrand')['Price (INR)'].mean().reset_index()

fig = px.bar(brand_avg_price, x="ProductBrand", y="Price (INR)",
             title="Average Price per Product Brand",
             labels={"ProductBrand": "Product Brand", "Price (INR)": "Average Price in INR"})
fig.show()