# Read Dataset

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the training and test splits; both paths are relative and contain a space.
train_path, test_path = "diamonds train.csv", "diamonds test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Basic Info & Stats


In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train_df.info()

In [None]:
# Print the same summary for the test frame so both schemas can be compared.
test_df.info()

In [None]:
# Histogram (with a KDE overlay) of the target column in the training data.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train_df['total_sales_price'], kde=True, ax=ax)
ax.set_title('Distribution of Total Sales Price')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Bar chart of how many training rows fall into each value of 'cut'.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train_df, x='cut', ax=ax)
ax.set_title('Count of Cut Types')
ax.set_xlabel('Cut')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of the target column grouped by 'clarity'.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=train_df, x='clarity', y='total_sales_price', ax=ax)
ax.set_title('Price by Clarity')
ax.set_xlabel('Clarity')
ax.set_ylabel('Price')
# Tilt the category labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Drop Irrelevant Columns & Columns with Many Null Values

In [None]:
# Columns removed from both frames before modelling.
# NOTE(review): 'Unnamed: 0' is presumably a leftover index column from an
# earlier CSV export — confirm against the raw files.
drop_cols = [
    'Unnamed: 0',
    'fancy_color_dominant_color',
    'fancy_color_secondary_color',
    'fancy_color_overtone',
    'fancy_color_intensity',
]

# Apply the identical drop to train and test so their schemas stay in step.
train_df, test_df = (frame.drop(drop_cols, axis=1) for frame in (train_df, test_df))


### Drop duplicated rows ###

In [None]:
# Remove fully identical rows from the training frame (pandas defaults: the
# first occurrence is kept and the surviving rows keep their index labels).
train_df = train_df.drop_duplicates()

# Replace Outliers with Upper or Lower whisker accordingly

In [None]:
# Cap every int64/float64 column of the training frame at its Tukey whiskers
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR). Values inside the whiskers are left untouched.
# NOTE(review): numeric_cols is taken from the whole frame, so the target
# column is capped as well if it is numeric — confirm this is intended.
numeric_cols = train_df.select_dtypes(include=['int64', 'float64']).columns

for col in numeric_cols:
    values = train_df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # First pull down anything above the upper whisker, then lift anything
    # below the lower whisker; the comparisons use the original values.
    capped_high = np.where(values > upper, upper, values)
    train_df[col] = np.where(values < lower, lower, capped_high)


# Convert Categorical to Numerical Data

In [None]:
# Names of the object-dtype columns in the training frame; these are the
# columns handed to the one-hot encoder.
categorical_cols = train_df.select_dtypes(include='object').columns

In [None]:
# Dense one-hot encoder; categories unseen during fit are encoded as all zeros
# instead of raising (handle_unknown='ignore').
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit on the training categoricals, then wrap the resulting array in a frame
# that carries the generated feature names and the training row labels.
encoded_train = encoder.fit_transform(train_df[categorical_cols])
X_cat = pd.DataFrame(
    encoded_train,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=train_df.index,
)

In [None]:
# Replace the raw categorical columns with their one-hot counterparts;
# concat along axis=1 lines the two frames up on their shared row index.
train_remaining = train_df.drop(columns=categorical_cols)
train_df = pd.concat([train_remaining, X_cat], axis=1)

In [None]:
# Encode the test categoricals with the already-fitted encoder (transform
# only, no refit) and label the result with the test row index.
encoded_test = encoder.transform(test_df[categorical_cols])
X_test_cat = pd.DataFrame(
    encoded_test,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=test_df.index,
)

In [None]:
# Same swap as for the training frame: drop the raw categoricals and append
# the one-hot columns, aligned on the test row index.
test_remaining = test_df.drop(columns=categorical_cols)
test_df = pd.concat([test_remaining, X_test_cat], axis=1)

# Split Data to X & Y

In [None]:
# Separate the regression target from the predictor columns.
target_col = 'total_sales_price'
y = train_df[target_col]
X = train_df.drop(columns=target_col)

# Scale your input Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-column mean/std from X, then standardise X with them. The fitted
# scaler is kept so the test frame can be transformed identically later.
# NOTE(review): the statistics are learned from every row of X, including the
# rows later held out for validation — consider fitting on the training split
# only to avoid leaking validation information.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the scaled rows (and matching targets) for validation;
# random_state=42 pins the shuffle so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


# Create regression models

#### Linear Regression
#### Ridge Regression
#### Lasso Regression
#### Elastic Net Regression
#### Decision Tree Regressor
#### Random Forest Regressor
#### SVR
#### Boosting Regressor
#### KNN Regressor

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Seed for the estimators that use randomness internally. Without it the
# tree-based models (and therefore the model ranking and the saved
# predictions) can differ from run to run even though the train/validation
# split itself is seeded with 42.
RANDOM_STATE = 42

# Candidate regressors, keyed by the display name used when reporting scores.
# All other hyper-parameters are left at their scikit-learn defaults.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Elastic Net': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'KNN Regressor': KNeighborsRegressor(),
}

# Choose model with lowest RMSE

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit every candidate on the training split and record its RMSE on the
# validation split, keyed by the candidate's display name.
rmse_scores = {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    val_mse = mean_squared_error(y_val, estimator.predict(X_val))
    rmse_scores[label] = np.sqrt(val_mse)

# Lowest validation RMSE wins; on a tie the earliest entry in `models` is kept.
best_model_name, best_rmse = min(rmse_scores.items(), key=lambda item: item[1])
best_model = models[best_model_name]

print("Best Model:", best_model_name)
print("RMSE:", best_rmse)


# Save Best Model

In [None]:
import joblib

# Persist the fitted winning estimator to 'best_model.pkl' (relative path) so
# it can be reloaded without retraining.
joblib.dump(best_model, 'best_model.pkl')


# Predict on testing data 

In [None]:
# The scaler was fit on X, so it must be given exactly X's columns in X's
# order. Selecting them explicitly keeps the features aligned and discards any
# extra column the test file may carry (e.g. a target column); a feature that
# is missing from the test frame raises a KeyError here.
X_test_features = test_df[X.columns]
X_test_scaled = scaler.transform(X_test_features)
test_predictions = best_model.predict(X_test_scaled)


# Save only the predictions as a .csv file

In [None]:
# Write the predictions as a single-column CSV, without the row index.
predictions_df = pd.DataFrame({'Predicted_Price': test_predictions})
predictions_df.to_csv('best_pred.csv', index=False)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Reload the persisted estimator and score it on the validation split to
# check that the saved file reproduces the selected model's performance.
loaded_model = joblib.load('best_model.pkl')
y_val_pred = loaded_model.predict(X_val)

val_mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(val_mse)
# r2_score is the coefficient of determination of the predictions.
r2 = r2_score(y_val, y_val_pred)

print("Best Model Performance:")
print("RMSE:", rmse)
print("R square Score (Accuracy):", r2)
