In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


# regression model
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import VotingRegressor,StackingRegressor

# metrics
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [4]:
# Load the raw laptop listings and preview the first two rows.
DATA_PATH = "./data/data.csv"
df = pd.read_csv(DATA_PATH)
df.head(n=2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,brand,name,price,spec_rating,processor,CPU,Ram,Ram_type,ROM,ROM_type,GPU,display_size,resolution_width,resolution_height,OS,warranty
0,0,0,HP,Victus 15-fb0157AX Gaming Laptop,49900,73.0,5th Gen AMD Ryzen 5 5600H,"Hexa Core, 12 Threads",8GB,DDR4,512GB,SSD,4GB AMD Radeon RX 6500M,15.6,1920.0,1080.0,Windows 11 OS,1
1,1,1,HP,15s-fq5007TU Laptop,39900,60.0,12th Gen Intel Core i3 1215U,"Hexa Core (2P + 4E), 8 Threads",8GB,DDR4,512GB,SSD,Intel UHD Graphics,15.6,1920.0,1080.0,Windows 11 OS,1


## **Y Data Profiling**

In [5]:
!pip install ydata-profiling




Collecting ydata-profiling
  Using cached ydata_profiling-4.18.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting matplotlib<=3.10,>=3.5 (from ydata-profiling)
  Using cached matplotlib-3.10.0-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Using cached visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting minify-html>=0.15.0 (from ydata-profiling)
  Using cached minify_html-0.18.1-cp313-cp313-win_amd64.whl.metadata (18 kB)
Collecting filetype>=1.0.0 (from ydata-profiling)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting phik<0.13,>=0.12.5 (from ydata-profiling)
  Using cached phik-0.12.5-cp313-cp313-win_amd64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Using cached multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting typeguard<5,>=4 (from ydata-profiling)
  Using cached typeguard-4.4.4-py3-none-any.whl.metadata (3.

In [6]:
from ydata_profiling import ProfileReport

# Automated EDA report over the raw frame (still includes the "Unnamed" index columns).
profile = ProfileReport(df, title="Laptop Price Predictor EDA")

# to_file() only supports .html / .json; without a suffix it warns and falls
# back to .html, so name the HTML output explicitly.
profile.to_file("yData_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 18/18 [00:00<00:00, 62.78it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# **Basic adjustments**

In [7]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
df.shape

(893, 18)

In [8]:
# List the column labels to spot leftover index columns and the target ("price").
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'brand', 'name', 'price', 'spec_rating',
       'processor', 'CPU', 'Ram', 'Ram_type', 'ROM', 'ROM_type', 'GPU',
       'display_size', 'resolution_width', 'resolution_height', 'OS',
       'warranty'],
      dtype='object')

In [9]:
# Number of rows (same value as df.shape[0]).
len(df)

893

In [10]:
# Remove the leftover index columns; errors='ignore' keeps this cell
# re-runnable once they are already gone.
index_artifacts = ['Unnamed: 0.1', 'Unnamed: 0']
df = df.drop(columns=index_artifacts, errors='ignore')
len(df.columns)

16

# **Correlation for Numerical Value**

In [11]:
# Pearson correlation of every numeric column with the target, strongest first.
numeric_df = df.select_dtypes(include=np.number)
price_corr = numeric_df.corr()["price"]
corr_target = price_corr.sort_values(ascending=False)
print(corr_target)

price                1.000000
resolution_height    0.604748
resolution_width     0.586042
spec_rating          0.546391
display_size         0.233815
warranty             0.117101
Name: price, dtype: float64


# **Separate X and y**

In [12]:
# Target is the laptop price; every remaining column is a candidate feature.
y = df['price']
X = df.drop(columns=["price"])

# **Numerical Columns and Categorical Columns**

In [13]:
# Partition feature columns by dtype: numeric ones get imputed and scaled,
# everything else is treated as categorical.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

In [14]:
# Inspect which feature columns were selected as numeric.
numeric_features

Index(['spec_rating', 'display_size', 'resolution_width', 'resolution_height',
       'warranty'],
      dtype='object')

In [15]:
# Inspect which feature columns were selected as non-numeric (categorical).
categorical_features

Index(['brand', 'name', 'processor', 'CPU', 'Ram', 'Ram_type', 'ROM',
       'ROM_type', 'GPU', 'OS'],
      dtype='object')

# **Preprocessing Pipeline**


*   For Numerical
*   For Categorical
*   Combine both as preprocessor



In [16]:
# Numeric branch: fill gaps with the column median, then standardise.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features),
])

In [17]:
# Hold out 20% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Baseline Model**

In [18]:
# Individual regressors as (name, estimator) pairs; the short names become the
# parameter prefixes used by the ensembles (e.g. "rf__n_estimators").
base_models = list(
    {
        'lr': LinearRegression(),
        'rf': RandomForestRegressor(n_estimators=100, random_state=42),
        'gb': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'dt': DecisionTreeRegressor(max_depth=5, random_state=42),
    }.items()
)

# Voting ensemble built from the base regressors.
voting_model = VotingRegressor(estimators=base_models)

# Stacking ensemble: a Ridge meta-model is fit on top of the base regressors.
meta_model = Ridge()
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# **Model Training**

In [19]:
# Every candidate estimator, keyed by the label shown in the results table.
model_to_train = dict(
    zip(
        ['Linear Regression', 'Random Forest', 'Gradient Boosting', 'Decision Tree'],
        (estimator for _, estimator in base_models),
    )
)
model_to_train["Voting Ensemble"] = voting_model
model_to_train["Stacking Ensemble"] = stacking_model

# Fit each candidate behind the shared preprocessor and score it on the test split.
result = []
for name, model in model_to_train.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    y_predict = pipe.fit(X_train, y_train).predict(X_test)

    result.append(
        {
            "Model": name,
            "R2 Score": r2_score(y_test, y_predict),
            "RMSE": np.sqrt(mean_squared_error(y_test, y_predict)),
            "mae": mean_absolute_error(y_test, y_predict),
        }
    )

# Rank the candidates, best R2 first.
result_df = pd.DataFrame(result).sort_values(by="R2 Score", ascending=False)
print(result_df)


               Model  R2 Score          RMSE           mae
5  Stacking Ensemble  0.874246  20758.442526  12989.583273
0  Linear Regression  0.854830  22303.514786  14177.335257
1      Random Forest  0.816059  25105.764002  13331.777786
2  Gradient Boosting  0.799259  26227.246911  15236.329036
4    Voting Ensemble  0.794340  26546.649314  14284.603944
3      Decision Tree  0.515028  40765.497093  23438.340653


# **Visualization**

In [20]:
# Top row of the ranked table is the best-scoring model.
best_model_name = result_df.iloc[0]["Model"]
best_model_obj = model_to_train[best_model_name]

# Refit the winner inside its own pipeline and predict the held-out rows.
final_pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', best_model_obj)])
final_pipe.fit(X_train, y_train)
y_final_predict = final_pipe.predict(X_test)

# Actual vs predicted scatter; the dashed diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_final_predict, alpha=0.6, color="black", ax=ax)

min_val = min(y_test.min(), y_final_predict.min())
max_val = max(y_test.max(), y_final_predict.max())
diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, color="red", linestyle="--")

ax.set_xlabel("Actual Laptop Price")
ax.set_ylabel("Predicted Laptop Price")
ax.set_title("Actual vs Predicted Laptop Prices")
ax.grid(True)
plt.show()


  plt.show()


# **Cross-validation for stacking_model**

In [None]:
from sklearn.model_selection import cross_val_score

# Preprocessing + stacking ensemble as one estimator, so every CV fold refits
# the preprocessing on its own training part.
stacking_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor), ('model', stacking_model)]
)

# 5-fold CV on the training split; the scorer returns negated MSE.
cv_scores = cross_val_score(
    stacking_pipeline, X_train, y_train,
    scoring='neg_mean_squared_error', cv=5, n_jobs=-1,
)

# Undo the negation and take the root to get an RMSE per fold.
stacking_rmse = np.sqrt(-cv_scores)

for label, value in (
    ("\nStacking Ensemble RMSE per fold:", stacking_rmse),
    ("Stacking Ensemble Mean RMSE:", stacking_rmse.mean()),
    ("Stacking Ensemble Std RMSE:", stacking_rmse.std()),
):
    print(label, value)

# **Grid Search CV**

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid over the stacking pipeline. Keys follow
# <pipeline step>__<base model name>__<parameter>.
# 2 * 3 * 2 * 3 * 3 * 3 = 324 combinations, each fit on 2 folds.
param_grid = dict(
    # random forest
    model__rf__n_estimators=[100, 200],
    model__rf__max_depth=[None, 10, 20],
    # gradient boosting
    model__gb__n_estimators=[100, 200],
    model__gb__learning_rate=[0.05, 0.1, 0.2],
    # decision tree
    model__dt__max_depth=[3, 5, 7],
    # Ridge meta-model
    model__final_estimator__alpha=[0.1, 1.0, 10.0],
)

grid_search = GridSearchCV(
    stacking_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# The scorer is negated RMSE, so flip the sign for reporting.
print("Best RMSE:", -grid_search.best_score_)
print("Best Hyperparameters:", grid_search.best_params_)


# **Randomized CV**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Sampling distributions for the stacking pipeline.
# scipy conventions: randint(low, high) draws from [low, high) with the upper
# bound EXCLUDED, and uniform(loc, scale) draws from [loc, loc + scale].
# Bounds are written so the stated upper limits (300 / 200 / 7 / 10.0) are
# actually reachable, matching the ranges explored by the grid search.
param_dist = {
    'model__rf__n_estimators': randint(100, 301),         # 100 .. 300 inclusive
    'model__rf__max_depth': [None, 10, 20],
    'model__gb__n_estimators': randint(100, 201),         # 100 .. 200 inclusive
    'model__gb__learning_rate': uniform(0.05, 0.15),      # [0.05, 0.20]
    'model__dt__max_depth': randint(3, 8),                # 3 .. 7 inclusive
    'model__final_estimator__alpha': uniform(0.1, 9.9)    # [0.1, 10.0]
}

random_search = RandomizedSearchCV(
    estimator=stacking_pipeline,
    param_distributions=param_dist,
    n_iter=20,                 # number of random combinations to try
    cv=2,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search.fit(X_train, y_train)

# The scorer is negated RMSE, so flip the sign for reporting.
print("Best RMSE:", -random_search.best_score_)
print("Best Hyperparameters:", random_search.best_params_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Stand-alone random forest behind the shared preprocessor.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42)),
    ]
)

# Sampling distributions; randint(low, high) excludes the upper bound.
param_dist = dict(
    model__n_estimators=randint(100, 400),
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=randint(2, 10),
)

# 20 random draws, 3-fold CV each; note this rebinds `random_search`.
random_search = RandomizedSearchCV(
    rf_pipeline,
    param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# The scorer is negated RMSE, so flip the sign for reporting.
print("Best RMSE:", -random_search.best_score_)
print("Best Hyperparameters:", random_search.best_params_)


# **Save Model**

In [None]:
import pickle

filename = "random_forest_model.pkl"

# Persist the fitted random-forest search object; its predict() delegates to
# the refit best estimator.
with open(filename, "wb") as file:
    pickle.dump(random_search, file)

# Reload from the same path that was just written. The previous hard-coded
# "/content/..." path only exists on Colab and fails elsewhere.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from untrusted files.
with open(filename, "rb") as file:
    rf_loaded_model = pickle.load(file)

# Sanity check: the round-tripped model can still predict.
rf_loaded_model.predict(X_test)

# **ML Flow**

In [None]:
!pip install mlflow

In [None]:
import mlflow
import mlflow.sklearn

# Experiment name matches this notebook's task (laptop price regression);
# the previous "Student performance" name was left over from another project.
mlflow.set_experiment("Laptop price prediction using rf")

# Hyperparameters of the single random forest tracked in this run.
my_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42
}

simple_rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**my_params))
])

# Track one training run: parameters first, then train/test RMSE.
with mlflow.start_run(run_name="Single_rf"):

    mlflow.log_params(my_params)
    mlflow.log_param("model_type", "RandomForestRegressor")

    # train
    simple_rf_pipeline.fit(X_train, y_train)

    y_train_pred = simple_rf_pipeline.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    # Metric keys use the same snake_case form so train/test line up in the UI.
    mlflow.log_metric("train_rmse", train_rmse)

    # test
    y_test_pred = simple_rf_pipeline.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    mlflow.log_metric("test_rmse", test_rmse)





