In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
from google.colab import files

In [2]:
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

Saving laptop_data_cleaned.csv to laptop_data_cleaned.csv


In [3]:
df.head()

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
0,Apple,Ultrabook,8,1.37,11.175755,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,10.776777,0,0,127.67794,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,10.329931,0,0,141.211998,Intel Core i5,0,256,Intel,Others
3,Apple,Ultrabook,16,1.83,11.814476,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,11.473101,0,1,226.983005,Intel Core i5,0,256,Intel,Mac


In [None]:
print("Dataset Information:")
df.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1273 entries, 0 to 1272
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1273 non-null   object 
 1   TypeName     1273 non-null   object 
 2   Ram          1273 non-null   int64  
 3   Weight       1273 non-null   float64
 4   Price        1273 non-null   float64
 5   TouchScreen  1273 non-null   int64  
 6   Ips          1273 non-null   int64  
 7   Ppi          1273 non-null   float64
 8   Cpu_brand    1273 non-null   object 
 9   HDD          1273 non-null   int64  
 10  SSD          1273 non-null   int64  
 11  Gpu_brand    1273 non-null   object 
 12  Os           1273 non-null   object 
dtypes: float64(3), int64(5), object(5)
memory usage: 129.4+ KB


In [4]:
print("\nMissing values in each column:")
print(df.isnull().sum())


Missing values in each column:
Company        0
TypeName       0
Ram            0
Weight         0
Price          0
TouchScreen    0
Ips            0
Ppi            0
Cpu_brand      0
HDD            0
SSD            0
Gpu_brand      0
Os             0
dtype: int64


In [5]:
X = df.drop('Price', axis=1)
y = np.log1p(df['Price'])

In [6]:
categorical_features = ['Company', 'TypeName', 'Os', 'Cpu_brand', 'Gpu_brand']
numerical_features = ['Ram', 'Weight', 'TouchScreen', 'Ips', 'Ppi', 'HDD', 'SSD']

print("Categorical Features to be One-Hot Encoded:")
print(categorical_features)
print("\nNumerical Features to be Scaled:")
print(numerical_features)

Categorical Features to be One-Hot Encoded:
['Company', 'TypeName', 'Os', 'Cpu_brand', 'Gpu_brand']

Numerical Features to be Scaled:
['Ram', 'Weight', 'TouchScreen', 'Ips', 'Ppi', 'HDD', 'SSD']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (1018, 12)
Testing data shape: (255, 12)


In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [9]:
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', LinearRegression())])

lr_pipeline.fit(X_train, y_train)

In [10]:
y_pred_lr = lr_pipeline.predict(X_test)

In [11]:
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

print(f"R-squared (R²): {r2_lr:.4f}")
print(f"Mean Squared Error (MSE): {mse_lr:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr:.4f}")
print(f"\nInterpretation: The model explains {r2_lr:.1%} of the variance in the log-transformed price.")

R-squared (R²): 0.8084
Mean Squared Error (MSE): 0.0006
Root Mean Squared Error (RMSE): 0.0235

Interpretation: The model explains 80.8% of the variance in the log-transformed price.


In [12]:
dt_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', DecisionTreeRegressor(random_state=42))])

dt_pipeline.fit(X_train, y_train)

In [13]:

y_pred_dt = dt_pipeline.predict(X_test)

r2_dt = r2_score(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

print(f"R-squared (R²): {r2_dt:.4f}")
print(f"Mean Squared Error (MSE): {mse_dt:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_dt:.4f}")

R-squared (R²): 0.8381
Mean Squared Error (MSE): 0.0005
Root Mean Squared Error (RMSE): 0.0216


In [28]:
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1))])

rf_pipeline.fit(X_train, y_train)

In [29]:


y_pred_rf = rf_pipeline.predict(X_test)


r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print(f"R-squared (R²): {r2_rf:.4f}")
print(f"Mean Squared Error (MSE): {mse_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")

R-squared (R²): 0.8906
Mean Squared Error (MSE): 0.0003
Root Mean Squared Error (RMSE): 0.0177


In [31]:
model_performance = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest'],
    'R-squared (R²)': [r2_lr, r2_dt, r2_rf],
    'RMSE': [rmse_lr, rmse_dt, rmse_rf]
}).sort_values(by='R-squared (R²)', ascending=False)

print("--- Model Performance Comparison ---")
print(model_performance)

print("\n🏆 BEST MODEL SELECTION 🏆")
print("The Random Forest Regressor is the best model. It has the highest R-squared and the lowest RMSE,")
print("indicating it makes the most accurate predictions on unseen data.")

--- Model Performance Comparison ---
               Model  R-squared (R²)      RMSE
2      Random Forest        0.890579  0.017744
1      Decision Tree        0.838069  0.021585
0  Linear Regression        0.808417  0.023479

🏆 BEST MODEL SELECTION 🏆
The Random Forest Regressor is the best model. It has the highest R-squared and the lowest RMSE,
indicating it makes the most accurate predictions on unseen data.


In [32]:
best_model_pipeline = rf_pipeline

filename = 'laptop_price_predictor_model.joblib'

joblib.dump(best_model_pipeline, filename)

print(f"\n✅ Best model (Random Forest pipeline) has been saved as '{filename}'.")


✅ Best model (Random Forest pipeline) has been saved as 'laptop_price_predictor_model.joblib'.


In [34]:
loaded_model = joblib.load('laptop_price_predictor_model.joblib')
print("✅ Model loaded successfully.")

sample_laptop = pd.DataFrame([{
    'Company': 'Apple',
    'TypeName': 'Ultrabook',
    'Ram': 16,
    'Weight': 1.37,
    'TouchScreen': 0,
    'Ips': 1,
    'Ppi': 226.98,
    'Cpu_brand': 'Intel Core i5',
    'HDD': 0,
    'SSD': 512,
    'Gpu_brand': 'Intel',
    'Os': 'Mac'
}])
print("\nPredicting price for this laptop:")
print(sample_laptop)

predicted_log_price = loaded_model.predict(sample_laptop)

predicted_price = np.expm1(predicted_log_price)


print(f"Predicted Price: ${predicted_price[0]:,.2f}")

✅ Model loaded successfully.

Predicting price for this laptop:
  Company   TypeName  Ram  Weight  TouchScreen  Ips     Ppi      Cpu_brand  \
0   Apple  Ultrabook   16    1.37            0    1  226.98  Intel Core i5   

   HDD  SSD Gpu_brand   Os  
0    0  512     Intel  Mac  
Predicted Price: $11.76
