In [1]:
# Importing Libraries

import numpy as np
import pandas as pd


In [2]:
# Loading dataset: read the cleaned laptop listings into `df`.
# The path is relative, so the CSV must sit in the notebook's working directory.

df = pd.read_csv('cleaned_laptop_details.csv')

df.head()

Unnamed: 0,Brand Name,OS,RAM Type,RAM Size,Processor,Warranty,Disk Type,Disk Size,Price
0,Lenovo,Windows,DDR4,8,Intel,2,SSD,256,36991
1,Lenovo,Windows,DDR4,8,Intel,2,SSD,512,39990
2,ASUS,Windows,DDR4,8,Intel,1,SSD,512,32990
3,HP,Windows,DDR4,8,AMD,1,SSD,512,49990
4,ASUS,Windows,DDR4,8,Intel,1,SSD,512,49990


In [3]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Brand Name  720 non-null    object
 1   OS          720 non-null    object
 2   RAM Type    720 non-null    object
 3   RAM Size    720 non-null    int64 
 4   Processor   720 non-null    object
 5   Warranty    720 non-null    int64 
 6   Disk Type   720 non-null    object
 7   Disk Size   720 non-null    int64 
 8   Price       720 non-null    int64 
dtypes: int64(4), object(5)
memory usage: 50.8+ KB


In [4]:
# Number of distinct values per column.
df.nunique()

Brand Name     10
OS              3
RAM Type        3
RAM Size        4
Processor       3
Warranty        2
Disk Type       2
Disk Size       4
Price         250
dtype: int64

In [5]:
def data_type(df):
    """Split the columns of ``df`` into categorical and numeric names.

    A column counts as categorical when its dtype is ``object`` or a pandas
    string dtype; every other column is reported as numeric. One line per
    column is printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    tuple[list, list]
        ``(cat_var, num_var)`` with the column labels in their original order.
    """
    cat_var = []
    num_var = []
    for i in df.columns:
        col_dtype = df[i].dtype
        # Inspect the dtype instead of comparing with "O": text columns stored
        # with a dedicated string dtype would otherwise be reported as numeric.
        is_text = (pd.api.types.is_object_dtype(col_dtype)
                   or pd.api.types.is_string_dtype(col_dtype))
        if is_text:
            print(i, ": Object type")
            cat_var.append(i)
        else:
            print(i, ": Number Type")
            num_var.append(i)
    # No de-duplication here: DataFrame.drop_duplicates() returns a new frame,
    # so calling it without using the result has no effect on `df`.
    return cat_var, num_var

In [6]:
# Classify the columns once; cat_var is reused by the encoding loop.
cat_var , num_var = data_type(df)

Brand Name : Object type
OS : Object type
RAM Type : Object type
RAM Size : Number Type
Processor : Object type
Warranty : Number Type
Disk Type : Object type
Disk Size : Number Type
Price : Number Type


### Encoding

In [8]:
# Ordinal-encode each categorical column: labels are ranked by their mean
# Price (lowest mean -> 0) and the label -> rank mapping is kept in `dt`.
# NOTE(review): the means are computed on the full frame, before the
# train/test split, so the encoding has already seen the test rows' prices.
dt = {}
for feature in cat_var:
    mean_price_by_label = df.groupby([feature])['Price'].mean()
    labels_ordered = {
        label: rank
        for rank, label in enumerate(mean_price_by_label.sort_values().index)
    }
    dt[feature] = labels_ordered
    df[feature] = df[feature].map(labels_ordered)

### Modeling

In [9]:
# Features: every column except the target.
X = df.drop(columns='Price')

# Target: the Price column.
y = df['Price']

In [10]:
# Inspect the encoded feature matrix.
X

Unnamed: 0,Brand Name,OS,RAM Type,RAM Size,Processor,Warranty,Disk Type,Disk Size
0,2,1,0,8,0,2,1,256
1,2,1,0,8,0,2,1,512
2,7,1,0,8,0,1,1,512
3,5,1,0,8,1,1,1,512
4,7,1,0,8,0,1,1,512
...,...,...,...,...,...,...,...,...
715,7,0,1,4,0,1,1,128
716,2,1,0,8,1,1,1,512
717,7,1,2,16,1,1,1,1024
718,7,1,1,16,0,1,1,512


In [11]:
# Inspect the target series.
y

0       36991
1       39990
2       32990
3       49990
4       49990
        ...  
715     23491
716     73491
717    194990
718    125991
719     50991
Name: Price, Length: 720, dtype: int64

### Train Test Split

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 50 )

In [13]:
# First rows of the training features; the index keeps the original row labels.
X_train.head()

Unnamed: 0,Brand Name,OS,RAM Type,RAM Size,Processor,Warranty,Disk Type,Disk Size
364,7,1,1,16,0,1,1,1024
595,2,1,0,8,1,2,1,512
496,7,1,0,16,1,1,1,1024
263,0,1,1,8,0,1,1,256
533,7,1,2,16,1,1,1,1024


In [14]:
# Training target, carrying the same row labels as X_train.
y_train

364    174991
595     35591
496    150991
263     29991
533    105990
        ...  
132     36991
289     43668
109     42991
480     36991
688    214791
Name: Price, Length: 576, dtype: int64

### Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression with default settings.
linear_model = LinearRegression()

linear_model.fit(X_train, y_train)

In [16]:
from sklearn.metrics import r2_score

# Predict on the held-out split and report the coefficient of determination.
y_pred = linear_model.predict(X_test)

print("R2 Score : ", r2_score(y_test, y_pred))

R2 Score :  0.8251038460151391


In [17]:
# Test R-square: .score() on a regressor returns R^2, so this repeats the
# r2_score figure computed from y_pred.

linear_model.score(X_test , y_test)

0.8251038460151391

In [18]:
# Train R-square, for comparison with the test score.
linear_model.score(X_train , y_train)

0.835590999364322

### Random Forest Regressor

In [19]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: its bootstrap sampling is random, so without random_state
# the fitted model and every score derived from it change on each run.
random_model = RandomForestRegressor(random_state=50)

random_model.fit(X_train, y_train)

In [20]:
# Reuses the name y_pred: from here on it holds the random-forest predictions.
y_pred = random_model.predict(X_test)

print("R2 Score : ", r2_score(y_test, y_pred))

R2 Score :  0.8210681507248572


In [21]:
# Test R-square of the random forest (.score() returns R^2 for regressors).

random_model.score(X_test , y_test)

0.8210681507248572

In [22]:
# Train R-square; compare with the test score to gauge overfitting.
random_model.score(X_train , y_train)

0.9253924495206451

In [23]:
# Raw importances, one value per feature column the model was fitted on.
random_model.feature_importances_

array([0.05560254, 0.02564906, 0.20786173, 0.57832794, 0.03332734,
       0.0095073 , 0.00744093, 0.08228316])

In [24]:
# Pair every importance with its feature name so the table is readable.
pd.DataFrame({'Feature Importance': random_model.feature_importances_}, index=X.columns)

Unnamed: 0,Feature Importance
Brand Name,0.055603
OS,0.025649
RAM Type,0.207862
RAM Size,0.578328
Processor,0.033327
Warranty,0.009507
Disk Type,0.007441
Disk Size,0.082283


### Hyperparameter Tuning

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
# model: seeded so the cross-validated scores, and therefore the selected
# hyperparameters, are the same on every run of the search.
estimator = RandomForestRegressor(random_state=50)

In [27]:
# parameters: candidate forest sizes 1 through 24

param_grid = dict(n_estimators=[*range(1, 25)])

In [28]:
# Exhaustive search over param_grid, scoring each candidate with 5-fold
# cross-validation (no `scoring` given, so the estimator's own .score is used).
grid = GridSearchCV(estimator, param_grid, cv=5)

In [29]:
# Run the search on the training split only; the test split stays untouched.
grid.fit(X_train, y_train)

In [30]:
# Parameter combination with the best mean cross-validation score.
grid.best_params_

{'n_estimators': 20}

In [31]:
# Remodeling: rebuild the forest with the tuned hyperparameters.
# They are read from grid.best_params_ rather than copied by hand from the
# printed output, so this cell stays correct when the search result changes.
# random_state makes the refit (and the model pickled later) reproducible.

random_model = RandomForestRegressor(**grid.best_params_, random_state=50)

random_model.fit(X_train, y_train)

In [32]:
# Predictions of the tuned forest on both splits, kept under separate names.
y_pred_test = random_model.predict(X_test)

y_pred_train = random_model.predict(X_train)

In [33]:
# Report R^2 on both splits side by side (the leading space aligns the labels).
print(" Test R2 Score : ", r2_score(y_test, y_pred_test))

print("Train R2 Score : ", r2_score(y_train, y_pred_train))


 Test R2 Score :  0.8215148790585244
Train R2 Score :  0.9229495005792783


In [34]:
# Test R-square via .score(); repeats the r2_score figure computed from
# y_pred_test.

random_model.score(X_test , y_test)

0.8215148790585244

In [35]:
# Train R-square via .score(); repeats the r2_score figure computed from
# y_pred_train.

random_model.score(X_train , y_train)

0.9229495005792783

### Save a Model

In [36]:
import pickle

# Persist the label -> rank encodings (`dt`) and the fitted model.
# The `with` blocks close each file, which flushes the pickle to disk, even if
# dump() raises; a bare open() left both handles open.
with open('dt.pkl', 'wb') as encodings_file:
    pickle.dump(dt, encodings_file)

with open('model.pkl', 'wb') as model_file:
    pickle.dump(random_model, model_file)

