In [1]:
import copy

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv('../input/laptop-price/laptop_price.csv', encoding = 'latin-1')

loading the dataset into a dataframe

In [3]:
data.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [4]:
data.columns

Index(['laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'Price_euros'],
      dtype='object')

These are the columns in the dataframe. Here, Price_euros is our target variable. for training, these columns will be used:
1. company
2. Inches
3. ScreenResolution
4. Cpu
5. Ram
6. Memory
7. Gpu
8. Opsys
9. Weight

Moreover, we will be using one hot encoding technique for transforming the textual data into numerical data, so we can feed it into our RandomForestRegressor. In following steps, encodings will be generated of the above mentioned columns.

In [5]:
laptops = list(data.Company.unique())
laptop_ids = dict(zip(laptops,[i for i in range(len((laptops)))]))
print(laptop_ids)

{'Apple': 0, 'HP': 1, 'Acer': 2, 'Asus': 3, 'Dell': 4, 'Lenovo': 5, 'Chuwi': 6, 'MSI': 7, 'Microsoft': 8, 'Toshiba': 9, 'Huawei': 10, 'Xiaomi': 11, 'Vero': 12, 'Razer': 13, 'Mediacom': 14, 'Samsung': 15, 'Google': 16, 'Fujitsu': 17, 'LG': 18}


In [6]:
Inches = list(data.Inches.unique())
Inches_ids = dict(zip(Inches,[i for i in range(len((Inches)))]))
print(Inches_ids)

{13.3: 0, 15.6: 1, 15.4: 2, 14.0: 3, 12.0: 4, 11.6: 5, 17.3: 6, 10.1: 7, 13.5: 8, 12.5: 9, 13.0: 10, 18.4: 11, 13.9: 12, 12.3: 13, 17.0: 14, 15.0: 15, 14.1: 16, 11.3: 17}


In [7]:
screen_resolution = data.ScreenResolution.unique()
resolution_ids = dict(zip(screen_resolution,[i for i in range(len((screen_resolution)))]))
print(resolution_ids)

{'IPS Panel Retina Display 2560x1600': 0, '1440x900': 1, 'Full HD 1920x1080': 2, 'IPS Panel Retina Display 2880x1800': 3, '1366x768': 4, 'IPS Panel Full HD 1920x1080': 5, 'IPS Panel Retina Display 2304x1440': 6, 'IPS Panel Full HD / Touchscreen 1920x1080': 7, 'Full HD / Touchscreen 1920x1080': 8, 'Touchscreen / Quad HD+ 3200x1800': 9, 'IPS Panel Touchscreen 1920x1200': 10, 'Touchscreen 2256x1504': 11, 'Quad HD+ / Touchscreen 3200x1800': 12, 'IPS Panel 1366x768': 13, 'IPS Panel 4K Ultra HD / Touchscreen 3840x2160': 14, 'IPS Panel Full HD 2160x1440': 15, '4K Ultra HD / Touchscreen 3840x2160': 16, 'Touchscreen 2560x1440': 17, '1600x900': 18, 'IPS Panel 4K Ultra HD 3840x2160': 19, '4K Ultra HD 3840x2160': 20, 'Touchscreen 1366x768': 21, 'IPS Panel Full HD 1366x768': 22, 'IPS Panel 2560x1440': 23, 'IPS Panel Full HD 2560x1440': 24, 'IPS Panel Retina Display 2736x1824': 25, 'Touchscreen 2400x1600': 26, '2560x1440': 27, 'IPS Panel Quad HD+ 2560x1440': 28, 'IPS Panel Quad HD+ 3200x1800': 29, '

In [8]:
cpu = data.Cpu.unique()
cpu_ids = dict(zip(cpu,[i for i in range(len((cpu)))]))
print(cpu_ids)

{'Intel Core i5 2.3GHz': 0, 'Intel Core i5 1.8GHz': 1, 'Intel Core i5 7200U 2.5GHz': 2, 'Intel Core i7 2.7GHz': 3, 'Intel Core i5 3.1GHz': 4, 'AMD A9-Series 9420 3GHz': 5, 'Intel Core i7 2.2GHz': 6, 'Intel Core i7 8550U 1.8GHz': 7, 'Intel Core i5 8250U 1.6GHz': 8, 'Intel Core i3 6006U 2GHz': 9, 'Intel Core i7 2.8GHz': 10, 'Intel Core M m3 1.2GHz': 11, 'Intel Core i7 7500U 2.7GHz': 12, 'Intel Core i7 2.9GHz': 13, 'Intel Core i3 7100U 2.4GHz': 14, 'Intel Atom x5-Z8350 1.44GHz': 15, 'Intel Core i5 7300HQ 2.5GHz': 16, 'AMD E-Series E2-9000e 1.5GHz': 17, 'Intel Core i5 1.6GHz': 18, 'Intel Core i7 8650U 1.9GHz': 19, 'Intel Atom x5-Z8300 1.44GHz': 20, 'AMD E-Series E2-6110 1.5GHz': 21, 'AMD A6-Series 9220 2.5GHz': 22, 'Intel Celeron Dual Core N3350 1.1GHz': 23, 'Intel Core i3 7130U 2.7GHz': 24, 'Intel Core i7 7700HQ 2.8GHz': 25, 'Intel Core i5 2.0GHz': 26, 'AMD Ryzen 1700 3GHz': 27, 'Intel Pentium Quad Core N4200 1.1GHz': 28, 'Intel Atom x5-Z8550 1.44GHz': 29, 'Intel Celeron Dual Core N3060 1

In [9]:
ram = data.Ram.unique()
ram_ids = dict(zip(ram,[i for i in range(len((ram)))]))
print(ram_ids)

{'8GB': 0, '16GB': 1, '4GB': 2, '2GB': 3, '12GB': 4, '6GB': 5, '32GB': 6, '24GB': 7, '64GB': 8}


In [10]:
memory = data.Memory.unique()
memory_ids = dict(zip(memory,[i for i in range(len((memory)))]))
print(memory_ids)

{'128GB SSD': 0, '128GB Flash Storage': 1, '256GB SSD': 2, '512GB SSD': 3, '500GB HDD': 4, '256GB Flash Storage': 5, '1TB HDD': 6, '32GB Flash Storage': 7, '128GB SSD +  1TB HDD': 8, '256GB SSD +  256GB SSD': 9, '64GB Flash Storage': 10, '256GB SSD +  1TB HDD': 11, '256GB SSD +  2TB HDD': 12, '32GB SSD': 13, '2TB HDD': 14, '64GB SSD': 15, '1.0TB Hybrid': 16, '512GB SSD +  1TB HDD': 17, '1TB SSD': 18, '256GB SSD +  500GB HDD': 19, '128GB SSD +  2TB HDD': 20, '512GB SSD +  512GB SSD': 21, '16GB SSD': 22, '16GB Flash Storage': 23, '512GB SSD +  256GB SSD': 24, '512GB SSD +  2TB HDD': 25, '64GB Flash Storage +  1TB HDD': 26, '180GB SSD': 27, '1TB HDD +  1TB HDD': 28, '32GB HDD': 29, '1TB SSD +  1TB HDD': 30, '512GB Flash Storage': 31, '128GB HDD': 32, '240GB SSD': 33, '8GB SSD': 34, '508GB Hybrid': 35, '1.0TB HDD': 36, '512GB SSD +  1.0TB Hybrid': 37, '256GB SSD +  1.0TB Hybrid': 38}


In [11]:
gpu = data.Gpu.unique()
gpu_ids = dict(zip(gpu,[i for i in range(len((gpu)))]))
print(gpu_ids)

{'Intel Iris Plus Graphics 640': 0, 'Intel HD Graphics 6000': 1, 'Intel HD Graphics 620': 2, 'AMD Radeon Pro 455': 3, 'Intel Iris Plus Graphics 650': 4, 'AMD Radeon R5': 5, 'Intel Iris Pro Graphics': 6, 'Nvidia GeForce MX150': 7, 'Intel UHD Graphics 620': 8, 'Intel HD Graphics 520': 9, 'AMD Radeon Pro 555': 10, 'AMD Radeon R5 M430': 11, 'Intel HD Graphics 615': 12, 'AMD Radeon Pro 560': 13, 'Nvidia GeForce 940MX': 14, 'Intel HD Graphics 400': 15, 'Nvidia GeForce GTX 1050': 16, 'AMD Radeon R2': 17, 'AMD Radeon 530': 18, 'Nvidia GeForce 930MX': 19, 'Intel HD Graphics': 20, 'Intel HD Graphics 500': 21, 'Nvidia GeForce 930MX ': 22, 'Nvidia GeForce GTX 1060': 23, 'Nvidia GeForce 150MX': 24, 'Intel Iris Graphics 540': 25, 'AMD Radeon RX 580': 26, 'Nvidia GeForce 920MX': 27, 'AMD Radeon R4 Graphics': 28, 'AMD Radeon 520': 29, 'Nvidia GeForce GTX 1070': 30, 'Nvidia GeForce GTX 1050 Ti': 31, 'Nvidia GeForce MX130': 32, 'AMD R4 Graphics': 33, 'Nvidia GeForce GTX 940MX': 34, 'AMD Radeon RX 560': 

In [12]:
opsys = data.OpSys.unique()
opsys_ids = dict(zip(opsys,[i for i in range(len((opsys)))]))
print(opsys_ids)

{'macOS': 0, 'No OS': 1, 'Windows 10': 2, 'Mac OS X': 3, 'Linux': 4, 'Android': 5, 'Windows 10 S': 6, 'Chrome OS': 7, 'Windows 7': 8}


In [13]:
weight = data.Weight.unique()
weight_ids = dict(zip(weight,[i for i in range(len((weight)))]))
print(weight_ids)

{'1.37kg': 0, '1.34kg': 1, '1.86kg': 2, '1.83kg': 3, '2.1kg': 4, '2.04kg': 5, '1.3kg': 6, '1.6kg': 7, '2.2kg': 8, '0.92kg': 9, '1.22kg': 10, '0.98kg': 11, '2.5kg': 12, '1.62kg': 13, '1.91kg': 14, '2.3kg': 15, '1.35kg': 16, '1.88kg': 17, '1.89kg': 18, '1.65kg': 19, '2.71kg': 20, '1.2kg': 21, '1.44kg': 22, '2.8kg': 23, '2kg': 24, '2.65kg': 25, '2.77kg': 26, '3.2kg': 27, '0.69kg': 28, '1.49kg': 29, '2.4kg': 30, '2.13kg': 31, '2.43kg': 32, '1.7kg': 33, '1.4kg': 34, '1.8kg': 35, '1.9kg': 36, '3kg': 37, '1.252kg': 38, '2.7kg': 39, '2.02kg': 40, '1.63kg': 41, '1.96kg': 42, '1.21kg': 43, '2.45kg': 44, '1.25kg': 45, '1.5kg': 46, '2.62kg': 47, '1.38kg': 48, '1.58kg': 49, '1.85kg': 50, '1.23kg': 51, '1.26kg': 52, '2.16kg': 53, '2.36kg': 54, '2.05kg': 55, '1.32kg': 56, '1.75kg': 57, '0.97kg': 58, '2.9kg': 59, '2.56kg': 60, '1.48kg': 61, '1.74kg': 62, '1.1kg': 63, '1.56kg': 64, '2.03kg': 65, '1.05kg': 66, '4.4kg': 67, '1.90kg': 68, '1.29kg': 69, '2.0kg': 70, '1.95kg': 71, '2.06kg': 72, '1.12kg': 73

In [14]:
x = pd.DataFrame()

In [15]:
x['Company'] = data['Company'].map(laptop_ids)
x['Inches'] = data['Inches'].map(Inches_ids)
x['ScreenResolution'] = data['ScreenResolution'].map(resolution_ids)
x['Cpu'] = data['Cpu'].map(cpu_ids)
x['Ram'] = data['Ram'].map(ram_ids)
x['Memory'] = data['Memory'].map(memory_ids)
x['Gpu'] = data['Gpu'].map(gpu_ids)
x['OpSys'] = data['OpSys'].map(opsys_ids)
x['Weight'] = data['Weight'].map(weight_ids)
x['prices'] = data['Price_euros']

In [16]:
df_new = copy.deepcopy(x)

In [17]:
df_new.head()

Unnamed: 0,Company,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,prices
0,0,0,0,0,0,0,0,0,0,1339.69
1,0,0,1,1,0,1,1,0,1,898.94
2,1,1,2,2,0,2,2,1,2,575.0
3,0,2,3,3,1,3,3,0,3,2537.45
4,0,0,0,4,0,2,4,0,0,1803.6


**One hot encoded DataFrame**

In [18]:
train, validation, test = np.split(df_new.sample(frac=1), [int(.8*len(df_new)), int(.9*len(df_new))])

**Splitting the dataset into Test(80%), Train(10%) and validation(10%) dataset.**

In [19]:
x_columns = list(train.columns[:9])
y_columns = train.columns[9]

#train dataset
train_x = train[x_columns]
train_y = train[y_columns]

#test dataset
test_x = test[x_columns]
test_y = test[y_columns]

#validation dataset
validation_x = validation[x_columns]
validation_y = validation[y_columns]

In [20]:
regressor = RandomForestRegressor(n_estimators = 1000, random_state = 3)

In [21]:
regressor.fit(train_x, train_y.values.ravel())

RandomForestRegressor(n_estimators=1000, random_state=3)

In [22]:
regressor.score(train_x, train_y)

0.9643415397648385

Training accuracy

In [23]:
regressor.score(test_x, test_y)

0.8626161112284331

Testing accuracy

In [24]:
regressor.score(validation_x, validation_y)

0.8552656066789192

Validation accuracy

In [25]:
pred_input_x = list(validation_x.iloc[21])
pred_input_y = validation_y.iloc[21]

print(pred_input_x, pred_input_y)

[2, 3, 7, 66, 0, 2, 12, 2, 21] 1149.0


**Prediction using the trained Model**

In [26]:
np.array(pred_input_x).reshape(1,9)

array([[ 2,  3,  7, 66,  0,  2, 12,  2, 21]])

In [27]:
x = regressor.predict(np.array(pred_input_x).reshape(1,9))
print(x[0])

1201.439613333333


  "X does not have valid feature names, but"


**Predicted Value**