In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import re
import seaborn as sb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [69]:
data = r'C:\Users\guill\OneDrive\Documents\Vishrut\PhD applications\Projects\Breast Cancer Classicification - PyTorch\laptopData.csv'
dataset = pd.read_csv(data)

In [70]:
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1273 non-null   float64
 1   Company           1273 non-null   object 
 2   TypeName          1273 non-null   object 
 3   Inches            1273 non-null   object 
 4   ScreenResolution  1273 non-null   object 
 5   Cpu               1273 non-null   object 
 6   Ram               1273 non-null   object 
 7   Memory            1273 non-null   object 
 8   Gpu               1273 non-null   object 
 9   OpSys             1273 non-null   object 
 10  Weight            1273 non-null   object 
 11  Price             1273 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB


In [71]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


Here, unnamed in a column that quite evidently doesn't contribute to the pricing. Should be removed (dropped)

In [72]:
dataset.drop(columns=['Unnamed: 0'],inplace=True)

In [73]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [74]:
print(len(dataset['Cpu'].unique()), "unique entries just in this column, that could be the result of just combinations to two separate classes of information, each containing less diversity, but creating more when combined")

119 unique entries just in this column, that could be the result of just combinations to two separate classes of information, each containing less diversity, but creating more when combined


Now it is pretty clear that a bunch of columns are noisy, and in need of feature engineering and preprocessing.

In [75]:
missing_values = dataset.isnull().sum()
print("Missing values per column:")
print(missing_values)

missing_percentage = dataset.isnull().mean() * 100 # Check the percentage of missing values in each column
print("Percentage of missing values per column:")
print(missing_percentage)

Missing values per column:
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64
Percentage of missing values per column:
Company             2.302379
TypeName            2.302379
Inches              2.302379
ScreenResolution    2.302379
Cpu                 2.302379
Ram                 2.302379
Memory              2.302379
Gpu                 2.302379
OpSys               2.302379
Weight              2.302379
Price               2.302379
dtype: float64


We also see that the dataset has missing values. But interestingly, all the columns have the same number of missing value, that suggests that perhaps the missing values are occuring in related places and are not completely randomly distributed.

A little exploration of the dataset has revealed that indeed the missing values are actually missing rows, and in that case, the cleaning procedure I would adopt is to simply drop those rows. There is not risk of losing addtional information.

In [76]:
dataset.dropna(axis=0, inplace = True)

In [77]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1273 entries, 0 to 1302
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1273 non-null   object 
 1   TypeName          1273 non-null   object 
 2   Inches            1273 non-null   object 
 3   ScreenResolution  1273 non-null   object 
 4   Cpu               1273 non-null   object 
 5   Ram               1273 non-null   object 
 6   Memory            1273 non-null   object 
 7   Gpu               1273 non-null   object 
 8   OpSys             1273 non-null   object 
 9   Weight            1273 non-null   object 
 10  Price             1273 non-null   float64
dtypes: float64(1), object(10)
memory usage: 119.3+ KB


In [78]:
print(dataset.isnull().sum())

Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64


In [79]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


The occurrance of units in columns like weight does not add any additional meaning to the dataset. Another similar example is 'GB'.  

In [80]:
dataset['Ram'] = dataset['Ram'].str.replace("GB", "")
dataset['Weight'] = dataset['Weight'].str.replace("kg", "")


In [81]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808


In [82]:
print(len(dataset['ScreenResolution'].unique()))
print(dataset['ScreenResolution'].unique())

40
['IPS Panel Retina Display 2560x1600' '1440x900' 'Full HD 1920x1080'
 'IPS Panel Retina Display 2880x1800' '1366x768'
 'IPS Panel Full HD 1920x1080' 'IPS Panel Retina Display 2304x1440'
 'IPS Panel Full HD / Touchscreen 1920x1080'
 'Full HD / Touchscreen 1920x1080' 'Touchscreen / Quad HD+ 3200x1800'
 'Touchscreen 2256x1504' 'Quad HD+ / Touchscreen 3200x1800'
 'IPS Panel 1366x768' 'IPS Panel 4K Ultra HD / Touchscreen 3840x2160'
 'IPS Panel Full HD 2160x1440' '4K Ultra HD / Touchscreen 3840x2160'
 '1600x900' 'IPS Panel 4K Ultra HD 3840x2160' '4K Ultra HD 3840x2160'
 'Touchscreen 1366x768' 'Touchscreen 2560x1440'
 'IPS Panel Full HD 1366x768' 'IPS Panel 2560x1440'
 'IPS Panel Full HD 2560x1440' 'IPS Panel Retina Display 2736x1824'
 'Touchscreen 2400x1600' '2560x1440' 'IPS Panel Quad HD+ 2560x1440'
 'IPS Panel Quad HD+ 3200x1800'
 'IPS Panel Quad HD+ / Touchscreen 3200x1800'
 'IPS Panel Touchscreen 1366x768' '1920x1080'
 'IPS Panel Full HD 1920x1200'
 'IPS Panel Touchscreen / 4K Ultra H

### Screen Resolution: Cleaning and Simplification

The "Screen Resolution" column contains noisy and inconsistent data. To organize it effectively, we can extract and categorize the key pieces of information:

- **Panel Type**: Examples include *IPS Panel* and *Touchscreen*.
- **Resolution**: Common formats include *1920x1080* and *2560x1600*.
- **Additional Features**: Such as *Retina Display* and *4K Ultra HD*.


In [83]:
# Function to simplify screen resolution
res = dataset['ScreenResolution']
def simplify_resolution(res):
    # Extract panel type (e.g., IPS, Touchscreen)
    panel = re.search(r'(IPS Panel|Touchscreen)', res)
    panel = panel.group(0) if panel else 'Standard'
    
    # Extract resolution (e.g., 1920x1080)
    resolution = re.search(r'\d{3,4}x\d{3,4}', res)
    resolution = resolution.group(0) if resolution else 'Unknown'
    
    # Extract additional features (e.g., Retina, 4K)
    feature = re.search(r'(Retina Display|4K Ultra HD|Full HD|Quad HD\+)', res)
    feature = feature.group(0) if feature else 'Standard'

    return f'{panel}, {feature}, {resolution}'

# Apply the function
dataset['SimplifiedResolution'] = dataset['ScreenResolution'].apply(simplify_resolution)
dataset.drop(columns=['ScreenResolution'],inplace=True)

In [84]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,SimplifiedResolution
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,"IPS Panel, Retina Display, 2560x1600"
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,"Standard, Standard, 1440x900"
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0,"Standard, Full HD, 1920x1080"
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336,"IPS Panel, Retina Display, 2880x1800"
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,"IPS Panel, Retina Display, 2560x1600"


Now that the ScreenResolution column has been simplified with the use of comma separated values, it would be easier to separate these three informations into three separate columns for further simplification.

In [85]:
dataset[['Screen Panel Type', 'Additional Screen Features', 'Screen Resolution']] = dataset['SimplifiedResolution'].str.split(', ', expand=True)
dataset.drop(columns=['SimplifiedResolution'], inplace=True)

In [86]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,IPS Panel,Retina Display,2560x1600
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,Standard,Standard,1440x900
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0,Standard,Full HD,1920x1080
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336,IPS Panel,Retina Display,2880x1800
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,IPS Panel,Retina Display,2560x1600


The same feature engineering has to be applied to some other columns, notably the **memory**, **cpu** and **gpu** columns

Let start with cpu.

In [87]:
print(len(dataset['Cpu'].unique()))
dataset['Cpu'].unique()

118


array(['Intel Core i5 2.3GHz', 'Intel Core i5 1.8GHz',
       'Intel Core i5 7200U 2.5GHz', 'Intel Core i7 2.7GHz',
       'Intel Core i5 3.1GHz', 'AMD A9-Series 9420 3GHz',
       'Intel Core i7 2.2GHz', 'Intel Core i7 8550U 1.8GHz',
       'Intel Core i5 8250U 1.6GHz', 'Intel Core i3 6006U 2GHz',
       'Intel Core i7 2.8GHz', 'Intel Core M m3 1.2GHz',
       'Intel Core i7 7500U 2.7GHz', 'Intel Core i7 2.9GHz',
       'Intel Core i3 7100U 2.4GHz', 'Intel Core i5 7300HQ 2.5GHz',
       'AMD E-Series E2-9000e 1.5GHz', 'Intel Core i5 1.6GHz',
       'Intel Core i7 8650U 1.9GHz', 'Intel Atom x5-Z8300 1.44GHz',
       'AMD E-Series E2-6110 1.5GHz', 'AMD A6-Series 9220 2.5GHz',
       'Intel Celeron Dual Core N3350 1.1GHz',
       'Intel Core i3 7130U 2.7GHz', 'Intel Core i7 7700HQ 2.8GHz',
       'Intel Core i5 2.0GHz', 'AMD Ryzen 1700 3GHz',
       'Intel Pentium Quad Core N4200 1.1GHz',
       'Intel Celeron Dual Core N3060 1.6GHz', 'Intel Core i5 1.3GHz',
       'AMD FX 9830P 3GHz', '

Now the dataset does not have missing values in any column.

In [88]:
dataset['Brand'] = dataset['Cpu'].apply(lambda x: x.split()[0])  # First word
dataset['Series'] = dataset['Cpu'].apply(lambda x: x.split()[1] if len(x.split()) > 1 else None)  # Second word
dataset['Core Type'] = dataset['Cpu'].str.extract(r'(\b(?:Quad|Dual|Octa)?\b Core)', expand=False)  # Core type
dataset['Model Number'] = dataset['Cpu'].str.extract(r'(\b[A-Za-z0-9\-]+[0-9]+\b)', expand=False)  # Model number
dataset['Clock Speed'] = dataset['Cpu'].str.extract(r'(\d+\.\d+GHz)', expand=False)  # Clock speed

In [89]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,Brand,Series,Core Type,Model Number,Clock Speed
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,2.3GHz
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,Standard,Standard,1440x900,Intel,Core,Core,i5,1.8GHz
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0,Standard,Full HD,1920x1080,Intel,Core,Core,i5,2.5GHz
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336,IPS Panel,Retina Display,2880x1800,Intel,Core,Core,i7,2.7GHz
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,3.1GHz


In [90]:
dataset.drop(columns=['Cpu'], inplace=True)

In [None]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Ram,Memory,Gpu,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,Brand,Series,Core Type,Model Number,Clock Speed
0,Apple,Ultrabook,13.3,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,2.3GHz
1,Apple,Ultrabook,13.3,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,Standard,Standard,1440x900,Intel,Core,Core,i5,1.8GHz
2,HP,Notebook,15.6,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0,Standard,Full HD,1920x1080,Intel,Core,Core,i5,2.5GHz
3,Apple,Ultrabook,15.4,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336,IPS Panel,Retina Display,2880x1800,Intel,Core,Core,i7,2.7GHz
4,Apple,Ultrabook,13.3,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,3.1GHz


Things are starting to look cleaner. Let's now rename some columns to for more precise information. 

In [None]:
dataset = dataset.rename(columns={'Brand': 'CPU Brand'})

In [91]:
dataset = dataset.rename(columns={'Series': 'CPU Series'})
dataset = dataset.rename(columns={'Core Type': 'CPU Core Type'})
dataset = dataset.rename(columns={'Clock Speed': 'CPU Clock Speed'})

In [92]:
dataset = dataset.rename(columns={'Model Number': 'CPU Model Number'})


In [93]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Ram,Memory,Gpu,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,Brand,CPU Series,CPU Core Type,CPU Model Number,CPU Clock Speed
0,Apple,Ultrabook,13.3,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,2.3GHz
1,Apple,Ultrabook,13.3,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,Standard,Standard,1440x900,Intel,Core,Core,i5,1.8GHz
2,HP,Notebook,15.6,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0,Standard,Full HD,1920x1080,Intel,Core,Core,i5,2.5GHz
3,Apple,Ultrabook,15.4,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336,IPS Panel,Retina Display,2880x1800,Intel,Core,Core,i7,2.7GHz
4,Apple,Ultrabook,13.3,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,3.1GHz


The next column that we need to feature engineer is the **gpu**. First let's take a look at the set of unique entries that this column has.

In [94]:
print(len(dataset['Gpu'].unique()), "unique entries to this column")
print(dataset['Gpu'].unique())

110 unique entries to this column
['Intel Iris Plus Graphics 640' 'Intel HD Graphics 6000'
 'Intel HD Graphics 620' 'AMD Radeon Pro 455'
 'Intel Iris Plus Graphics 650' 'AMD Radeon R5' 'Intel Iris Pro Graphics'
 'Nvidia GeForce MX150' 'Intel UHD Graphics 620' 'Intel HD Graphics 520'
 'AMD Radeon Pro 555' 'AMD Radeon R5 M430' 'Intel HD Graphics 615'
 'AMD Radeon Pro 560' 'Nvidia GeForce 940MX' 'Nvidia GeForce GTX 1050'
 'AMD Radeon R2' 'AMD Radeon 530' 'Nvidia GeForce 930MX'
 'Intel HD Graphics' 'Intel HD Graphics 500' 'Nvidia GeForce 930MX '
 'Nvidia GeForce GTX 1060' 'Nvidia GeForce 150MX'
 'Intel Iris Graphics 540' 'AMD Radeon RX 580' 'Nvidia GeForce 920MX'
 'AMD Radeon R4 Graphics' 'AMD Radeon 520' 'Nvidia GeForce GTX 1070'
 'Nvidia GeForce GTX 1050 Ti' 'Intel HD Graphics 400'
 'Nvidia GeForce MX130' 'AMD R4 Graphics' 'Nvidia GeForce GTX 940MX'
 'AMD Radeon RX 560' 'Nvidia GeForce 920M' 'AMD Radeon R7 M445'
 'AMD Radeon RX 550' 'Nvidia GeForce GTX 1050M' 'Intel HD Graphics 515'
 'AM

In [95]:
dataset['Gpu Brand'] = dataset['Gpu'].apply(lambda x: x.split()[0])
dataset['Gpu Series'] = dataset['Gpu'].apply(lambda x: ' '.join(x.split()[1:]) if len(x.split()) > 1 else None)
dataset['Gpu Type'] = dataset['Gpu'].str.extract(r'\b(GeForce|Quadro|Iris|Radeon|FirePro|HD Graphics)\b', expand=False)

print(dataset[['Gpu', 'Gpu Brand', 'Gpu Series', 'Gpu Type']].head())

                            Gpu Gpu Brand              Gpu Series     Gpu Type
0  Intel Iris Plus Graphics 640     Intel  Iris Plus Graphics 640         Iris
1        Intel HD Graphics 6000     Intel        HD Graphics 6000  HD Graphics
2         Intel HD Graphics 620     Intel         HD Graphics 620  HD Graphics
3            AMD Radeon Pro 455       AMD          Radeon Pro 455       Radeon
4  Intel Iris Plus Graphics 650     Intel  Iris Plus Graphics 650         Iris


In [96]:
dataset.drop(columns=['Gpu'], inplace=True)


In [97]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Ram,Memory,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,Brand,CPU Series,CPU Core Type,CPU Model Number,CPU Clock Speed,Gpu Brand,Gpu Series,Gpu Type
0,Apple,Ultrabook,13.3,8,128GB SSD,macOS,1.37,71378.6832,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,2.3GHz,Intel,Iris Plus Graphics 640,Iris
1,Apple,Ultrabook,13.3,8,128GB Flash Storage,macOS,1.34,47895.5232,Standard,Standard,1440x900,Intel,Core,Core,i5,1.8GHz,Intel,HD Graphics 6000,HD Graphics
2,HP,Notebook,15.6,8,256GB SSD,No OS,1.86,30636.0,Standard,Full HD,1920x1080,Intel,Core,Core,i5,2.5GHz,Intel,HD Graphics 620,HD Graphics
3,Apple,Ultrabook,15.4,16,512GB SSD,macOS,1.83,135195.336,IPS Panel,Retina Display,2880x1800,Intel,Core,Core,i7,2.7GHz,AMD,Radeon Pro 455,Radeon
4,Apple,Ultrabook,13.3,8,256GB SSD,macOS,1.37,96095.808,IPS Panel,Retina Display,2560x1600,Intel,Core,Core,i5,3.1GHz,Intel,Iris Plus Graphics 650,Iris


And lastly, I will treat another noisy column, namely, **memory**. 

In [98]:
print(len(dataset['Memory'].unique()))

40


In [99]:
print(dataset['Memory'].unique())

['128GB SSD' '128GB Flash Storage' '256GB SSD' '512GB SSD' '500GB HDD'
 '256GB Flash Storage' '1TB HDD' '128GB SSD +  1TB HDD'
 '256GB SSD +  256GB SSD' '64GB Flash Storage' '32GB Flash Storage'
 '256GB SSD +  1TB HDD' '256GB SSD +  2TB HDD' '32GB SSD' '2TB HDD'
 '64GB SSD' '1.0TB Hybrid' '512GB SSD +  1TB HDD' '1TB SSD'
 '256GB SSD +  500GB HDD' '128GB SSD +  2TB HDD' '512GB SSD +  512GB SSD'
 '16GB SSD' '16GB Flash Storage' '512GB SSD +  256GB SSD'
 '512GB SSD +  2TB HDD' '64GB Flash Storage +  1TB HDD' '180GB SSD'
 '1TB HDD +  1TB HDD' '32GB HDD' '1TB SSD +  1TB HDD' '?'
 '512GB Flash Storage' '128GB HDD' '240GB SSD' '8GB SSD' '508GB Hybrid'
 '1.0TB HDD' '512GB SSD +  1.0TB Hybrid' '256GB SSD +  1.0TB Hybrid']


In [100]:
dataset['Main Storage Size'] = dataset['Memory'].str.extract(r'(\d+GB|\d+\.\d+TB)')
dataset['Main Storage Type'] = dataset['Memory'].str.extract(r'(\bSSD\b|\bHDD\b|Flash Storage|Hybrid)')
dataset['Additional Storage Size'] = dataset['Memory'].str.extract(r'\+ *(\d+GB|\d+\.\d+TB)')
dataset['Additional Storage Type'] = dataset['Memory'].str.extract(r'\+ *\d+GB|\d+\.\d+TB *(\bSSD\b|\bHDD\b|Flash Storage|Hybrid)')

In [101]:
dataset.drop(columns=['Memory'], inplace=True)

In [102]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1273 entries, 0 to 1302
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Company                     1273 non-null   object 
 1   TypeName                    1273 non-null   object 
 2   Inches                      1273 non-null   object 
 3   Ram                         1273 non-null   object 
 4   OpSys                       1273 non-null   object 
 5   Weight                      1273 non-null   object 
 6   Price                       1273 non-null   float64
 7   Screen Panel Type           1273 non-null   object 
 8   Additional Screen Features  1273 non-null   object 
 9   Screen Resolution           1273 non-null   object 
 10  Brand                       1273 non-null   object 
 11  CPU Series                  1273 non-null   object 
 12  CPU Core Type               1195 non-null   object 
 13  CPU Model Number            1257 non-n

In [103]:
dataset.isnull().sum()

Company                          0
TypeName                         0
Inches                           0
Ram                              0
OpSys                            0
Weight                           0
Price                            0
Screen Panel Type                0
Additional Screen Features       0
Screen Resolution                0
Brand                            0
CPU Series                       0
CPU Core Type                   78
CPU Model Number                16
CPU Clock Speed                 84
Gpu Brand                        0
Gpu Series                       0
Gpu Type                        71
Main Storage Size              250
Main Storage Type                1
Additional Storage Size       1265
Additional Storage Type       1261
dtype: int64

In [104]:
dataset['Additional Storage Size'].fillna('0GB', inplace=True)  # Assume missing means no additional storage

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Additional Storage Size'].fillna('0GB', inplace=True)  # Assume missing means no additional storage


In [105]:
dataset.isnull().sum()

Company                          0
TypeName                         0
Inches                           0
Ram                              0
OpSys                            0
Weight                           0
Price                            0
Screen Panel Type                0
Additional Screen Features       0
Screen Resolution                0
Brand                            0
CPU Series                       0
CPU Core Type                   78
CPU Model Number                16
CPU Clock Speed                 84
Gpu Brand                        0
Gpu Series                       0
Gpu Type                        71
Main Storage Size              250
Main Storage Type                1
Additional Storage Size          0
Additional Storage Type       1261
dtype: int64

We se that the **Addtional Storage Type** has mostly null values, adding very little information, globally speaking. So we can remove the column and still expect not to lose much information.

In [106]:
dataset.drop(columns=['Additional Storage Type'], inplace=True)

In [107]:
dataset.isnull().sum()

Company                         0
TypeName                        0
Inches                          0
Ram                             0
OpSys                           0
Weight                          0
Price                           0
Screen Panel Type               0
Additional Screen Features      0
Screen Resolution               0
Brand                           0
CPU Series                      0
CPU Core Type                  78
CPU Model Number               16
CPU Clock Speed                84
Gpu Brand                       0
Gpu Series                      0
Gpu Type                       71
Main Storage Size             250
Main Storage Type               1
Additional Storage Size         0
dtype: int64

In [108]:
dataset['CPU Clock Speed'] = dataset['CPU Clock Speed'].str.replace('GHz', '').astype(float)
dataset['CPU Clock Speed'].fillna(dataset['CPU Clock Speed'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['CPU Clock Speed'].fillna(dataset['CPU Clock Speed'].mean(), inplace=True)


In [109]:
dataset.isnull().sum()

Company                         0
TypeName                        0
Inches                          0
Ram                             0
OpSys                           0
Weight                          0
Price                           0
Screen Panel Type               0
Additional Screen Features      0
Screen Resolution               0
Brand                           0
CPU Series                      0
CPU Core Type                  78
CPU Model Number               16
CPU Clock Speed                 0
Gpu Brand                       0
Gpu Series                      0
Gpu Type                       71
Main Storage Size             250
Main Storage Type               1
Additional Storage Size         0
dtype: int64

In [124]:
dataset['CPU Core Type'].fillna(dataset['CPU Core Type'].mode()[0], inplace=True)


We replaced the NaN values in the CPU Core Type column with the most commonly occuring entry in the column (the mode). This retains some level of generality, although we can check afterwards if incorporating other strategies change the model accuracy in any significant way.

In [127]:
dataset.isnull().sum()

Company                         0
TypeName                        0
Inches                          0
Ram                             0
OpSys                           0
Weight                          0
Price                           0
Screen Panel Type               0
Additional Screen Features      0
Screen Resolution               0
Brand                           0
CPU Series                      0
CPU Core Type                   0
CPU Model Number               16
CPU Clock Speed                 0
Gpu Brand                       0
Gpu Series                      0
Gpu Type                       71
Main Storage Size             250
Main Storage Type               1
Additional Storage Size         0
dtype: int64

We can now move on to another column with null values. We see the CPU Model Number has only 16 instances of having null values, which suggests that removing this column should not alter the model accuracy by a lot, since not a lot of information will be lost.

In [134]:
dataset.dropna(subset=['CPU Model Number'], inplace=True)

In [135]:
dataset.isnull().sum()

Company                         0
TypeName                        0
Inches                          0
Ram                             0
OpSys                           0
Weight                          0
Price                           0
Screen Panel Type               0
Additional Screen Features      0
Screen Resolution               0
Brand                           0
CPU Series                      0
CPU Core Type                   0
CPU Model Number                0
CPU Clock Speed                 0
Gpu Brand                       0
Gpu Series                      0
Gpu Type                       71
Main Storage Size             249
Main Storage Type               1
Additional Storage Size         0
dtype: int64

We can also do the same to the **GPU Type** column, as we would lose just about 5% of the available data. 

In [140]:
dataset.dropna(subset=['Gpu Type'], inplace=True)

In [142]:
dataset.dropna(subset=['Main Storage Type'], inplace=True)

In [143]:
dataset.isnull().sum()

Company                         0
TypeName                        0
Inches                          0
Ram                             0
OpSys                           0
Weight                          0
Price                           0
Screen Panel Type               0
Additional Screen Features      0
Screen Resolution               0
Brand                           0
CPU Series                      0
CPU Core Type                   0
CPU Model Number                0
CPU Clock Speed                 0
Gpu Brand                       0
Gpu Series                      0
Gpu Type                        0
Main Storage Size             239
Main Storage Type               0
Additional Storage Size         0
dtype: int64

Main Storage is one of the most relevant information that a laptop can have. Entries with no information about the total storage space should safely be removed.

In [145]:
dataset.dropna(subset=['Main Storage Size'], inplace=True)

In [147]:
dataset.isnull().sum()

Company                       0
TypeName                      0
Inches                        0
Ram                           0
OpSys                         0
Weight                        0
Price                         0
Screen Panel Type             0
Additional Screen Features    0
Screen Resolution             0
Brand                         0
CPU Series                    0
CPU Core Type                 0
CPU Model Number              0
CPU Clock Speed               0
Gpu Brand                     0
Gpu Series                    0
Gpu Type                      0
Main Storage Size             0
Main Storage Type             0
Additional Storage Size       0
dtype: int64

With this, we have a dataset with no null values in any column.

In [152]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,...,CPU Series,CPU Core Type,CPU Model Number,CPU Clock Speed,Gpu Brand,Gpu Series,Gpu Type,Main Storage Size,Main Storage Type,Additional Storage Size
0,Apple,Ultrabook,13.3,8,macOS,1.37,71378.6832,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,2.3,Intel,Iris Plus Graphics 640,Iris,128GB,SSD,0GB
1,Apple,Ultrabook,13.3,8,macOS,1.34,47895.5232,Standard,Standard,1440x900,...,Core,Core,i5,1.8,Intel,HD Graphics 6000,HD Graphics,128GB,Flash Storage,0GB
2,HP,Notebook,15.6,8,No OS,1.86,30636.0,Standard,Full HD,1920x1080,...,Core,Core,i5,2.5,Intel,HD Graphics 620,HD Graphics,256GB,SSD,0GB
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195.336,IPS Panel,Retina Display,2880x1800,...,Core,Core,i7,2.7,AMD,Radeon Pro 455,Radeon,512GB,SSD,0GB
4,Apple,Ultrabook,13.3,8,macOS,1.37,96095.808,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,3.1,Intel,Iris Plus Graphics 650,Iris,256GB,SSD,0GB


In [163]:
dataset['Price'] = dataset['Price'].astype(int)

In [162]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,...,CPU Series,CPU Core Type,CPU Model Number,CPU Clock Speed,Gpu Brand,Gpu Series,Gpu Type,Main Storage Size,Main Storage Type,Additional Storage Size
0,Apple,Ultrabook,13.3,8,macOS,1.37,71379,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,2.3,Intel,Iris Plus Graphics 640,Iris,128GB,SSD,0GB
1,Apple,Ultrabook,13.3,8,macOS,1.34,47896,Standard,Standard,1440x900,...,Core,Core,i5,1.8,Intel,HD Graphics 6000,HD Graphics,128GB,Flash Storage,0GB
2,HP,Notebook,15.6,8,No OS,1.86,30636,Standard,Full HD,1920x1080,...,Core,Core,i5,2.5,Intel,HD Graphics 620,HD Graphics,256GB,SSD,0GB
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195,IPS Panel,Retina Display,2880x1800,...,Core,Core,i7,2.7,AMD,Radeon Pro 455,Radeon,512GB,SSD,0GB
4,Apple,Ultrabook,13.3,8,macOS,1.37,96096,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,3.1,Intel,Iris Plus Graphics 650,Iris,256GB,SSD,0GB


In [168]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,...,CPU Series,CPU Core Type,CPU Model Number,CPU Clock Speed,Gpu Brand,Gpu Series,Gpu Type,Main Storage Size,Main Storage Type,Additional Storage Size
0,Apple,Ultrabook,13.3,8,macOS,1.37,71379,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,2.3,Intel,Iris Plus Graphics 640,Iris,128GB,SSD,0GB
1,Apple,Ultrabook,13.3,8,macOS,1.34,47896,Standard,Standard,1440x900,...,Core,Core,i5,1.8,Intel,HD Graphics 6000,HD Graphics,128GB,Flash Storage,0GB
2,HP,Notebook,15.6,8,No OS,1.86,30636,Standard,Full HD,1920x1080,...,Core,Core,i5,2.5,Intel,HD Graphics 620,HD Graphics,256GB,SSD,0GB
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195,IPS Panel,Retina Display,2880x1800,...,Core,Core,i7,2.7,AMD,Radeon Pro 455,Radeon,512GB,SSD,0GB
4,Apple,Ultrabook,13.3,8,macOS,1.37,96096,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,3.1,Intel,Iris Plus Graphics 650,Iris,256GB,SSD,0GB


In [171]:
print(dataset['Main Storage Size'].unique())
print(dataset['Additional Storage Size'].unique())


['128GB' '256GB' '512GB' '500GB' '64GB' '32GB' '1.0TB' '16GB' '180GB'
 '240GB' '8GB' '508GB']
['0GB' '500GB' '256GB' '1.0TB']


In [170]:
dataset['Main Storage Size'] = dataset['Main Storage Size'].str.replace('GB|TB', '').astype(float)
dataset['Additional Storage Size'] = dataset['Additional Storage Size'].str.replace('GB|TB', '').astype(float)

ValueError: could not convert string to float: '128GB'

There are some values in the columns consisting of main or additional storage sizes like 10.0GB which contain both string and float, which need to be made coherent first. We also have different units like GB and TB, which need to be made consistent.

In [172]:
#Function to make conistent units for size

def convert(size):
    if 'TB' in size:
        num_tb = float(size.replace('TB', ''))
        return num_tb * 1024
    elif 'GB' in size:
        return float(size.replace('GB', ''))
    else:
        return 0.0

In [175]:
dataset['Main Storage Size'] = dataset['Main Storage Size'].apply(convert)

In [177]:
dataset['Main Storage Size'] = dataset['Main Storage Size'].astype(int)

In [179]:
dataset['Main Storage Size'].unique()

array([ 128,  256,  512,  500,   64,   32, 1024,   16,  180,  240,    8,
        508])

In [183]:
dataset['Additional Storage Size'] = dataset['Additional Storage Size'].astype(int)
dataset['Additional Storage Size'].unique()

array([   0,  500,  256, 1024])

In [184]:
dataset.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,...,CPU Series,CPU Core Type,CPU Model Number,CPU Clock Speed,Gpu Brand,Gpu Series,Gpu Type,Main Storage Size,Main Storage Type,Additional Storage Size
0,Apple,Ultrabook,13.3,8,macOS,1.37,71379,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,2.3,Intel,Iris Plus Graphics 640,Iris,128,SSD,0
1,Apple,Ultrabook,13.3,8,macOS,1.34,47896,Standard,Standard,1440x900,...,Core,Core,i5,1.8,Intel,HD Graphics 6000,HD Graphics,128,Flash Storage,0
2,HP,Notebook,15.6,8,No OS,1.86,30636,Standard,Full HD,1920x1080,...,Core,Core,i5,2.5,Intel,HD Graphics 620,HD Graphics,256,SSD,0
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195,IPS Panel,Retina Display,2880x1800,...,Core,Core,i7,2.7,AMD,Radeon Pro 455,Radeon,512,SSD,0
4,Apple,Ultrabook,13.3,8,macOS,1.37,96096,IPS Panel,Retina Display,2560x1600,...,Core,Core,i5,3.1,Intel,Iris Plus Graphics 650,Iris,256,SSD,0


In [185]:
dataset.drop(columns='CPU Core Type')

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,Screen Panel Type,Additional Screen Features,Screen Resolution,Brand,CPU Series,CPU Model Number,CPU Clock Speed,Gpu Brand,Gpu Series,Gpu Type,Main Storage Size,Main Storage Type,Additional Storage Size
0,Apple,Ultrabook,13.3,8,macOS,1.37,71379,IPS Panel,Retina Display,2560x1600,Intel,Core,i5,2.3,Intel,Iris Plus Graphics 640,Iris,128,SSD,0
1,Apple,Ultrabook,13.3,8,macOS,1.34,47896,Standard,Standard,1440x900,Intel,Core,i5,1.8,Intel,HD Graphics 6000,HD Graphics,128,Flash Storage,0
2,HP,Notebook,15.6,8,No OS,1.86,30636,Standard,Full HD,1920x1080,Intel,Core,i5,2.5,Intel,HD Graphics 620,HD Graphics,256,SSD,0
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195,IPS Panel,Retina Display,2880x1800,Intel,Core,i7,2.7,AMD,Radeon Pro 455,Radeon,512,SSD,0
4,Apple,Ultrabook,13.3,8,macOS,1.37,96096,IPS Panel,Retina Display,2560x1600,Intel,Core,i5,3.1,Intel,Iris Plus Graphics 650,Iris,256,SSD,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,Asus,Notebook,15.6,4,Windows 10,2.2,38379,Standard,Standard,1366x768,Intel,Core,i7,2.5,Nvidia,GeForce 920M,GeForce,500,HDD,0
1298,Lenovo,2 in 1 Convertible,14,4,Windows 10,1.8,33993,IPS Panel,Full HD,1920x1080,Intel,Core,i7,2.5,Intel,HD Graphics 520,HD Graphics,128,SSD,0
1299,Lenovo,2 in 1 Convertible,13.3,16,Windows 10,1.3,79867,IPS Panel,Quad HD+,3200x1800,Intel,Core,i7,2.5,Intel,HD Graphics 520,HD Graphics,512,SSD,0
1300,Lenovo,Notebook,14,2,Windows 10,1.5,12201,Standard,Standard,1366x768,Intel,Celeron,N3050,1.6,Intel,HD Graphics,HD Graphics,64,Flash Storage,0
