In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import re
import seaborn as sb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [43]:
data = r"C:\Users\guill\OneDrive\Documents\Vishrut\PhD applications\Projects\(ML) Laptop Price Prediction\laptopData.csv"
df = pd.read_csv(data)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1273 non-null   float64
 1   Company           1273 non-null   object 
 2   TypeName          1273 non-null   object 
 3   Inches            1273 non-null   object 
 4   ScreenResolution  1273 non-null   object 
 5   Cpu               1273 non-null   object 
 6   Ram               1273 non-null   object 
 7   Memory            1273 non-null   object 
 8   Gpu               1273 non-null   object 
 9   OpSys             1273 non-null   object 
 10  Weight            1273 non-null   object 
 11  Price             1273 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB


Here, unnamed in a column that quite evidently doesn't contribute to the pricing. Should be removed (dropped)

In [45]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [46]:
print(len(df['Cpu'].unique()), "unique entries just in this column, that could be the result of just combinations to two separate classes of information, each containing less diversity, but creating more when combined")

119 unique entries just in this column, that could be the result of just combinations to two separate classes of information, each containing less diversity, but creating more when combined


Now it is pretty clear that a bunch of columns are noisy, and in need of feature engineering and preprocessing.

In [47]:
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

missing_percentage = df.isnull().mean() * 100 # Check the percentage of missing values in each column
print("Percentage of missing values per column:")
print(missing_percentage)

Missing values per column:
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64
Percentage of missing values per column:
Company             2.302379
TypeName            2.302379
Inches              2.302379
ScreenResolution    2.302379
Cpu                 2.302379
Ram                 2.302379
Memory              2.302379
Gpu                 2.302379
OpSys               2.302379
Weight              2.302379
Price               2.302379
dtype: float64


We also see that the df has missing values. But interestingly, all the columns have the same number of missing value, that suggests that perhaps the missing values are occuring in related places and are not completely randomly distributed.

A little exploration of the df has revealed that indeed the missing values are actually missing rows, and in that case, the cleaning procedure I would adopt is to simply drop those rows. There is not risk of losing addtional information.

In [48]:
df.dropna(axis=0, inplace = True)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1273 entries, 0 to 1302
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1273 non-null   object 
 1   TypeName          1273 non-null   object 
 2   Inches            1273 non-null   object 
 3   ScreenResolution  1273 non-null   object 
 4   Cpu               1273 non-null   object 
 5   Ram               1273 non-null   object 
 6   Memory            1273 non-null   object 
 7   Gpu               1273 non-null   object 
 8   OpSys             1273 non-null   object 
 9   Weight            1273 non-null   object 
 10  Price             1273 non-null   float64
dtypes: float64(1), object(10)
memory usage: 119.3+ KB


In [50]:
print(df.isnull().sum())

Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64


In [51]:
df.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


The occurrance of units in columns like weight does not add any additional meaning to the df. Another similar example is 'GB'.  

In [52]:
df['Ram'] = df['Ram'].str.replace("GB", "")
df['Weight'] = df['Weight'].str.replace("kg", "")

In [53]:
df.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808


In [54]:
df.ScreenResolution.value_counts()

ScreenResolution
Full HD 1920x1080                                495
1366x768                                         274
IPS Panel Full HD 1920x1080                      226
IPS Panel Full HD / Touchscreen 1920x1080         52
Full HD / Touchscreen 1920x1080                   45
1600x900                                          23
Touchscreen 1366x768                              16
Quad HD+ / Touchscreen 3200x1800                  14
IPS Panel 4K Ultra HD 3840x2160                   12
IPS Panel 4K Ultra HD / Touchscreen 3840x2160     11
4K Ultra HD / Touchscreen 3840x2160                9
4K Ultra HD 3840x2160                              7
IPS Panel 1366x768                                 7
IPS Panel Retina Display 2560x1600                 6
IPS Panel Quad HD+ / Touchscreen 3200x1800         6
Touchscreen 2560x1440                              6
IPS Panel Retina Display 2304x1440                 6
Touchscreen 2256x1504                              6
IPS Panel Touchscreen 2560x14

In [55]:
df.ScreenResolution.str.split(" ").apply(lambda x: x[0])

0            IPS
1       1440x900
2           Full
3            IPS
4            IPS
          ...   
1298         IPS
1299         IPS
1300    1366x768
1301    1366x768
1302    1366x768
Name: ScreenResolution, Length: 1273, dtype: object

In [56]:
# Extract resolution into a new column
df["Resolution"] = df.ScreenResolution.str.split(" ").apply(lambda x: x[-1])

# Remove resolution from the original column
df["ScreenResolution"] = df.apply(
    lambda row: row["ScreenResolution"].replace(row["Resolution"], "").strip(),
    axis=1
)


In [57]:
df["ScreenBreadth"] = df.Resolution.str.split("x").apply(lambda x : x[0])
df["ScreenLength"] = df.Resolution.str.split("x").apply(lambda x : x[1])

In [58]:
df.ScreenResolution.value_counts()

ScreenResolution
Full HD                                495
                                       307
IPS Panel Full HD                      231
IPS Panel Full HD / Touchscreen         52
Full HD / Touchscreen                   45
Touchscreen                             31
IPS Panel Retina Display                17
Quad HD+ / Touchscreen                  14
IPS Panel Touchscreen                   12
IPS Panel 4K Ultra HD                   12
IPS Panel 4K Ultra HD / Touchscreen     11
IPS Panel                               11
4K Ultra HD / Touchscreen                9
4K Ultra HD                              7
IPS Panel Quad HD+ / Touchscreen         6
IPS Panel Quad HD+                       5
Quad HD+                                 3
IPS Panel Touchscreen / 4K Ultra HD      2
Touchscreen / Quad HD+                   1
Touchscreen / Full HD                    1
Touchscreen / 4K Ultra HD                1
Name: count, dtype: int64

In [59]:
df.drop("ScreenResolution", axis = 1)

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Resolution,ScreenBreadth,ScreenLength
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,2560x1600,2560,1600
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,1440x900,1440,900
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0000,1920x1080,1920,1080
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.3360,2880x1800,2880,1800
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.8080,2560x1600,2560,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,14,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows 10,1.8,33992.6400,1920x1080,1920,1080
1299,Lenovo,2 in 1 Convertible,13.3,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows 10,1.3,79866.7200,3200x1800,3200,1800
1300,Lenovo,Notebook,14,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5,12201.1200,1366x768,1366,768
1301,HP,Notebook,15.6,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19,40705.9200,1366x768,1366,768


### Screen Resolution: Cleaning and Simplification

The "Screen Resolution" column contains noisy and inconsistent data. To organize it effectively, we can extract and categorize the key pieces of information:

- **Panel Type**: Examples include *IPS Panel* and *Touchscreen*.
- **Resolution**: Common formats include *1920x1080* and *2560x1600*.
- **Additional Features**: Such as *Retina Display* and *4K Ultra HD*.


In [60]:
# Function to simplify screen resolution
res = df['ScreenResolution']
def simplify_resolution(res):
    # Extract panel type (e.g., IPS, Touchscreen)
    panel = re.search(r'(IPS Panel|Touchscreen)', res)
    panel = panel.group(0) if panel else 'Standard'
    
    # Extract additional features (e.g., Retina, 4K)
    feature = re.search(r'(Retina Display|4K Ultra HD|Full HD|Quad HD\+)', res)
    feature = feature.group(0) if feature else 'Standard'

    return f'{panel}, {feature}'

# Apply the function
df['SimplifiedResolution'] = df['ScreenResolution'].apply(simplify_resolution)
df.drop(columns=['ScreenResolution'],inplace=True)

In [61]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Resolution,ScreenBreadth,ScreenLength,SimplifiedResolution
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,2560x1600,2560,1600,"IPS Panel, Retina Display"
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,1440x900,1440,900,"Standard, Standard"
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0,1920x1080,1920,1080,"Standard, Full HD"
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336,2880x1800,2880,1800,"IPS Panel, Retina Display"
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,2560x1600,2560,1600,"IPS Panel, Retina Display"


Now that the ScreenResolution column has been simplified with the use of comma separated values, it would be easier to separate these three informations into three separate columns for further simplification.

In [62]:
df[['Screen Panel Type', 'Additional Screen Features']] = df['SimplifiedResolution'].str.split(', ', expand=True)
df.drop(columns=['SimplifiedResolution'], inplace=True)

In [63]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Resolution,ScreenBreadth,ScreenLength,Screen Panel Type,Additional Screen Features
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,2560x1600,2560,1600,IPS Panel,Retina Display
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895.5232,1440x900,1440,900,Standard,Standard
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636.0,1920x1080,1920,1080,Standard,Full HD
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195.336,2880x1800,2880,1800,IPS Panel,Retina Display
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,2560x1600,2560,1600,IPS Panel,Retina Display


In [64]:
df.Price = df.Price.astype(int)

In [65]:
df.Gpu.str.split(" ").apply(lambda x : x[1]).value_counts()

Gpu
HD            622
GeForce       361
Radeon        169
UHD            66
Quadro         31
Iris           14
FirePro         5
R4              1
GTX             1
R17M-M1-70      1
Graphics        1
Mali            1
Name: count, dtype: int64

In [26]:
df["GPU Brand"] = df.Gpu.str.split(" ").apply(lambda x : x[0])
df["GPU Series"] = df.Gpu.str.split(" ").apply(lambda x : x[1]) 

df = df.drop("Gpu", axis = 1)
df.head()

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,OpSys,Weight,Price,Resolution,ScreenBreadth,ScreenLength,Screen Panel Type,Additional Screen Features,GPU Brand,GPU Series
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,macOS,1.37,71378,2560x1600,2560,1600,IPS Panel,Retina Display,Intel,Iris
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,macOS,1.34,47895,1440x900,1440,900,Standard,Standard,Intel,HD
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,No OS,1.86,30636,1920x1080,1920,1080,Standard,Full HD,Intel,HD
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,macOS,1.83,135195,2880x1800,2880,1800,IPS Panel,Retina Display,AMD,Radeon
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,macOS,1.37,96095,2560x1600,2560,1600,IPS Panel,Retina Display,Intel,Iris


In [66]:
df = df.drop("Resolution", axis = 1)

In [67]:
df.Cpu.str.split(" ").apply(lambda x : x[1]).value_counts()

Cpu
Core          1078
Celeron         87
Pentium         30
A9-Series       17
Atom            11
A6-Series       11
E-Series         9
A12-Series       8
A10-Series       6
Ryzen            4
Xeon             4
A8-Series        4
FX               2
A4-Series        1
Cortex           1
Name: count, dtype: int64

In [68]:
df["CPU Brand"] = df.Cpu.str.split(" ").apply(lambda x : x[0])
df["CPU Series"] = df.Cpu.str.split(" ").apply(lambda x : x[1])
df["CPU Clock Speed"] = df.Cpu.str.split(" ").apply(lambda x : x[-1])

In [69]:
df = df.drop("Cpu", axis = 1)

In [70]:
df["CPU Clock Speed"] = df["CPU Clock Speed"].str.replace(r'(GHz)$', '', regex=True)

In [71]:
df.Memory.str.split(" ").value_counts()

Memory
[256GB, SSD]                             401
[1TB, HDD]                               217
[500GB, HDD]                             130
[512GB, SSD]                             116
[128GB, SSD, +, , 1TB, HDD]               92
[128GB, SSD]                              74
[256GB, SSD, +, , 1TB, HDD]               71
[32GB, Flash, Storage]                    37
[2TB, HDD]                                16
[64GB, Flash, Storage]                    14
[512GB, SSD, +, , 1TB, HDD]               14
[1TB, SSD]                                13
[256GB, SSD, +, , 2TB, HDD]               10
[1.0TB, Hybrid]                            9
[256GB, Flash, Storage]                    8
[16GB, Flash, Storage]                     7
[32GB, SSD]                                6
[180GB, SSD]                               4
[128GB, Flash, Storage]                    4
[512GB, SSD, +, , 2TB, HDD]                3
[16GB, SSD]                                3
[512GB, Flash, Storage]                    2
[1T

In [72]:
df.Memory.str.split(" ").apply(lambda x : x[0]).value_counts()

Memory
256GB    495
1TB      233
128GB    173
512GB    138
500GB    130
32GB      44
64GB      16
2TB       16
1.0TB     10
16GB      10
180GB      4
?          1
240GB      1
8GB        1
508GB      1
Name: count, dtype: int64

In [73]:
df["Memory Size"] = df.Memory.str.split(" ").apply(lambda x : x[0])

In [None]:
df["Memory Size"] = df["Memory Size"].apply(lambda x: str(float(x.replace("TB", "")) * 1024) + " GB" if "TB" in x else x)
df["Memory Size"] = df["Memory Size"].str.replace(r'(GB|TB)$', '', regex=True)

In [75]:
df["Memory Size"] = df["Memory Size"].replace("?", np.nan)
df["Memory Size"] = df["Memory Size"].interpolate(method='linear', limit_direction='both')

  df["Memory Size"] = df["Memory Size"].interpolate(method='linear', limit_direction='both')


In [76]:
df["Memory Size"].isna().sum()  # To check how many NaN values remain


1

In [77]:
df["Memory Size"] = df["Memory Size"].astype(str).str.split(".").apply(lambda x: x[0])

In [78]:
df = df.dropna(subset=["Memory Size"])

In [79]:
df["Memory Size"].value_counts()

Memory Size
256     495
1024    243
128     173
512     138
500     130
32       44
64       16
2048     16
16       10
180       4
nan       1
240       1
8         1
508       1
Name: count, dtype: int64

In [89]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,Memory,Gpu,OpSys,Weight,Price,ScreenBreadth,ScreenLength,Screen Panel Type,Additional Screen Features,CPU Brand,CPU Series,CPU Clock Speed,Memory Size
0,Apple,Ultrabook,13.3,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378,2560,1600,IPS Panel,Retina Display,Intel,Core,2.3,128
1,Apple,Ultrabook,13.3,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895,1440,900,Standard,Standard,Intel,Core,1.8,128
2,HP,Notebook,15.6,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636,1920,1080,Standard,Full HD,Intel,Core,2.5,256
3,Apple,Ultrabook,15.4,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195,2880,1800,IPS Panel,Retina Display,Intel,Core,2.7,512
4,Apple,Ultrabook,13.3,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095,2560,1600,IPS Panel,Retina Display,Intel,Core,3.1,256


In [111]:
df["Memory"].str.split("+").apply(lambda x: x[:]).value_counts()

Memory
[256GB SSD]                         401
[1TB HDD]                           217
[500GB HDD]                         130
[512GB SSD]                         116
[128GB SSD ,   1TB HDD]              92
[128GB SSD]                          74
[256GB SSD ,   1TB HDD]              71
[32GB Flash Storage]                 37
[2TB HDD]                            16
[64GB Flash Storage]                 14
[512GB SSD ,   1TB HDD]              14
[1TB SSD]                            13
[256GB SSD ,   2TB HDD]              10
[1.0TB Hybrid]                        9
[256GB Flash Storage]                 8
[16GB Flash Storage]                  7
[32GB SSD]                            6
[180GB SSD]                           4
[128GB Flash Storage]                 4
[512GB SSD ,   2TB HDD]               3
[16GB SSD]                            3
[512GB Flash Storage]                 2
[1TB SSD ,   1TB HDD]                 2
[128GB SSD ,   2TB HDD]               2
[256GB SSD ,   500GB HDD]        

The same feature engineering has to be applied to some other columns, notably the **memory**, **cpu** and **gpu** columns

In [150]:
df["Additional Memory"] = df['Memory'][df['Memory'].str.contains('\+')].str.split(" ").apply(lambda x : x[-2])
df["Additional Memory Type"] = df['Memory'][df['Memory'].str.contains('\+')].str.split(" ").apply(lambda x : x[-1])

In [None]:
df["Additional Memory"] = df["Additional Memory"].apply(lambda x: str(float(x.replace("TB", "")) * 1024) + " GB" if isinstance(x, str) and "TB" in x else x)

In [169]:
df["Additional Memory"] = df["Additional Memory"].apply(lambda x: re.sub(r'(GB|TB)$', '', x) if isinstance(x, str) else x)

In [171]:
df["Additional Memory"] = df["Additional Memory"].astype(str).str.split(".").apply(lambda x: x[0])

In [188]:
df["Additional Memory"] = df["Additional Memory"].fillna(0).astype(float)

In [194]:
df["Additional Memory"] = df["Additional Memory"].astype(int)

drop the GB and TB and round the digits off to integers

In [195]:
df["Additional Memory"].value_counts()

Additional Memory
0       1069
1024     183
2048      15
256        3
500        2
512        1
Name: count, dtype: int64

In [218]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,Memory,Gpu,OpSys,Weight,Price,ScreenBreadth,ScreenLength,Screen Panel Type,Additional Screen Features,CPU Brand,CPU Series,CPU Clock Speed,Memory Size,Additional Memory
0,Apple,Ultrabook,13.3,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378,2560,1600,IPS Panel,Retina Display,Intel,Core,2.3,128,0
1,Apple,Ultrabook,13.3,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,47895,1440,900,Standard,Standard,Intel,Core,1.8,128,0
2,HP,Notebook,15.6,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,30636,1920,1080,Standard,Full HD,Intel,Core,2.5,256,0
3,Apple,Ultrabook,15.4,16,512GB SSD,AMD Radeon Pro 455,macOS,1.83,135195,2880,1800,IPS Panel,Retina Display,Intel,Core,2.7,512,0
4,Apple,Ultrabook,13.3,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37,96095,2560,1600,IPS Panel,Retina Display,Intel,Core,3.1,256,0


In [207]:
df = df.drop(columns=["Additional Memory Type"])

In [233]:
df["Memory Type"] = df["Memory"].str.split(" ").apply(lambda x: x[1] if len(x) > 1 else None)

In [238]:
df = df.dropna(subset=["Memory Type"])

In [239]:
df = df.drop(columns=["Memory"])

In [243]:
df.rename(columns={'Price':'Price (INR)'})

Unnamed: 0,Company,TypeName,Inches,Ram,Gpu,OpSys,Weight,Price (INR),ScreenBreadth,ScreenLength,Screen Panel Type,Additional Screen Features,CPU Brand,CPU Series,CPU Clock Speed,Memory Size,Additional Memory,Memory Type
0,Apple,Ultrabook,13.3,8,Intel Iris Plus Graphics 640,macOS,1.37,71378,2560,1600,IPS Panel,Retina Display,Intel,Core,2.3,128,0,SSD
1,Apple,Ultrabook,13.3,8,Intel HD Graphics 6000,macOS,1.34,47895,1440,900,Standard,Standard,Intel,Core,1.8,128,0,Flash
2,HP,Notebook,15.6,8,Intel HD Graphics 620,No OS,1.86,30636,1920,1080,Standard,Full HD,Intel,Core,2.5,256,0,SSD
3,Apple,Ultrabook,15.4,16,AMD Radeon Pro 455,macOS,1.83,135195,2880,1800,IPS Panel,Retina Display,Intel,Core,2.7,512,0,SSD
4,Apple,Ultrabook,13.3,8,Intel Iris Plus Graphics 650,macOS,1.37,96095,2560,1600,IPS Panel,Retina Display,Intel,Core,3.1,256,0,SSD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,14,4,Intel HD Graphics 520,Windows 10,1.8,33992,1920,1080,IPS Panel,Full HD,Intel,Core,2.5,128,0,SSD
1299,Lenovo,2 in 1 Convertible,13.3,16,Intel HD Graphics 520,Windows 10,1.3,79866,3200,1800,IPS Panel,Quad HD+,Intel,Core,2.5,512,0,SSD
1300,Lenovo,Notebook,14,2,Intel HD Graphics,Windows 10,1.5,12201,1366,768,Standard,Standard,Intel,Celeron,1.6,64,0,Flash
1301,HP,Notebook,15.6,6,AMD Radeon R5 M330,Windows 10,2.19,40705,1366,768,Standard,Standard,Intel,Core,2.5,1024,0,HDD


In [247]:
df["GPU Brand"] = df.Gpu.str.split(" ").apply(lambda x:x[0])

In [266]:
df["GPU Series"] = df.Gpu.str.split(" ").apply(lambda x:x[1])

In [268]:
df.drop(columns="Gpu")

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,ScreenBreadth,ScreenLength,Screen Panel Type,Additional Screen Features,CPU Brand,CPU Series,CPU Clock Speed,Memory Size,Additional Memory,Memory Type,GPU Brand,GPU Series
0,Apple,Ultrabook,13.3,8,macOS,1.37,71378,2560,1600,IPS Panel,Retina Display,Intel,Core,2.3,128,0,SSD,Intel,Iris
1,Apple,Ultrabook,13.3,8,macOS,1.34,47895,1440,900,Standard,Standard,Intel,Core,1.8,128,0,Flash,Intel,HD
2,HP,Notebook,15.6,8,No OS,1.86,30636,1920,1080,Standard,Full HD,Intel,Core,2.5,256,0,SSD,Intel,HD
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195,2880,1800,IPS Panel,Retina Display,Intel,Core,2.7,512,0,SSD,AMD,Radeon
4,Apple,Ultrabook,13.3,8,macOS,1.37,96095,2560,1600,IPS Panel,Retina Display,Intel,Core,3.1,256,0,SSD,Intel,Iris
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,14,4,Windows 10,1.8,33992,1920,1080,IPS Panel,Full HD,Intel,Core,2.5,128,0,SSD,Intel,HD
1299,Lenovo,2 in 1 Convertible,13.3,16,Windows 10,1.3,79866,3200,1800,IPS Panel,Quad HD+,Intel,Core,2.5,512,0,SSD,Intel,HD
1300,Lenovo,Notebook,14,2,Windows 10,1.5,12201,1366,768,Standard,Standard,Intel,Celeron,1.6,64,0,Flash,Intel,HD
1301,HP,Notebook,15.6,6,Windows 10,2.19,40705,1366,768,Standard,Standard,Intel,Core,2.5,1024,0,HDD,AMD,Radeon


Let start with cpu.

In [290]:
df.Weight = df.Weight.round(1)

In [None]:
df.Weight = df.Weight.str.replace("?", "")

In [314]:
df["Weight"] = pd.to_numeric(df["Weight"].replace("?", None), errors="coerce")

In [315]:
df["Weight"] = df["Weight"].interpolate()

In [316]:
df["Weight"] = df["Weight"].round(1)

In [382]:
df = df.drop(columns="Gpu")

In [352]:
df.Inches = df.Inches.str.replace("?", "")

In [354]:
df.Inches = df.Inches.interpolate()

  df.Inches = df.Inches.interpolate()


In [383]:
df

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,ScreenBreadth,ScreenLength,Screen Panel Type,Additional Screen Features,CPU Brand,CPU Series,CPU Clock Speed,Memory Size,Additional Memory,Memory Type,GPU Brand,GPU Series
0,Apple,Ultrabook,13.3,8,macOS,1.4,71378,2560,1600,IPS Panel,Retina Display,Intel,Core,2.3,128,0,SSD,Intel,Iris
1,Apple,Ultrabook,13.3,8,macOS,1.3,47895,1440,900,Standard,Standard,Intel,Core,1.8,128,0,Flash,Intel,HD
2,HP,Notebook,15.6,8,No OS,1.9,30636,1920,1080,Standard,Full HD,Intel,Core,2.5,256,0,SSD,Intel,HD
3,Apple,Ultrabook,15.4,16,macOS,1.8,135195,2880,1800,IPS Panel,Retina Display,Intel,Core,2.7,512,0,SSD,AMD,Radeon
4,Apple,Ultrabook,13.3,8,macOS,1.4,96095,2560,1600,IPS Panel,Retina Display,Intel,Core,3.1,256,0,SSD,Intel,Iris
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,14,4,Windows 10,1.8,33992,1920,1080,IPS Panel,Full HD,Intel,Core,2.5,128,0,SSD,Intel,HD
1299,Lenovo,2 in 1 Convertible,13.3,16,Windows 10,1.3,79866,3200,1800,IPS Panel,Quad HD+,Intel,Core,2.5,512,0,SSD,Intel,HD
1300,Lenovo,Notebook,14,2,Windows 10,1.5,12201,1366,768,Standard,Standard,Intel,Celeron,1.6,64,0,Flash,Intel,HD
1301,HP,Notebook,15.6,6,Windows 10,2.2,40705,1366,768,Standard,Standard,Intel,Core,2.5,1024,0,HDD,AMD,Radeon


In [384]:
df.to_csv(r"C:\Users\guill\OneDrive\Documents\Vishrut\PhD applications\Projects\(ML) Laptop Price Prediction/edited_dataframe.csv", index=False)