# Data Cleaning Using pandas

In [410]:
# import libraries
import pandas as pd

In [411]:
# Load the raw laptop listings from the CSV next to the notebook and preview the first rows.
df = pd.read_csv('laptops.csv')
df.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [412]:
# Inspect dtypes and non-null counts of the raw frame before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [413]:
# List the raw column labels to spot inconsistencies (mixed case, spaces, stray whitespace).
df.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [414]:
# Renaming the columns
def clean_col(col):
    """Normalize a raw column label into a lower-case snake_case name.

    Surrounding whitespace is trimmed, the phrase "Operating System" is
    shortened to "os", spaces become underscores, parentheses are dropped
    and the result is lower-cased.
    """
    shortened = col.strip().replace("Operating System", "os")
    # Single-character substitutions: space -> underscore, parentheses removed.
    char_table = str.maketrans({" ": "_", "(": None, ")": None})
    return shortened.translate(char_table).lower()

# Run every header through clean_col and install the normalized labels.
new_columns = [clean_col(raw_name) for raw_name in df.columns]

df.columns = new_columns
df.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [415]:
# Check the distinct RAM strings to confirm the format before converting to a number.
df['ram'].unique()

array(['8GB', '16GB', '4GB', '2GB', '12GB', '6GB', '32GB', '24GB', '64GB'],
      dtype=object)

In [416]:
# Count weight strings that consist solely of numeric characters.
# Note: str.isnumeric() is also False for plain decimals such as '1.5'
# (the '.' is not a numeric character), so this only detects bare integers.
df['weight'].str.isnumeric().sum()

0

In [417]:
# For each column: the literal substitutions to apply (in order) and the dtype to cast to.
# price_euros swaps ',' for '.' before the float cast; weight drops 'kg' and then any
# remaining 's' (presumably for values written as 'kgs' -- confirm against the raw data).
numeric_conversions = [
    ('ram', [('GB', '')], 'int64'),
    ('screen_size', [('"', '')], 'float64'),
    ('price_euros', [(',', '.')], 'float64'),
    ('weight', [('kg', ''), ('s', '')], 'float64'),
]

for column_name, substitutions, target_dtype in numeric_conversions:
    cleaned = df[column_name]
    for old_text, new_text in substitutions:
        cleaned = cleaned.str.replace(old_text, new_text)
    df[column_name] = cleaned.astype(target_dtype)

In [418]:
# Preview the last rows to confirm the converted columns now hold plain numbers.
df.tail()

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
1298,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows,10,1.8,638.0
1299,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows,10,1.3,1499.0
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10,1.5,229.0
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10,2.19,764.0
1302,Asus,X553SA-XX031T (N3050/4GB/500GB/W10),Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,4,500GB HDD,Intel HD Graphics,Windows,10,2.2,369.0


In [419]:
# Put the unit into the column name now that the values themselves are plain numbers.
# Reassigning the result (instead of inplace=True) and naming the axis via `columns=`
# keeps the step explicit and chainable.
df = df.rename(columns={'ram': 'ram_gb', 'weight': 'weight_kg'})
df.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram_gb', 'storage', 'gpu', 'os', 'os_version', 'weight_kg',
       'price_euros'],
      dtype='object')

In [420]:
# The first whitespace-separated token of each CPU description is taken as the manufacturer.
cpu_tokens = df['cpu'].str.split()
cpu_manufacturer = cpu_tokens.str.get(0)
cpu_manufacturer.value_counts()

cpu
Intel      1240
AMD          62
Samsung       1
Name: count, dtype: int64

In [421]:

# Canonical spelling for the known OS labels; the only real change is
# folding 'Mac OS' into 'macOS'.
mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}

# Series.map() converts any label that is missing from the dict into NaN, which
# would silently erase data if an unexpected OS label appeared. Fall back to the
# original label for anything the dict does not cover.
df['os'] = df['os'].map(mapping_dict).fillna(df['os'])
df['os'].value_counts()

os
Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          21
Android         2
Name: count, dtype: int64

In [422]:
# Missing values per column.
df.isna().sum()

manufacturer      0
model_name        0
category          0
screen_size       0
screen            0
cpu               0
ram_gb            0
storage           0
gpu               0
os                0
os_version      170
weight_kg         0
price_euros       0
dtype: int64

In [423]:
# Break down the rows that lack an OS version by operating system.
version_missing = df['os_version'].isna()
df.loc[version_missing, 'os'].value_counts()

os
No OS        66
Linux        62
Chrome OS    27
macOS        13
Android       2
Name: count, dtype: int64

In [424]:
# Fill only the rows whose version is actually missing, so that a version
# already present in the data is never overwritten.
version_missing = df['os_version'].isnull()

# macOS rows without a version get the placeholder 'X'.
df.loc[version_missing & (df['os'] == 'macOS'), 'os_version'] = 'X'

# For these systems the missing version is recorded explicitly as unknown.
unversioned_os = ['No OS', 'Linux', 'Chrome OS', 'Android']
df.loc[version_missing & df['os'].isin(unversioned_os), 'os_version'] = 'Version Unknown'

In [425]:
# Re-count missing values per column to confirm nothing is left unfilled.
df.isna().sum()

manufacturer    0
model_name      0
category        0
screen_size     0
screen          0
cpu             0
ram_gb          0
storage         0
gpu             0
os              0
os_version      0
weight_kg       0
price_euros     0
dtype: int64

In [426]:
# Final check of dtypes and non-null counts on the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  1303 non-null   object 
 1   model_name    1303 non-null   object 
 2   category      1303 non-null   object 
 3   screen_size   1303 non-null   float64
 4   screen        1303 non-null   object 
 5   cpu           1303 non-null   object 
 6   ram_gb        1303 non-null   int64  
 7   storage       1303 non-null   object 
 8   gpu           1303 non-null   object 
 9   os            1303 non-null   object 
 10  os_version    1303 non-null   object 
 11  weight_kg     1303 non-null   float64
 12  price_euros   1303 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 132.5+ KB


In [427]:
# Export step, left disabled; uncomment to write the cleaned frame to disk without the index.
# df.to_csv('cleaned_laptops.csv', index=False)