In [403]:
# Importing the pandas library for data manipulation and analysis

import pandas as pd 

In [404]:
# Reading the raw CSV file into a Pandas DataFrame

df = pd.read_csv('/home/yusufbek-murodov/Documents/myfiles/Machine learning projects/webscrapping-amazon/data/raw/laptop_data.csv')
pd.set_option('display.max_rows', None)

In [405]:
# Displaying the first 5 rows of the DataFrame to inspect the data

df.head()

Unnamed: 0,brand_name,model_name,screen_size,ram,storage,cpu,operating_system,price,rating,reviews,graphics,card_description
0,HP,15s-fq5007TU,39.6 Centimetres,8 GB,512 GB,Core i3,Windows 11 Home,36990,4.0,941 ratings,Not available,Integrated
1,Dell,Vostro,15.6 Inches,8 GB,512 GB,Core i3 Family,Windows 11 Home,33990,3.7,725 ratings,Not available,Integrated
2,HP,HP Laptop,39.6 Centimetres,16 GB,512 GB,Intel Core i5,Windows 11 Home,52990,3.7,108 ratings,Not available,Integrated
3,HP,eq2100 series,39.6 Centimetres,16 GB,512 GB,AMD Ryzen 5 5500U,Windows 11 Home,41990,4.1,"1,676 ratings",Not available,Integrated
4,Dell,Vostro,15.6 Inches,8 GB,512 GB,Core i5,Windows 11 Home,43990,3.5,441 ratings,Not available,Integrated


In [406]:
# Getting the number of rows and columns in the DataFrame

df.shape

(528, 12)

In [407]:
# Listing all column names in the DataFrame

df.columns

Index(['brand_name', 'model_name', 'screen_size', 'ram', 'storage', 'cpu',
       'operating_system', 'price', 'rating', 'reviews', 'graphics',
       'card_description'],
      dtype='object')

In [408]:
df.dtypes

brand_name          object
model_name          object
screen_size         object
ram                 object
storage             object
cpu                 object
operating_system    object
price               object
rating              object
reviews             object
graphics            object
card_description    object
dtype: object

## Data Cleaning Process
1. Handle missing values.
2. Remove duplicates.
3. Fix column names.
4. Convert data types.
5. Filter out outliers.


### Filling missing values and correct Rows

In [409]:
# Brand Name

df.value_counts('brand_name')

brand_name
HP      395
Dell    133
Name: count, dtype: int64

In [410]:
# Model Name

df.value_counts('model_name')

model_name
HP Laptop                 155
Vostro                     44
Laptop                     42
Inspiron 3520              22
14-ew0116TU                21
15s-fy5010TU               21
15s-fq3066TU               21
Latitue 5490               21
HP Pavilion                21
15-fa1307TX                21
15-fa1317X                 21
15-fa1319X                 21
15-fa1333tx                21
Vostro 3430                21
5490-cr                    21
HP                         21
16-e0350ax                  1
15s-fq5007TU                1
15-fa1227tx                 1
15s-fy5011TU                1
15s-fq5190TU                1
255 G9                      1
HP Pavilion Laptop          1
Dell Laptop                 1
HP Laptop 15s-fr5010TU      1
Inspiron 3535               1
Inspiron 3530               1
Inspiron 3525               1
eq2100 series               1
Name: count, dtype: int64

In [411]:
# Screen Size

df.value_counts('screen_size')

screen_size
39.6 Centimetres    199
15.6 Inches         158
14 Inches           148
40.9                 21
35.6 Centimetres      1
16 Inches             1
Name: count, dtype: int64

In [412]:
def fix_screen_size(value):
    if 'Centimetres' in value or 'Inches' not in value:
        value = float(value.replace('Centimetres', ''))
        screen_size_inch = round((value / 2.54), 1)
        return screen_size_inch
    elif 'Inches' in value:
        value = value.replace('Inches', '')
        return value

In [413]:
df['screen_size'] = df['screen_size'].apply(fix_screen_size)

In [414]:
# Changing data type of screen size (object -> float64)

df['screen_size'] = pd.to_numeric(df['screen_size'], errors='coerce')

In [415]:
# Ram

df.value_counts('ram')

ram
16 GB    284
8 GB     243
4 GB       1
Name: count, dtype: int64

In [416]:
# Storage

df.value_counts('storage')

storage
512 GB    311
1 TB      106
Null       47
256 GB     22
16 GB      21
80 TB      21
Name: count, dtype: int64

In [417]:
# Filling Null values in storage column

df['storage'] = df['storage'].replace('Null', df['storage'].value_counts().idxmax())

In [418]:
df.value_counts('storage')

storage
512 GB    358
1 TB      106
256 GB     22
16 GB      21
80 TB      21
Name: count, dtype: int64

In [419]:
# CPU

df.value_counts('cpu')

cpu
Core i5                154
Intel Core i5          108
Core i3                 46
Ryzen 3                 44
Intel Core i3           43
Core i5 Family          42
Core i7                 22
Ryzen 7                 22
Celeron N               21
Intel Core i7           21
Athlon Silver 3050U      1
Core i3 Family           1
AMD Ryzen 5 5500U        1
AMD Ryzen 5 5600X        1
Ryzen 5                  1
Name: count, dtype: int64

In [420]:
# Operating System

df.value_counts('operating_system')

operating_system
Windows 11 Home    485
Windows 10 Pro      42
Windows 11 Pro       1
Name: count, dtype: int64

In [421]:
# Price

df.value_counts('price')

price
Null        42
33,990      22
35,990      22
1,04,990    21
35,490      21
47,999      21
50,990      21
49,750      21
36,850      21
37,990      21
27,990      21
1,11,990    21
89,990      21
66,990      21
68,990      21
67,990      21
62,490      21
79,990      21
56,990      21
52,490      21
83,990      21
45,990      21
31,800      14
31,890       6
46,990       2
60,990       2
36,990       2
21,830       1
34,499       1
28,990       1
31,490       1
31,749       1
37,490       1
39,490       1
41,990       1
43,990       1
50,999       1
48,990       1
64,990       1
55,990       1
52,990       1
61,990       1
55,780       1
72,990       1
Name: count, dtype: int64

In [422]:
def fix_price(value):
    value = value.replace(',', '')
    return value

In [423]:
df['price'] = df['price'].apply(fix_price)

In [424]:
# Changing data type of price (object -> float64)

df['price'] = pd.to_numeric(df['price'], errors='coerce')

In [425]:
df.value_counts('price')

price
35990.0     22
33990.0     22
27990.0     21
35490.0     21
49750.0     21
52490.0     21
50990.0     21
47999.0     21
45990.0     21
37990.0     21
36850.0     21
89990.0     21
68990.0     21
79990.0     21
83990.0     21
104990.0    21
66990.0     21
62490.0     21
111990.0    21
67990.0     21
56990.0     21
31800.0     14
31890.0      6
60990.0      2
36990.0      2
46990.0      2
21830.0      1
34499.0      1
31490.0      1
31749.0      1
28990.0      1
37490.0      1
39490.0      1
41990.0      1
43990.0      1
52990.0      1
50999.0      1
48990.0      1
55780.0      1
64990.0      1
61990.0      1
55990.0      1
72990.0      1
Name: count, dtype: int64

In [426]:
# Counting the number of null values in each column

df.isnull().sum()

brand_name           0
model_name           0
screen_size          0
ram                  0
storage              0
cpu                  0
operating_system     0
price               42
rating               0
reviews              0
graphics             0
card_description     0
dtype: int64

In [427]:
# Filling missing values

df['price'] = df['price'].fillna(df['price'].median())

In [428]:
df.value_counts('price')

price
50990.0     63
33990.0     22
35990.0     22
35490.0     21
47999.0     21
52490.0     21
49750.0     21
37990.0     21
45990.0     21
36850.0     21
27990.0     21
89990.0     21
68990.0     21
79990.0     21
83990.0     21
104990.0    21
66990.0     21
62490.0     21
111990.0    21
67990.0     21
56990.0     21
31800.0     14
31890.0      6
60990.0      2
36990.0      2
46990.0      2
21830.0      1
34499.0      1
31490.0      1
31749.0      1
28990.0      1
37490.0      1
39490.0      1
41990.0      1
43990.0      1
52990.0      1
50999.0      1
48990.0      1
55780.0      1
64990.0      1
61990.0      1
55990.0      1
72990.0      1
Name: count, dtype: int64

In [429]:
# Converting Indian rupee to dollar

def convert_rupee(value):
    return round(value * 0.012)

In [430]:
df['price'] = df['price'].apply(convert_rupee)

In [431]:
df.value_counts('price')

price
612     64
408     22
432     22
552     21
426     21
576     21
442     21
456     21
597     21
336     21
1344    21
1080    21
960     21
1008    21
1260    21
804     21
750     21
630     21
816     21
828     21
684     21
382     14
383      6
564      2
732      2
444      2
414      1
348      1
378      1
381      1
262      1
474      1
450      1
528      1
636      1
588      1
504      1
669      1
780      1
744      1
672      1
876      1
Name: count, dtype: int64

In [432]:
# Result of Null values 

df.isnull().sum()

brand_name          0
model_name          0
screen_size         0
ram                 0
storage             0
cpu                 0
operating_system    0
price               0
rating              0
reviews             0
graphics            0
card_description    0
dtype: int64

In [433]:
# Rating 

df.value_counts('rating')

rating
4.0     89
4.1     68
3.9     45
3.6     43
4.3     42
3.1     42
3.4     42
3.7     25
3.5     23
5.0     22
3.0     22
2.9     21
Null    21
3.8     21
1.0      1
4.2      1
Name: count, dtype: int64

In [434]:
# Changing data type of rating object -> float64

df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

In [435]:
# Null counts of rating column

df.isnull().sum()

brand_name           0
model_name           0
screen_size          0
ram                  0
storage              0
cpu                  0
operating_system     0
price                0
rating              21
reviews              0
graphics             0
card_description     0
dtype: int64

In [436]:
# Filling missing values of rating

df['rating'] = df['rating'].fillna(round((df['rating'].mean()), 1))

In [437]:
df.isnull().sum()

brand_name          0
model_name          0
screen_size         0
ram                 0
storage             0
cpu                 0
operating_system    0
price               0
rating              0
reviews             0
graphics            0
card_description    0
dtype: int64

In [438]:
# Reviews

df.value_counts('reviews')

reviews
31 ratings       63
6 ratings        42
116 ratings      21
13 ratings       21
29 ratings       21
38 ratings       21
18 ratings       21
2,009 ratings    21
1,170 ratings    21
101 ratings      21
702 ratings      21
Null             21
96 ratings       21
7 ratings        21
494 ratings      21
61 ratings       21
28 ratings       21
2 ratings        21
42 ratings       21
41 ratings       21
54 ratings       21
167 ratings       2
4 ratings         2
1 rating          2
1,391 ratings     1
1,950 ratings     1
108 ratings       1
11 ratings        1
1,676 ratings     1
441 ratings       1
192 ratings       1
23 ratings        1
232 ratings       1
246 ratings       1
675 ratings       1
48 ratings        1
49 ratings        1
714 ratings       1
771 ratings       1
725 ratings       1
941 ratings       1
88 ratings        1
Name: count, dtype: int64

In [439]:
def fix_reviews(value):
    value = value.replace('ratings', '').replace('rating', '').replace(',', '')
    return value

In [440]:
df['reviews'] = df['reviews'].apply(fix_reviews)

In [441]:
df.value_counts('reviews')

reviews
31       63
6        42
13       21
116      21
29       21
38       21
18       21
2009     21
101      21
1170     21
702      21
Null     21
96       21
7        21
494      21
61       21
28       21
2        21
42       21
41       21
54       21
4         2
167       2
1         2
11        1
108       1
1676      1
1391      1
441       1
23        1
1950      1
246       1
232       1
192       1
675       1
48        1
49        1
714       1
771       1
725       1
941       1
88        1
Name: count, dtype: int64

In [442]:
# Changing data type of reviews (object -> int64)

df['reviews'] = pd.to_numeric(df['reviews'], errors='coerce')

In [443]:
# Null counts of reviews

df.isnull().sum()

brand_name           0
model_name           0
screen_size          0
ram                  0
storage              0
cpu                  0
operating_system     0
price                0
rating               0
reviews             21
graphics             0
card_description     0
dtype: int64

In [444]:
df['reviews'] = df['reviews'].fillna(round((df['reviews'].mean()), 0))

In [445]:
df.isnull().sum()

brand_name          0
model_name          0
screen_size         0
ram                 0
storage             0
cpu                 0
operating_system    0
price               0
rating              0
reviews             0
graphics            0
card_description    0
dtype: int64

In [446]:
# Graphics

df.value_counts('graphics')

graphics
Not available         506
Intel                  21
Intel UHD Graphics      1
Name: count, dtype: int64

In [447]:
# Card Description

df.value_counts('card_description')

card_description
Integrated    399
Dedicated     129
Name: count, dtype: int64

In [448]:
df.head()

Unnamed: 0,brand_name,model_name,screen_size,ram,storage,cpu,operating_system,price,rating,reviews,graphics,card_description
0,HP,15s-fq5007TU,15.6,8 GB,512 GB,Core i3,Windows 11 Home,444,4.0,941.0,Not available,Integrated
1,Dell,Vostro,15.6,8 GB,512 GB,Core i3 Family,Windows 11 Home,408,3.7,725.0,Not available,Integrated
2,HP,HP Laptop,15.6,16 GB,512 GB,Intel Core i5,Windows 11 Home,636,3.7,108.0,Not available,Integrated
3,HP,eq2100 series,15.6,16 GB,512 GB,AMD Ryzen 5 5500U,Windows 11 Home,504,4.1,1676.0,Not available,Integrated
4,Dell,Vostro,15.6,8 GB,512 GB,Core i5,Windows 11 Home,528,3.5,441.0,Not available,Integrated


In [449]:
df.dtypes

brand_name           object
model_name           object
screen_size         float64
ram                  object
storage              object
cpu                  object
operating_system     object
price                 int64
rating              float64
reviews             float64
graphics             object
card_description     object
dtype: object

In [450]:
df.isnull().sum()

brand_name          0
model_name          0
screen_size         0
ram                 0
storage             0
cpu                 0
operating_system    0
price               0
rating              0
reviews             0
graphics            0
card_description    0
dtype: int64

In [452]:
# Save the CSV File 

df.to_csv('cleaned_data.csv', index=False)