## Exploração Inicial e Processamento de Dados

Verificação e pré-processamento dos dados para utilização nos experimentos de modelos.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw laptop price CSV and preview its first rows.
raw_csv_path = "../data/raw/laptop-price-brl.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2321,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2613,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2680,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,4689,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,1808,3 stars,0,0


## Pré-Processamento

In [3]:
# Work on an independent copy so the raw frame `df` is left untouched.
df_transformed = df.copy(deep=True)

In [4]:
# Cast every column to text, then lowercase the values column by column.
# Note that numeric columns become strings here as well.
df_transformed = (
    df_transformed
    .astype(str)
    .apply(lambda column: column.str.lower())
)

In [5]:
# Preview the first rows after lowercasing.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,asus,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2321,2 stars,3,0
1,lenovo,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2613,3 stars,65,5
2,lenovo,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2680,3 stars,8,1
3,asus,intel,core i5,10th,8 gb,ddr4,512 gb,0 gb,windows,32-bit,2 gb,casual,no warranty,no,no,4689,3 stars,0,0
4,asus,intel,celeron dual,not available,4 gb,ddr4,0 gb,512 gb,windows,64-bit,0 gb,casual,no warranty,no,no,1808,3 stars,0,0


## Remoção de colunas

Colunas que não serão utilizadas no treinamento nem na predição, por não terem poder preditivo generalizável.

In [6]:
# Remove, in a single call, the columns that will not be used for training.
columns_to_drop = [
    "rating",
    "Number of Ratings",
    "Number of Reviews",
    "msoffice",
    "processor_gnrtn",
]
df_transformed = df_transformed.drop(columns=columns_to_drop)

In [7]:
# Preview the first rows after dropping the unused columns.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,2321
1,lenovo,intel,core i3,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,2613
2,lenovo,intel,core i3,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,2680
3,asus,intel,core i5,8 gb,ddr4,512 gb,0 gb,windows,32-bit,2 gb,casual,no warranty,no,4689
4,asus,intel,celeron dual,4 gb,ddr4,0 gb,512 gb,windows,64-bit,0 gb,casual,no warranty,no,1808


In [8]:
# Strip the " gb" unit suffix from the size columns so only the number remains
# (the values are still text at this point).
for size_column in ("ram_gb", "ssd", "hdd", "graphic_card_gb"):
    df_transformed[size_column] = df_transformed[size_column].replace({" gb": ""}, regex=True)

In [9]:
# Preview the first 30 rows to check that the " gb" suffix is gone.
df_transformed.head(n=30)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,no warranty,no,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,no warranty,no,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,no warranty,no,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,no warranty,no,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,no warranty,no,1808
5,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,no warranty,no,1540
6,asus,intel,celeron dual,4,ddr4,0,512,windows,32-bit,0,casual,no warranty,no,1473
7,asus,intel,core i5,8,ddr4,0,1024,windows,32-bit,2,casual,no warranty,no,3940
8,lenovo,intel,core i5,4,ddr4,0,1024,windows,32-bit,0,casual,no warranty,no,3350
9,acer,amd,ryzen 5,4,ddr4,0,512,windows,32-bit,4,casual,no warranty,no,4019


In [10]:
# Reduce warranty to a bare number (still text at this point):
# "no warranty" becomes "0", then the " year" / " years" suffix is removed.
df_transformed["warranty"] = (
    df_transformed["warranty"]
    .replace({"no warranty": "0"}, regex=True)
    .replace({" (years|year)": ""}, regex=True)
)

In [11]:
# Preview the first 30 rows to check the cleaned warranty values.
df_transformed.head(n=30)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,no,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,no,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,no,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,no,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,no,1808
5,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,no,1540
6,asus,intel,celeron dual,4,ddr4,0,512,windows,32-bit,0,casual,0,no,1473
7,asus,intel,core i5,8,ddr4,0,1024,windows,32-bit,2,casual,0,no,3940
8,lenovo,intel,core i5,4,ddr4,0,1024,windows,32-bit,0,casual,0,no,3350
9,acer,amd,ryzen 5,4,ddr4,0,512,windows,32-bit,4,casual,0,no,4019


In [12]:
# Map the yes/no touchscreen flag to "1"/"0" (still text; converted to int later).
touchscreen_flags = {"yes": "1", "no": "0"}
df_transformed["Touchscreen"] = df_transformed["Touchscreen"].replace(touchscreen_flags, regex=True)

In [13]:
# Preview the first 20 rows to check the encoded touchscreen flag.
df_transformed.head(n=20)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,0,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1808
5,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1540
6,asus,intel,celeron dual,4,ddr4,0,512,windows,32-bit,0,casual,0,0,1473
7,asus,intel,core i5,8,ddr4,0,1024,windows,32-bit,2,casual,0,0,3940
8,lenovo,intel,core i5,4,ddr4,0,1024,windows,32-bit,0,casual,0,0,3350
9,acer,amd,ryzen 5,4,ddr4,0,512,windows,32-bit,4,casual,0,0,4019


In [14]:
# Lowercase the two remaining capitalised column names.
lowercase_names = {"Touchscreen": "touchscreen", "Price": "price"}
df_transformed = df_transformed.rename(columns=lowercase_names)

In [15]:
# Preview the first rows after renaming the columns.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,touchscreen,price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,0,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1808


In [16]:
# Convert the cleaned text columns to int64.
# Values that cannot be parsed as numbers (or are missing) are still replaced
# with 0, so the resulting frame is the same as before. The difference is that
# the number of substituted values is now reported per column instead of being
# swallowed silently: a fabricated 0 in `price` (the target) or `ram_gb` would
# otherwise be indistinguishable from real data.
numeric_columns = ["ram_gb", "hdd", "ssd", "graphic_card_gb", "warranty", "price", "touchscreen"]
for column in numeric_columns:
    parsed = pd.to_numeric(df_transformed[column], errors="coerce")
    n_substituted = int(parsed.isna().sum())
    if n_substituted:
        print(f"{column}: {n_substituted} non-numeric or missing value(s) replaced with 0")
    df_transformed[column] = parsed.fillna(0).astype(np.int64)

In [17]:
# Preview the first rows after the numeric conversion.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,touchscreen,price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,0,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1808


In [18]:
# List the distinct values of `brand`.
pd.unique(df_transformed["brand"])

array(['asus', 'lenovo', 'acer', 'avita', 'hp', 'dell', 'msi', 'apple'],
      dtype=object)

In [19]:
# List the distinct values of `processor_brand`.
pd.unique(df_transformed["processor_brand"])

array(['intel', 'amd', 'm1'], dtype=object)

In [20]:
# List the distinct values of `processor_name`.
pd.unique(df_transformed["processor_name"])

array(['core i3', 'core i5', 'celeron dual', 'ryzen 5', 'core i7',
       'core i9', 'm1', 'pentium quad', 'ryzen 3', 'ryzen 7', 'ryzen 9'],
      dtype=object)

In [21]:
# List the distinct values of `os`.
pd.unique(df_transformed["os"])

array(['windows', 'dos', 'mac'], dtype=object)

In [22]:
# List the distinct values of `ram_type`.
pd.unique(df_transformed["ram_type"])

array(['ddr4', 'lpddr4', 'lpddr4x', 'ddr5', 'ddr3', 'lpddr3'],
      dtype=object)

In [69]:
# List the distinct values of `weight`.
pd.unique(df_transformed["weight"])

array(['casual', 'thinnlight', 'gaming'], dtype=object)

In [23]:
# Collapse selected category values into a single "other" bucket, per column.
# NOTE(review): presumably these are the less frequent values; the value
# counts are not shown in this notebook, so confirm before changing the lists.
values_to_group = {
    "os": ["mac", "dos"],
    "ram_type": ["lpddr4x", "lpddr4", "lpddr3", "ddr5", "ddr3"],
    "processor_name": ["core i9", "pentium quad", "m1", "celeron dual", "ryzen 9", "ryzen 3"],
    "brand": ["acer", "msi", "apple", "avita"],
}
for column, grouped_values in values_to_group.items():
    # dict.fromkeys builds {value: "other", ...} for every listed value.
    to_other = dict.fromkeys(grouped_values, "other")
    df_transformed[column] = df_transformed[column].replace(to_other)

In [24]:
# Check the distinct values left in `os` after grouping.
pd.unique(df_transformed["os"])

array(['windows', 'other'], dtype=object)

In [25]:
# Check the distinct values left in `ram_type` after grouping.
pd.unique(df_transformed["ram_type"])

array(['ddr4', 'other'], dtype=object)

In [26]:
# Check the distinct values left in `processor_name` after grouping.
pd.unique(df_transformed["processor_name"])

array(['core i3', 'core i5', 'other', 'ryzen 5', 'core i7', 'ryzen 7'],
      dtype=object)

In [27]:
# Check the distinct values left in `brand` after grouping.
pd.unique(df_transformed["brand"])

array(['asus', 'lenovo', 'other', 'hp', 'dell'], dtype=object)

In [28]:
# Row count before removing duplicates.
df_transformed.shape[0]

823

In [29]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
df_transformed = df_transformed.drop_duplicates()

In [30]:
# Row count after removing duplicates.
df_transformed.shape[0]

780

In [31]:
# Preview the first rows after grouping categories and removing duplicates.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,touchscreen,price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,0,4689
4,asus,intel,other,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1808


In [32]:
# One-hot encode all categorical columns in a single pass.
# The columns are listed in the same order as the original sequential calls, and
# each column's own name is used as its prefix, so the resulting dummy columns
# keep the same names and order.
categorical_columns = [
    "brand",
    "processor_brand",
    "processor_name",
    "ram_type",
    "os",
    "os_bit",
    "weight",
    "touchscreen",
]
df_transformed = pd.get_dummies(
    df_transformed,
    columns=categorical_columns,
    prefix=categorical_columns,
    dtype=int,
)

In [33]:
# Preview the first rows of the one-hot encoded frame.
df_transformed.head(n=5)

Unnamed: 0,ram_gb,ssd,hdd,graphic_card_gb,warranty,price,brand_asus,brand_dell,brand_hp,brand_lenovo,...,ram_type_other,os_other,os_windows,os_bit_32-bit,os_bit_64-bit,weight_casual,weight_gaming,weight_thinnlight,touchscreen_0,touchscreen_1
0,4,0,1024,0,0,2321,1,0,0,0,...,0,0,1,0,1,1,0,0,1,0
1,4,0,1024,0,0,2613,0,0,0,1,...,0,0,1,0,1,1,0,0,1,0
2,4,0,1024,0,0,2680,0,0,0,1,...,0,0,1,0,1,1,0,0,1,0
3,8,512,0,2,0,4689,1,0,0,0,...,0,0,1,1,0,1,0,0,1,0
4,4,0,512,0,0,1808,1,0,0,0,...,0,0,1,0,1,1,0,0,1,0


In [35]:
# Write the processed frame to disk; the index is not written to the file.
processed_csv_path = "../data/processed/laptop-price-brl-processed.csv"
df_transformed.to_csv(processed_csv_path, index=False)