In [1]:
#%matplotlib inline
#import matplotlib.pyplot as plt
#import seaborn as sns # seaborn là thư viện được xây trên matplotlib, giúp việc visualization đỡ khổ hơn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
set_config(display='diagram') # Để trực quan hóa pipeline

In [2]:
import sklearn
sklearn.__version__

'0.23.1'

In [3]:
data_df = pd.read_csv('data.csv')
data_df.head()

Unnamed: 0,producer,processor prod,processor model,cores,core base speed (GHz),ram type,ram cap (GB),ssd (GB),hdd (GB),gpu prod,gpu size (MB),screen type,screen size (inch),weight (kg),os,price(USD)
0,asus,intel,i7,8,2.3,ddr4,32,0,8192,nvidia,8192,led ips,17.3,2.9,windows home 10.00,101.0
1,hp,intel,i3,2,1.2,ddr4,4,0,128,intel,8192,led tn,14.0,1.47,windows home 10.00,311.0
2,lenovo,intel,i3,2,1.2,ddr4,4,0,128,intel,8192,led tn,15.6,1.85,windows home 10.00,319.0
3,lenovo,amd,ryzen 3,2,2.6,ddr4,4,0,128,amd,2048,led tn,15.6,1.85,windows home 10.00,341.0
4,lenovo,amd,ryzen 3,4,2.7,ddr4,4,0,128,amd,2048,led ips,14.0,1.5,windows home 10.00,349.0


---

## Khám phá dữ liệu

In [4]:
# Dữ liệu có bao nhiêu dòng, bao nhiêu cột?
data_df.shape

(941, 16)

In [5]:
# Dữ liệu có dòng bị lặp không?
data_df.index.duplicated().sum()

0

In [6]:
# Cột output hiện có kiểu dữ liệu gì?
data_df["price(USD)"].dtype

dtype('float64')

In [7]:
# Cột output có giá trị thiếu không?
data_df["price(USD)"].isna().sum()

0

## Tiền xử lý (tách các tập)

In [8]:
# Tách X và y
y_sr = data_df["price(USD)"]
X_df = data_df.drop("price(USD)", axis=1)

In [9]:
# Tách tập huấn luyện và tập validation theo tỉ lệ 70%:30%
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(X_df, y_sr, test_size=0.3, 
                                                              random_state=0)

## Khám phá dữ liệu (tập huấn luyện)

In [10]:
# Xem kiểu dữ liệu của các cột
train_X_df.dtypes

producer                  object
processor prod            object
processor model           object
cores                      int64
core base speed (GHz)    float64
ram type                  object
ram cap (GB)               int64
ssd (GB)                   int64
hdd (GB)                   int64
gpu prod                  object
gpu size (MB)              int64
screen type               object
screen size (inch)       float64
weight (kg)              float64
os                        object
dtype: object

Các cột đều có kiểu dữ liệu phù hợp

In [11]:
# Xét sự phân bố giá trị của các thuộc tính dạng số
num_cols = ['cores', 'core base speed (GHz)', 
            'ram cap (GB)',
            'ssd (GB)', 'hdd (GB)', 
            'gpu size (MB)', 
            'screen size (inch)',
            'weight (kg)']
df = train_X_df[num_cols]
def missing_ratio(df):
    return (df.isna().mean() * 100).round(1)
def lower_quartile(df):
    return df.quantile(0.25).round(1)
def median(df):
    return df.quantile(0.5).round(1)
def upper_quartile(df):
    return df.quantile(0.75).round(1)
df.agg([missing_ratio, 'min', lower_quartile, median, upper_quartile, 'max'])

Unnamed: 0,cores,core base speed (GHz),ram cap (GB),ssd (GB),hdd (GB),gpu size (MB),screen size (inch),weight (kg)
missing_ratio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2.0,1.0,4.0,0.0,0.0,128.0,11.6,0.85
lower_quartile,4.0,1.6,8.0,0.0,256.0,1700.0,14.0,1.4
median,4.0,2.1,8.0,0.0,512.0,2048.0,15.6,1.8
upper_quartile,6.0,2.6,16.0,0.0,512.0,8192.0,15.6,2.1
max,8.0,3.0,64.0,2000.0,8192.0,16384.0,17.3,4.7


In [12]:
# Xét sự phân bố giá trị của các thuộc tính không phải dạng số
pd.set_option('display.max_colwidth', 200) # Để nhìn rõ hơn
cat_cols = list(set(train_X_df.columns) - set(num_cols))
df = train_X_df[cat_cols]
def missing_ratio(df):
    return (df.isna().mean() * 100).round(1)
def num_values(df):
    return df.nunique()
def value_ratios(c):
    return dict((c.value_counts(normalize=True) * 100).round(1))
df.agg([missing_ratio, num_values, value_ratios])

Unnamed: 0,os,screen type,gpu prod,processor model,producer,processor prod,ram type
missing_ratio,0,0,0,0,0,0,0
num_values,6,3,3,6,7,2,2
value_ratios,"{'windows home 10.00': 56.2, 'windows pro 10.00': 37.1, 'linux ubuntu 0.00': 2.0, 'macos 11.00': 1.8, 'chrome os 0.00': 1.5, 'no os 0.00': 1.4}","{'led ips': 88.4, 'led tn': 10.8, 'oled': 0.8}","{'intel': 55.5, 'nvidia': 33.1, 'amd': 11.4}","{'i5': 38.0, 'i7': 37.5, 'i3': 10.2, 'ryzen 5': 8.2, 'ryzen 7': 4.1, 'ryzen 3': 2.0}","{'lenovo': 25.5, 'hp': 22.5, 'asus': 21.4, 'dell': 19.0, 'acer': 8.4, 'apple': 1.8, 'lg': 1.4}","{'intel': 85.7, 'amd': 14.3}","{'ddr4': 96.8, 'ddr3': 3.2}"


---

## Tiền xử lý (tập huấn luyện)

Các bước tiền xử lý:
- Tách `processor model` ra thành 2 cột: 1 cột gồm các processor của intel, 1 cột gồm các processor của amd; do cách tách này nên cột `processor prod` không còn cần thiết nữa
- 2 cột `producer` và `os` có nhiều giá trị khác nhau nên sẽ chọn các giá trị nhiều nhất theo `num_top_producer` và `num_top_os` các giá trị khác được thay bằng giá trị "others"

In [13]:
train_X_df.head()

Unnamed: 0,producer,processor prod,processor model,cores,core base speed (GHz),ram type,ram cap (GB),ssd (GB),hdd (GB),gpu prod,gpu size (MB),screen type,screen size (inch),weight (kg),os
116,lenovo,amd,ryzen 5,4,2.1,ddr4,8,0,256,amd,2048,led ips,14.0,1.65,windows home 10.00
76,hp,intel,i3,2,3.0,ddr4,12,0,512,intel,8192,led tn,14.0,1.47,windows pro 10.00
48,acer,intel,i3,2,2.2,ddr4,4,1000,0,intel,1700,led ips,15.6,2.1,windows home 10.00
546,lenovo,intel,i7,4,1.8,ddr4,24,0,1024,intel,1700,led ips,15.6,1.8,windows pro 10.00
263,asus,intel,i5,4,1.6,ddr3,8,0,512,intel,1700,led ips,14.0,1.4,windows home 10.00


In [14]:
# Hàm định nghĩa transformer tách giá trị của thuộc tính processor_model thành 2 cột: 
# - 1 cột gồm các processor của intel
# - 1 cột gồm các processor của amd
# Và chuyển về dạng số bằng phương pháp ranking
def split_processor_model(X):
    intel_processor_rank = {'i3': 1,
                            'i5': 2,
                            'i7': 3,
                            'ryzen 3': 0,
                            'ryzen 5': 0,
                            'ryzen 7': 0}
    intel_processor_model = X['processor model'].replace(intel_processor_rank)


    amd_processor_rank = {'i3': 0,
                          'i5': 0,
                          'i7': 0,
                          'ryzen 3': 1,
                          'ryzen 5': 2,
                          'ryzen 7': 3}
    amd_processor_model = X['processor model'].replace(amd_processor_rank)
    
    return X.assign(intel_processor_model=intel_processor_model, amd_processor_model=amd_processor_model).drop(["processor prod", "processor model"], axis=1)

In [15]:
# TEST
split_processor_model(train_X_df)

Unnamed: 0,producer,cores,core base speed (GHz),ram type,ram cap (GB),ssd (GB),hdd (GB),gpu prod,gpu size (MB),screen type,screen size (inch),weight (kg),os,intel_processor_model,amd_processor_model
116,lenovo,4,2.1,ddr4,8,0,256,amd,2048,led ips,14.0,1.65,windows home 10.00,0,2
76,hp,2,3.0,ddr4,12,0,512,intel,8192,led tn,14.0,1.47,windows pro 10.00,1,0
48,acer,2,2.2,ddr4,4,1000,0,intel,1700,led ips,15.6,2.10,windows home 10.00,1,0
546,lenovo,4,1.8,ddr4,24,0,1024,intel,1700,led ips,15.6,1.80,windows pro 10.00,3,0
263,asus,4,1.6,ddr3,8,0,512,intel,1700,led ips,14.0,1.40,windows home 10.00,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,hp,4,1.6,ddr4,64,0,512,intel,1700,led ips,13.3,1.35,windows home 10.00,2,0
192,asus,6,2.3,ddr4,8,0,512,nvidia,2048,led ips,14.0,1.15,windows home 10.00,0,2
629,asus,4,1.8,ddr4,16,0,1024,nvidia,4096,led ips,15.6,1.90,windows home 10.00,3,0
559,dell,4,1.9,ddr4,16,0,512,intel,1700,led ips,12.3,0.85,windows pro 10.00,3,0


In [16]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    def __init__(self, num_top_producers=1, num_top_os=1):
        self.num_top_producers = num_top_producers
        self.num_top_os = num_top_os
    def fit(self, X_df, y=None):
        producer_col = X_df.producer
        self.producer_counts_ = producer_col.value_counts()
        producers = list(self.producer_counts_.index)
        self.top_producers_ = producers[:max(1, min(self.num_top_producers, len(producers)))]
        
        os_col = X_df.os
        self.os_counts_ = os_col.value_counts()
        os = list(self.os_counts_.index)
        self.top_os_ = os[:max(1, min(self.num_top_os, len(os)))]
        
        return self
    def transform(self, X_df, y=None):
        X = X_df.copy()
        X.loc[:, "producer"].replace(list(set(X.producer.unique())-set(self.top_producers_)), 'others', inplace=True)
        X.loc[:, "os"].replace(list(set(X.os.unique())-set(self.top_os_)), 'others', inplace=True)
        #X.loc[X["producer"].isin(list(set(X.producer.unique())-set(self.top_producers_))), "producer"] = "others"
        #X.loc[X["os"].isin(list(set(X.os.unique())-set(self.top_os_))), "os"] = "others"
        return X

In [17]:
col_adderdropper = ColAdderDropper(num_top_producers=4, num_top_os=4)
col_adderdropper.fit(split_processor_model(train_X_df))
print(col_adderdropper.producer_counts_)
print()
print(col_adderdropper.top_producers_)
print()
print(col_adderdropper.os_counts_)
print()
print(col_adderdropper.top_os_)

lenovo    168
hp        148
asus      141
dell      125
acer       55
apple      12
lg          9
Name: producer, dtype: int64

['lenovo', 'hp', 'asus', 'dell']

windows home 10.00    370
windows pro 10.00     244
linux ubuntu 0.00      13
macos  11.00           12
chrome os  0.00        10
no os  0.00             9
Name: os, dtype: int64

['windows home 10.00', 'windows pro 10.00', 'linux ubuntu 0.00', 'macos  11.00']


In [18]:
fewer_cols_train_X_df = col_adderdropper.transform(split_processor_model(train_X_df))

In [19]:
fewer_cols_train_X_df.os.unique()

array(['windows home 10.00', 'windows pro 10.00', 'others',
       'macos  11.00', 'linux ubuntu 0.00'], dtype=object)

In [20]:
fewer_cols_train_X_df.producer.unique()

array(['lenovo', 'hp', 'others', 'asus', 'dell'], dtype=object)

In [21]:
fewer_cols_train_X_df.head()

Unnamed: 0,producer,cores,core base speed (GHz),ram type,ram cap (GB),ssd (GB),hdd (GB),gpu prod,gpu size (MB),screen type,screen size (inch),weight (kg),os,intel_processor_model,amd_processor_model
116,lenovo,4,2.1,ddr4,8,0,256,amd,2048,led ips,14.0,1.65,windows home 10.00,0,2
76,hp,2,3.0,ddr4,12,0,512,intel,8192,led tn,14.0,1.47,windows pro 10.00,1,0
48,others,2,2.2,ddr4,4,1000,0,intel,1700,led ips,15.6,2.1,windows home 10.00,1,0
546,lenovo,4,1.8,ddr4,24,0,1024,intel,1700,led ips,15.6,1.8,windows pro 10.00,3,0
263,asus,4,1.6,ddr3,8,0,512,intel,1700,led ips,14.0,1.4,windows home 10.00,2,0


In [22]:
nume_cols = ['cores', 'core base speed (GHz)', 'ram cap (GB)', 'ssd (GB)', 'hdd (GB)', 'gpu size (MB)', 'screen size (inch)', 'weight (kg)']
unorder_cate_cols = ['producer', 'ram type', 'gpu prod', 'screen type', 'os']
order_cate_cols = ['intel_processor_model', 'amd_processor_model']

unorder_cate_cols_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
column_transformer = make_column_transformer((SimpleImputer(strategy='mean'), nume_cols),
                                            (unorder_cate_cols_transformer, unorder_cate_cols),
                                            (SimpleImputer(strategy='most_frequent'), order_cate_cols))
preprocess_pipeline = make_pipeline(FunctionTransformer(split_processor_model), 
                                    ColAdderDropper(num_top_producers=4, num_top_os=4),
                                    column_transformer, StandardScaler())

preprocessed_train_X = preprocess_pipeline.fit_transform(train_X_df)

In [23]:
preprocessed_train_X

array([[-0.31629402, -0.01180872, -0.5829241 , ..., -0.76770566,
        -1.9413367 ,  2.14479977],
       [-1.71308238,  1.69383156, -0.22124182, ...,  1.30258255,
        -0.96473154, -0.38891342],
       [-1.71308238,  0.17770686, -0.94460638, ..., -0.76770566,
        -0.96473154, -0.38891342],
       ...,
       [-0.31629402, -0.58035548,  0.14044046, ..., -0.76770566,
         0.98847878, -0.38891342],
       [-0.31629402, -0.3908399 ,  0.14044046, ...,  1.30258255,
         0.98847878, -0.38891342],
       [ 1.08049434,  0.93576921,  0.14044046, ...,  1.30258255,
         0.98847878, -0.38891342]])

In [24]:
preprocess_pipeline