In [36]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns # seaborn là thư viện được xây trên matplotlib, giúp việc visualization đỡ khổ hơn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn import set_config
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

In [2]:
# Load the raw dataset; a comma is already pandas' default field separator.
df = pd.read_csv('data.csv')


In [3]:
# Strip the '.' characters from the price strings (presumably thousands
# separators -- confirm against data.csv) and convert to numbers.
# regex=False makes the dot a literal on every pandas version instead of relying
# on the version-dependent default. The dtype guard keeps this cell re-runnable:
# once Price is numeric the `.str` accessor would raise.
if not pd.api.types.is_numeric_dtype(df['Price']):
    df['Price'] = pd.to_numeric(
        df['Price'].str.replace('.', '', regex=False), errors='coerce'
    )

# First run of letters/digits/whitespace in `CPUgen` that is followed by a comma.
# [A-Za-z] is used instead of [A-z], which also matches the punctuation
# characters between 'Z' and 'a' ([ \ ] ^ _ `).
df['CPUs'] = df['CPUgen'].str.extract(r'([A-Za-z0-9\s]+),', expand=False)

# Alphanumeric token right before " (" in `CPU`, reduced to its first letter.
cpu_model_token = df['CPU'].str.extract(r'([A-Za-z0-9]+)\s\(', expand=False)
df['namegen'] = cpu_model_token.str.extract(r'([A-Za-z])', expand=False)

In [4]:
# Separate the feature columns (X) from the regression target (y).
X_df = df.drop(columns=["Price"])
y_sr = df["Price"]  # "sr" is short for Series

In [5]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(
    X_df,
    y_sr,
    test_size=0.3,
    random_state=0,
)

In [6]:
def convert_col_dtype(col):
    """Convert a single column to the dtype used for modelling, chosen by ``col.name``.

    Intended for ``DataFrame.apply(convert_col_dtype)``, which calls it once per
    column. The input Series is never modified in place.

    Rules by column name:
      * ``'SSD'``      - the exact values ``'1'`` / ``'2'`` become ``'1000'`` /
                         ``'2000'`` (presumably TB expressed in GB -- confirm),
                         then the column is converted to numbers.
      * ``'gen'``      - the exact value ``'1000'`` is treated as missing, then
                         the column is converted to numbers.
      * ``'Security'`` - ``'Yes'`` where a value is present, ``'No'`` where null.
      * ``'Pin'``, ``'Weight'``, ``'Screen'``, ``'RAM'`` - converted to numbers.
      * ``'Price'``    - literal ``'.'`` characters are removed, then the column
                         is converted to numbers.
      * anything else  - returned unchanged.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).

    Parameters
    ----------
    col : pandas.Series
        The column to convert; only ``col.name`` selects the rule.

    Returns
    -------
    pandas.Series
        The converted column (or ``col`` itself when no rule applies).
    """
    name = col.name

    if name == 'SSD':
        # Non-inplace replace: mutating the caller's Series is a hidden side effect.
        rescaled = col.replace({'1': '1000', '2': '2000'})
        return pd.to_numeric(rescaled, errors='coerce')

    if name == 'gen':
        # np.nan (lower case): the np.NaN alias was removed in NumPy 2.0.
        return pd.to_numeric(col.replace('1000', np.nan), errors='coerce')

    if name == 'Security':
        return col.apply(lambda x: 'Yes' if not pd.isnull(x) else 'No')

    if name in ['Pin', 'Weight', 'Screen', 'RAM']:
        return pd.to_numeric(col, errors='coerce')

    if name == 'Price':
        # regex=False: '.' must be a literal dot regardless of the pandas version.
        return pd.to_numeric(col.str.replace('.', '', regex=False), errors='coerce')

    return col

In [7]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    """Feature-engineering step: derive model columns and drop unused raw ones.

    ``fit`` learns the most frequent values of the ``CPUs`` and ``namegen``
    columns. ``transform`` replaces every other value in those two columns with
    ``'Others'``, extracts ``gen``, ``GraphicChip``, ``RAM``, ``Screen``,
    ``SSD``, ``Pin`` and ``Weight`` from their raw text columns, drops the
    columns listed in ``UNUSED_COLS`` and converts dtypes with
    ``convert_col_dtype``.

    Parameters
    ----------
    num_top_cpus : int, default 6
        How many of the most frequent ``CPUs`` values to keep as-is.
    num_top_namegen : int, default 3
        How many of the most frequent ``namegen`` values to keep as-is.

    Attributes (set by ``fit``)
    ---------------------------
    cpus_counts_, namegen_counts_ : pandas.Series
        ``value_counts()`` of the respective training column.
    top_cpus_, top_namegen_ : list
        The retained values (at least one is kept when any value exists).
    """

    # Raw columns removed by `transform`; all of them must be present in the input.
    UNUSED_COLS = ('SKU', 'Title', 'Warranty','Color','PartNum','MaxStoPortNum',
                   'SupportM2','OutVideoPort','ConnectPort','Wireless','Keyboard',
                   'Size','LED','Accessories','OptDrive','Feature','OS',
                   'Storage', 'CPUgen','CPU','SeriesLaptop')

    def __init__(self, num_top_cpus= 6, num_top_namegen= 3):
        self.num_top_cpus = num_top_cpus
        self.num_top_namegen = num_top_namegen

    @staticmethod
    def _top_categories(counts, k):
        """Return the first ``k`` labels of ``counts`` (clamped to 1..len)."""
        labels = list(counts.index)
        return labels[:max(1, min(k, len(labels)))]

    def fit(self, X_df, y=None):
        """Learn the most frequent ``CPUs`` / ``namegen`` values from ``X_df``."""
        self.cpus_counts_ = X_df['CPUs'].value_counts()
        self.top_cpus_ = self._top_categories(self.cpus_counts_, self.num_top_cpus)

        self.namegen_counts_ = X_df['namegen'].value_counts()
        self.top_namegen_ = self._top_categories(self.namegen_counts_, self.num_top_namegen)

        return self

    def transform(self, X_df, y=None):
        """Return a transformed copy of ``X_df``; the input frame is not modified.

        Raises ``KeyError`` (from ``DataFrame.drop``) if any column in
        ``UNUSED_COLS`` is missing.
        """
        _df = X_df.copy()

        # Use THIS instance's fitted state. Reading it from the notebook-level
        # `col_adderdropper` global breaks any clone or second instance, which
        # would silently use categories learned by a different object.
        _df['CPUs'] = _df['CPUs'].where(_df['CPUs'].isin(self.top_cpus_), 'Others')
        _df['namegen'] = _df['namegen'].where(_df['namegen'].isin(self.top_namegen_), 'Others')

        # [A-Za-z] instead of [A-z]: the latter also matches [ \ ] ^ _ `.
        # expand=False yields a Series rather than a one-column DataFrame.
        _df['gen'] = _df['CPUgen'].str.extract(r'([0-9]+)$', expand=False)
        _df['GraphicChip'] = _df['GraphicChip'].str.extract(r'([A-Za-z]+)\s', expand=False)
        _df['RAM'] = _df['RAM'].str.extract(r'([0-9]+)GB', expand=False)
        _df['Screen'] = _df['Screen'].str.extract(r'([0-9.]+)', expand=False)
        _df['SSD'] = _df['Storage'].str.extract(r'([0-9]+)[A-Za-z]', expand=False)
        _df['Pin'] = _df['Pin'].str.extract(r'([0-9A-Za-z]+)\scell', expand=False)
        _df['Weight'] = _df['Weight'].str.extract(r'([0-9.]+)', expand=False)

        # Drop first so dtype conversion only touches the columns that are kept;
        # convert_col_dtype works column by column, so the result is the same.
        _df = _df.drop(columns=list(self.UNUSED_COLS))
        return _df.apply(convert_col_dtype)

In [8]:
# Smoke-test `fit`: learn the most frequent CPUs / namegen values on the training split.
col_adderdropper = ColAdderDropper(
    num_top_cpus=6,
    num_top_namegen=3,
)
col_adderdropper.fit(train_X_df)


ColAdderDropper()

In [9]:
# Apply the fitted transformer to the training features to inspect its output.
fewer_cols_train_X_df = col_adderdropper.transform(train_X_df)

In [10]:
# Display the transformed training frame.
fewer_cols_train_X_df

Unnamed: 0,Brand,GraphicChip,RAM,Screen,Pin,Weight,Security,CPUs,namegen,gen,SSD
101,ASUS,Intel,8,13.3,3.0,1.3,Yes,Core i5,U,10.0,512
334,ACER,AMD,8,15.6,2.0,1.7,No,Ryzen 3,U,3.0,256
417,Dell,Intel,4,14.0,4.0,2.0,No,Core i3,U,8.0,1000
92,ASUS,NVIDIA,8,15.6,3.0,2.2,No,Core i5,H,8.0,1000
583,HP,NVIDIA,8,15.6,3.0,2.2,No,Core i7,H,9.0,512
...,...,...,...,...,...,...,...,...,...,...,...
763,MSI,NVIDIA,8,15.6,4.0,1.9,No,Core i5,H,9.0,512
192,ASUS,AMD,8,14.0,2.0,1.5,Yes,Ryzen 5,U,3.0,512
629,HP,Intel,8,14.0,3.0,1.5,Yes,Core i7,U,10.0,512
559,HP,Intel,4,14.0,3.0,1.7,No,Core i3,U,7.0,500


In [11]:
# Check the CPUs categories after infrequent values were replaced by 'Others'.
fewer_cols_train_X_df.CPUs.value_counts()

Core i5     261
Core i3     112
Core i7      97
Others       36
Ryzen 5      21
Ryzen 3      17
Ryzen 7      11
Name: CPUs, dtype: int64

In [12]:
# Column groups as they exist after ColAdderDropper.transform.
nume_cols = ['RAM', 'gen', 'SSD']
unorder_cate_cols = ['GraphicChip', 'Brand', 'Security', 'CPUs', 'namegen']
order_cate_cols = ['Screen', 'Weight', 'Pin']

# Numeric columns: fill missing values with the column mean.
mean_numcols = SimpleImputer(missing_values=np.nan, strategy='mean')
# Ordered columns: fill missing values with the most frequent value.
mode_ordercols = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Unordered categoricals: most-frequent imputation followed by one-hot encoding.
# A separate imputer instance is used so the two branches share no estimator object.
mode_unordercols = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# sparse_threshold=0 forces a dense result. With the default (0.3) the output
# silently turns sparse as soon as the one-hot block gets wide enough, and the
# mean-centering StandardScaler() that follows raises on sparse input.
col_transform = ColumnTransformer(
    [
        ('nume_cols', mean_numcols, nume_cols),
        ('unorder_cate_cols', mode_unordercols, unorder_cate_cols),
        ('order_cate_cols', mode_ordercols, order_cate_cols),
    ],
    sparse_threshold=0,
)

preprocess_pipeline = make_pipeline(col_adderdropper, col_transform, StandardScaler())
preprocessed_train_X = preprocess_pipeline.fit_transform(train_X_df)

In [13]:
# Sanity check: (rows, features) of the preprocessed training matrix.
preprocessed_train_X.shape

(555, 31)

In [14]:
# Transform only (no refit): the validation split reuses what was learned on the training split.
preprocessed_val_X = preprocess_pipeline.transform(val_X_df)

In [15]:
# Sanity check: (rows, features) of the preprocessed validation matrix.
preprocessed_val_X.shape

(239, 31)

In [30]:
def compute_mse(y, preds):
    """Mean squared error between targets ``y`` and predictions ``preds``.

    Both arguments may be any array-like (list, ndarray, Series) or a scalar
    that broadcasts against the other. Values are compared by position:
    subtracting two pandas Series directly would align them by index label and
    produce NaN wherever the labels differ.

    Returns
    -------
    numpy.float64
        The mean of the squared differences.
    """
    y_arr = np.asarray(y, dtype=float)
    preds_arr = np.asarray(preds, dtype=float)
    return ((y_arr - preds_arr) ** 2).mean()


def compute_rr(y, preds, baseline_preds):
    """Ratio of the model's MSE to the MSE of a baseline predictor.

    A value below 1 means ``preds`` has a lower MSE than ``baseline_preds``.
    If the baseline MSE is zero the division follows NumPy float semantics
    (inf or nan) rather than raising.
    """
    return compute_mse(y, preds) / compute_mse(y, baseline_preds)
# Constant baseline prediction: the mean of the training targets.
baseline_preds = train_y_sr.mean()

In [41]:
# Build the full pipeline: feature engineering -> imputation/encoding -> scaling -> model.
# NOTE: despite its name, `neural_net_model` is a linear SGDRegressor with an L1
# penalty; the name is kept in case other cells refer to it.
neural_net_model = SGDRegressor(penalty='l1', random_state=0)
full_pipeline = make_pipeline(col_adderdropper, col_transform, StandardScaler(), neural_net_model)

# These settings stay fixed during the search, so set them once outside the loop.
full_pipeline.set_params(coladderdropper__num_top_cpus=6, coladderdropper__num_top_namegen=3)

# Try different values of the regularisation strength and keep the best one.
# `full_pipeline.score` returns the regressor's R^2 (higher is better), so
# despite their names `train_errs` / `val_errs` hold R^2 * 100, and the best
# alpha is the one with the HIGHEST validation value (picking the lowest would
# select the worst model).
train_errs = []
val_errs = []
alphas = [0.1, 1, 10, 100, 1000]
best_val_err = float('-inf')
best_alpha = None
for alpha in alphas:
    full_pipeline.set_params(sgdregressor__alpha=alpha)
    full_pipeline.fit(train_X_df, train_y_sr)

    train_errs.append(full_pipeline.score(train_X_df, train_y_sr) * 100)
    val_errs.append(full_pipeline.score(val_X_df, val_y_sr) * 100)

    if val_errs[-1] > best_val_err:
        best_val_err = val_errs[-1]
        best_alpha = alpha

print(train_errs)
print(val_errs)

[77.9408109965436, 77.94080865294092, 77.94078523586202, 77.94055063771373, 77.93795704012612]
[80.1165279081293, 80.11654473588001, 80.1167126537266, 80.11837352054569, 80.1322465940667]
