In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn import set_config
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
set_config(display='diagram') # Show scikit-learn pipelines as diagrams when they are displayed

In [2]:
# Load the raw laptop listing and convert the Cost column to a numeric dtype.
data_df = pd.read_csv("./Data/Laptops.csv")
# regex=False makes "." a literal character on every pandas version; as a
# regular expression "." would match (and delete) every character.
cost_text = data_df["Cost"].str.replace(".", "", regex=False)
# Strip the final character before parsing.
# NOTE(review): presumably a trailing currency symbol - confirm against the CSV.
cost_text = cost_text.str[:-1]
data_df["Cost"] = pd.to_numeric(cost_text)
# Bucket a price column into ordinal price-range labels.
def set_label_col(df, col, start=4, step=3, n_labels=14, overflow_label=15, unit=10**6):
    """Replace the prices in ``df[col]`` with integer range labels, in place.

    Bin ``k`` (for ``k`` in ``0 .. n_labels - 1``) covers the half-open range
    ``(start + k*step, start + (k+1)*step]`` expressed in multiples of
    ``unit``. Prices above the last bin receive ``overflow_label``. Prices
    less than or equal to ``start * unit`` are left unchanged.

    With the defaults the bins are 3 million wide starting at 4 million, the
    regular labels are 0..13 and the overflow label is 15 (14 is not used).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col : str
        Name of the numeric price column to relabel.
    start, step : int
        Lower edge of the first bin and the bin width, in multiples of ``unit``.
    n_labels : int
        Number of regular bins.
    overflow_label : int
        Label written for prices above the last bin.
    unit : int
        Scale factor applied to the bin edges.

    Returns
    -------
    None
    """
    # Compare against a snapshot of the original prices so that labels written
    # by earlier iterations can never be matched again by a later bin.
    prices = df[col].copy()

    lower = start
    for label in range(n_labels):
        upper = lower + step
        in_bin = (prices > lower * unit) & (prices <= upper * unit)
        # Write to `col` (not a hard-coded column name) so the helper works
        # for any column it is asked to label.
        df.loc[in_bin, col] = label
        lower = upper

    # After the loop `lower` is the upper edge of the last regular bin.
    df.loc[prices > lower * unit, col] = overflow_label
# Relabel the Cost column in place, then show how many rows carry each label.
set_label_col(data_df, "Cost")
data_df["Cost"].value_counts()

3     131
4     114
2      97
5      75
6      73
7      46
8      28
10     24
1      24
15     21
9      18
11     10
12      9
0       7
13      5
Name: Cost, dtype: int64

In [3]:
# Separate the target (Cost label) from the features.
y_sr = data_df["Cost"]
X_df = data_df.drop(columns=["Cost"])

# Hold out 20% of the rows for validation; stratifying on the label keeps the
# label proportions similar in both parts, and the fixed seed makes the split
# repeatable.
split_parts = train_test_split(
    X_df, y_sr, test_size=0.2, stratify=y_sr, random_state=0
)
train_X_df, val_X_df, train_y_sr, val_y_sr = split_parts
X_df.shape

(682, 30)

In [4]:
pd.set_option('display.max_colwidth', 200)  # widen cells so long values stay readable


def missing_ratio(df):
    """Return the percentage of missing entries, rounded to one decimal."""
    fraction_missing = df.isna().mean()
    return (fraction_missing * 100).round(1)


def num_values(df):
    """Return the number of distinct non-missing values."""
    return df.nunique()


def value_ratios(c):
    """Return a dict mapping each value to its share of the rows, in percent."""
    percentages = c.value_counts(normalize=True) * 100
    return dict(percentages.round(1))


# One row per statistic (labelled by the function name), one column per feature.
info = X_df.agg([missing_ratio, num_values, value_ratios])

# "Bảo hành" is always dropped; every column with more than 10% missing values
# is appended after it.
missing_pct = info.loc['missing_ratio']
drop_cols = ["Bảo hành"] + [col for col in X_df.columns if missing_pct[col] > 10]

# Xử lí tập training

In [5]:
# drop_cols = ['Bảo hành', 'Bảo mật', 'Tính năng', 'Phụ kiện đi kèm', 'Ổ đĩa quang', 'Mô tả bảo hành', 'Mic', 
#             'Kích thước', 'Pin', 'Khối lượng', 'Kiểu khe M.2 hỗ trợ']
class ColAdderDropper(BaseEstimator, TransformerMixin):
    """Add a ``Title`` feature derived from a series column and drop columns.

    ``fit`` counts the values of ``col_titles`` and remembers the
    ``num_top_titles`` most frequent ones. ``transform`` returns a copy of
    the input in which

    * the ``"Name"`` column is removed,
    * ``col_titles`` is replaced by a ``"Title"`` column,
    * every title seen during ``fit`` that is not among the top titles is
      rewritten to ``"Others"`` (titles never seen during ``fit`` and missing
      values are left untouched),
    * the columns listed in ``drop_columns`` are removed when present.

    Parameters
    ----------
    num_top_titles : int, default=1
        How many of the most frequent titles keep their own value. At least
        one title is always kept.
    col_titles : str, default='Series laptop'
        Name of the column the ``Title`` feature is derived from.
    drop_columns : list of str or None, default=None
        Extra columns to remove in ``transform``. ``None`` removes nothing.
        The list is only read, never modified.

    Attributes
    ----------
    title_counts_ : pandas.Series
        Frequency of every title seen during ``fit``.
    top_titles_ : list
        The titles that are kept as-is.
    """

    def __init__(self, num_top_titles=1, col_titles='Series laptop', drop_columns=None):
        # scikit-learn convention: store constructor arguments unchanged so
        # that get_params / set_params / clone work. In particular the
        # drop list comes from the argument, not from a notebook global.
        self.num_top_titles = num_top_titles
        self.col_titles = col_titles
        self.drop_columns = drop_columns

    def fit(self, X_df, y=None):
        """Learn the title frequencies and the list of top titles from ``X_df``."""
        self.title_counts_ = X_df[self.col_titles].value_counts()
        titles = list(self.title_counts_.index)
        # value_counts sorts by frequency, so a prefix is the top-N; the
        # clamp keeps the count between 1 and the number of distinct titles.
        n_keep = max(1, min(self.num_top_titles, len(titles)))
        self.top_titles_ = titles[:n_keep]
        return self

    def transform(self, X_df, y=None):
        """Return a transformed copy of ``X_df``; the input is not modified."""
        new_df = X_df.drop(columns=["Name"])
        new_df["Title"] = new_df[self.col_titles]
        new_df = new_df.drop(columns=[self.col_titles])

        if self.drop_columns is not None:
            # errors="ignore" keeps the drop best-effort (absent columns are
            # skipped) without hiding unrelated exceptions, and the
            # configured list is copied rather than mutated so every call
            # to transform behaves the same.
            new_df = new_df.drop(columns=list(self.drop_columns), errors="ignore")

        kept = set(self.top_titles_)
        rare_titles = [title for title in self.title_counts_.index if title not in kept]
        new_df.loc[new_df["Title"].isin(rare_titles), "Title"] = "Others"
        return new_df


In [6]:
col_title = "Series laptop"
num_top_titles = 20

# Columns handed to the encoder: every feature except the dropped ones, the
# "Name" column and the raw series column, plus the derived "Title" column.
# The list is built by filtering X_df.columns instead of set arithmetic so
# the feature order is the same on every run; iterating a set of strings
# depends on hash randomisation and can differ between kernel sessions.
excluded_cols = set(drop_cols) | {"Name", col_title}
columns = [col for col in X_df.columns if col not in excluded_cols] + ["Title"]
print(columns)

# Every selected column is treated as categorical: fill missing values with
# the most frequent value, then one-hot encode. Categories unseen during fit
# are encoded as all zeros instead of raising.
steps = [('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
         ('encoder', OneHotEncoder(handle_unknown="ignore"))]
tsf = Pipeline(steps)
transformer = [('tsf', tsf, columns)]
transfer = ColumnTransformer(transformer)

# Full preprocessing: derive "Title" first, then encode the selected columns.
steps = [('col_adderdropper', ColAdderDropper(num_top_titles=num_top_titles, col_titles=col_title)),
         ('transfer', transfer)]
preprocess_pipeline = Pipeline(steps)
preprocessed_train_X = preprocess_pipeline.fit_transform(train_X_df)


['Cổng kết nối', 'Đèn LED trên máy', 'Bàn phím', 'Lưu trữ', 'Màn hình', 'Cổng xuất hình', 'Chip đồ họa', 'Hệ điều hành', 'RAM', 'Kết nối không dây', 'Màu sắc', 'Số cổng lưu trữ tối đa', 'Thương hiệu', 'Thế hệ CPU', 'CPU', 'URL', 'Khối lượng', 'Kiểu khe M.2 hỗ trợ', 'Pin', 'Kích thước', 'Part-number', 'Title']


# Xử lí tập validation và mô hình tốt nhất neural network

In [7]:
# Encode the validation split with the preprocessing that was fitted on the
# training split. Only transform() is used here: calling fit_transform() would
# re-learn the imputation values, categories and top titles from the
# validation rows themselves.
preprocessed_val_X = preprocess_pipeline.transform(val_X_df)

# One hidden layer of 100 units. Per the scikit-learn documentation,
# `learning_rate` is only used by the 'sgd' solver and `early_stopping` is
# only effective for 'sgd'/'adam', so with 'lbfgs' they have no effect; they
# are kept so the estimator's parameters stay as originally configured.
mlpregressor = MLPRegressor(hidden_layer_sizes=(100,), solver='lbfgs',
                            learning_rate='adaptive', random_state=0, max_iter=500,
                            early_stopping=True, verbose=1)

# NOTE: `transfer` is the same ColumnTransformer object used by
# preprocess_pipeline, so fitting full_pipeline re-fits it as well.
steps = [('col_adderdropper', ColAdderDropper(num_top_titles=num_top_titles, col_titles=col_title)),
         ('transfer', transfer), ('classifer', mlpregressor)]
full_pipeline = Pipeline(steps)

train_errs = []
val_errs = []

num_top_titles_s = [1, 10, 20, 30, 40, 50]
best_val_err = float('inf')
best_num_top_titles = None

# Try each candidate and keep the one with the lowest validation error.
# "Error" is (1 - score) * 100, where score is the pipeline's score on the
# given split (for a regressor scikit-learn's default score is R^2).
# The loop variable has its own name so the notebook-level `num_top_titles`
# setting is not overwritten.
for candidate in num_top_titles_s:
    full_pipeline.set_params(col_adderdropper__num_top_titles=candidate)
    full_pipeline.fit(train_X_df, train_y_sr)
    train_err = (1 - full_pipeline.score(train_X_df, train_y_sr)) * 100
    val_err = (1 - full_pipeline.score(val_X_df, val_y_sr)) * 100
    if val_err < best_val_err:
        best_val_err = val_err
        best_num_top_titles = candidate
    train_errs.append(train_err)
    val_errs.append(val_err)

In [8]:
# Refit the pipeline on all labelled rows (X_df / y_sr) using the best
# setting found during model selection, then report that setting and its
# validation error on separate lines.
full_pipeline.set_params(col_adderdropper__num_top_titles=best_num_top_titles).fit(X_df, y_sr)
print(best_num_top_titles, best_val_err, sep="\n")

20
13.227295033575025


# TESTING

In [9]:
# Load the held-out test file and split it into features and target.
test_df = pd.read_csv("./Data/test.csv")
test_X_df = test_df.drop(columns=["Cost"])
test_y_sr = test_df["Cost"]

In [10]:
# Predict on the test features and report the test error using the same
# (1 - score) * 100 formula as the validation error.
pred_y = full_pipeline.predict(test_X_df)
test_score = full_pipeline.score(test_X_df, test_y_sr)
test_err = (1 - test_score) * 100
test_err

12.21773214896038