In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error

In [2]:
df = pd.read_csv('dulieu/laptop_price - dataset.csv')
df.head(5)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (Euro)
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,2.3,8,128GB SSD,Intel,Iris Plus Graphics 640,macOS,1.37,1339.69
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel,Core i5,1.8,8,128GB Flash Storage,Intel,HD Graphics 6000,macOS,1.34,898.94
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel,Core i5 7200U,2.5,8,256GB SSD,Intel,HD Graphics 620,No OS,1.86,575.0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel,Core i7,2.7,16,512GB SSD,AMD,Radeon Pro 455,macOS,1.83,2537.45
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,3.1,8,256GB SSD,Intel,Iris Plus Graphics 650,macOS,1.37,1803.6


In [3]:
df.describe()

Unnamed: 0,Inches,CPU_Frequency (GHz),RAM (GB),Weight (kg),Price (Euro)
count,1275.0,1275.0,1275.0,1275.0,1275.0
mean,15.022902,2.30298,8.440784,2.040525,1134.969059
std,1.42947,0.503846,5.097809,0.669196,700.752504
min,10.1,0.9,2.0,0.69,174.0
25%,14.0,2.0,4.0,1.5,609.0
50%,15.6,2.5,8.0,2.04,989.0
75%,15.6,2.7,8.0,2.31,1496.5
max,18.4,3.6,64.0,4.7,6099.0


In [4]:
df.isnull().sum()

Company                0
Product                0
TypeName               0
Inches                 0
ScreenResolution       0
CPU_Company            0
CPU_Type               0
CPU_Frequency (GHz)    0
RAM (GB)               0
Memory                 0
GPU_Company            0
GPU_Type               0
OpSys                  0
Weight (kg)            0
Price (Euro)           0
dtype: int64

In [5]:
df.isna().sum()

Company                0
Product                0
TypeName               0
Inches                 0
ScreenResolution       0
CPU_Company            0
CPU_Type               0
CPU_Frequency (GHz)    0
RAM (GB)               0
Memory                 0
GPU_Company            0
GPU_Type               0
OpSys                  0
Weight (kg)            0
Price (Euro)           0
dtype: int64

In [6]:
import re

onehotencoder_columns = ['Company', 'Product', 'TypeName', 'GPU_Company', 'GPU_Type', 'OpSys', 'CPU_Company', 'CPU_Type']
for column in onehotencoder_columns:
    encoder = OneHotEncoder(sparse_output=False)
    df_encoded = encoder.fit_transform(df[[column]])
    df_encoded = pd.DataFrame(df_encoded, columns=encoder.get_feature_names_out(encoder.feature_names_in_))

    df = pd.concat([df.drop(column, axis=1), df_encoded], axis=1)

def getMemory(text):    
    pattern = r'\d{3}'
    match = re.search(pattern, text)
    if match:
        memory = match.group()
        return memory
    return ''

df['Memory'] = df['Memory'].apply(getMemory)

def getResolution(text):    
    pattern = r'\d{3,4}x\d{3,4}'
    match = re.search(pattern, text)
    if match:
        resolution = match.group()
        return resolution
    return ''

df['ScreenResolution'] = df['ScreenResolution'].apply(getResolution)

screen_encoder = LabelEncoder()
df['ScreenResolution'] = screen_encoder.fit_transform(df['ScreenResolution'])

df.rename(columns={'CPU_Frequency (GHz)': 'CPU_Frequency', 'RAM (GB)': 'Ram', 'Weight (kg)': 'Weight', 'Price (Euro)': 'Price'}, inplace=True)

df = pd.concat([df.drop('Price', axis=1), df['Price']], axis=1)

df.head(5)

Unnamed: 0,Inches,ScreenResolution,CPU_Frequency,Ram,Memory,Weight,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,CPU_Type_Pentium Dual Core N4200,CPU_Type_Pentium Quad Core N3700,CPU_Type_Pentium Quad Core N3710,CPU_Type_Pentium Quad Core N4200,CPU_Type_Ryzen 1600,CPU_Type_Ryzen 1700,CPU_Type_Xeon E3-1505M V6,CPU_Type_Xeon E3-1535M v5,CPU_Type_Xeon E3-1535M v6,Price
0,13.3,10,2.3,8,128,1.37,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1339.69
1,13.3,1,1.8,8,128,1.34,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,898.94
2,15.6,3,2.5,8,256,1.86,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,575.0
3,15.4,12,2.7,16,512,1.83,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2537.45
4,13.3,10,3.1,8,256,1.37,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1803.6


In [7]:
df['Memory'] = df['Memory'].replace('', 0)
df['Memory'] = pd.to_numeric(df['Memory'], errors='coerce')
df['Memory'] = df['Memory'].fillna(0).astype(int)
df['Memory']

normalize_columns = ['Inches', 'ScreenResolution', 'CPU_Frequency', 'Ram', 'Memory', 'Weight']
for column in normalize_columns:
    scaller = StandardScaler()
    df[column].values.astype(float)
    df[column] = scaller.fit_transform(df[[column]])

df.head(5)

Unnamed: 0,Inches,ScreenResolution,CPU_Frequency,Ram,Memory,Weight,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,...,CPU_Type_Pentium Dual Core N4200,CPU_Type_Pentium Quad Core N3700,CPU_Type_Pentium Quad Core N3710,CPU_Type_Pentium Quad Core N4200,CPU_Type_Ryzen 1600,CPU_Type_Ryzen 1700,CPU_Type_Xeon E3-1505M V6,CPU_Type_Xeon E3-1535M v5,CPU_Type_Xeon E3-1535M v6,Price
0,-1.205746,2.241164,-0.005918,-0.086499,-0.547674,-1.00238,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1339.69
1,-1.205746,-0.678609,-0.998674,-0.086499,-0.547674,-1.047227,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,898.94
2,0.403873,-0.02977,0.391185,-0.086499,0.181795,-0.269871,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,575.0
3,0.263906,2.890003,0.788288,1.483418,1.640732,-0.314718,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2537.45
4,-1.205746,2.241164,1.582493,-0.086499,0.181795,-1.00238,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1803.6


In [8]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [9]:
model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f'r_2 score: {r2_score(y_test, y_pred)}')
print(f'mae: {mean_absolute_error(y_test, y_pred)}')
print(f'mape: {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'rmse: {root_mean_squared_error(y_test, y_pred)}')

r_2 score: -5.100055915914972e+22
mae: 52499934725912.54
mape: 116526715226.39299
rmse: 158668670553067.44


In [None]:
sns.regplot(df, x='Inches', y='Price')