In [36]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report

import numpy as np
import os



In [37]:
data = pd.read_csv("Week6_7/car.csv")
# ["displacement", "horsepower", "weight", "acceleration"]),
data.loc[data.horsepower == "?", "horsepower"] = np.nan

# profile = ProfileReport(data, title="Report", explorative=True)
# profile.to_file("car_report.html")

# mpg: Số dặm/gallon xăng (hiệu suất nhiên liệu)
# cylinders: Số xi-lanh động cơ (đều là 8)
# displacement: Dung tích động cơ (cc)
# horsepower: Mã lực
# weight: Trọng lượng xe (pounds)
# acceleration: Thời gian tăng tốc (giây)
# model year: Năm sản xuất (1970)
# origin: Xuất xứ (1 = Mỹ)
# car name: Tên xe



In [None]:
# regression
target = "mpg"
x = data.drop(target, axis=1)
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OrdinalEncoder()),
])
					 		
preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ["displacement", "horsepower", "weight", "acceleration"]),
    ("nom_features", nom_transformer, ["origin", "car name"]),
    ("ord_features", ord_transformer, ["cylinders", "model year"]),
])

model = Pipeline(steps=[
    ("pre_processor", preprocessor),
    ("regressor", RandomForestRegressor())
])

model.fit(x_train, y_train)

y_predict = model.predict(x_test)
print('MAE:', mean_absolute_error(y_test, y_predict))
print('RMSE:', mean_squared_error(y_test, y_predict))

MAE: 2.375912500000001
RMSE: 14.2734367375


In [None]:
#classification
data['mpg_binary'] = data['mpg'].apply(lambda x: 1 if x>=23 else 0)
data['mpg_binary'].value_counts()

target = "mpg_binary"
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OrdinalEncoder()),
])
					 		
preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, ["displacement", "horsepower", "weight", "acceleration"]),
    ("nom_features", nom_transformer, ["origin", "car name"]),
    ("ord_features", ord_transformer, ["cylinders", "model year"]),
])

model2 = Pipeline(steps=[
    ("pre_processor", preprocessor),
    ("regressor", RandomForestClassifier())
])

model2.fit(x_train, y_train)

y_predict2 = model2.predict(x_test)
print(classification_report(y_test, y_predict2))


              precision    recall  f1-score   support

           0       0.88      0.95      0.91        37
           1       0.95      0.88      0.92        43

    accuracy                           0.91        80
   macro avg       0.91      0.91      0.91        80
weighted avg       0.92      0.91      0.91        80



In [None]:
#chuyen doi de so sanh regression voi classification
yp_list = []
for yp in y_predict:
    if yp < 23:
        yp_list.append(0)
    else:
        yp_list.append(1)
# print(yp_list)

print(classification_report(y_test, yp_list))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89        37
           1       0.95      0.84      0.89        43

    accuracy                           0.89        80
   macro avg       0.89      0.89      0.89        80
weighted avg       0.89      0.89      0.89        80

