In [None]:
# Writing-score regression (active model: LinearRegression; the RandomForest grid search is kept commented out)
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

# Step 1: Load the dataset.
# NOTE(review): the file carries an .xls extension but is parsed with
# read_csv -- confirm it really is delimited text.
data = pd.read_csv("../datasets/StudentScore.xls")

# Optional profiling report (uncomment to generate Score.html):
# profile = ProfileReport(data, title="Score Report", explorative=True)
# profile.to_file("Score.html")

# Optional correlation check between the three score columns:
# print(data[["math score", "reading score", "writing score"]].corr())

# Step 2: Separate the features from the target and split into train/test.
target = "writing score"
y = data[target]
x = data.drop(columns=target)  # remove the target column; everything else is a feature

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# train/test split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2201,
)
# Optional validation split carved out of the training portion:
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=2201)

# Scaler + SVC pipeline; nothing else in the visible script refers to it.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("svc", SVC())])

# Numeric columns: fill missing cells with the column median, then
# standardize. Bundling both steps in a Pipeline avoids calling
# fit_transform separately for each preprocessing stage.
_numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_transformer = Pipeline(steps=_numeric_steps)

# output = num_transformer.fit_transform(x_train[["math score", "reading score"]])
# for i, j in zip(x_train[["math score", "reading score"]].values, output):
#     print("Before {}. After {}".format(i, j))
    
# Explicit low-to-high ordering for the parental education level.
education_levels = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
# The other integer-encoded columns take their categories from the values
# present in the data. NaN is dropped first: the imputer below fills missing
# cells before the encoder runs, so NaN must not become a category of its own.
genders = data["gender"].dropna().unique()
lunch_values = data["lunch"].dropna().unique()
test_values = data["test preparation course"].dropna().unique()

# Categorical columns encoded as integer codes: impute the most frequent
# value, then map each category to its index in the matching list.
# The i-th category list applies to the i-th column given to this
# transformer, so the list order here must follow that column order.
ordinal_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Step is called "encoder" (it was mislabelled "scaler") so that it matches
    # what it does and the naming used by nominal_transformer.
    ("encoder", OrdinalEncoder(categories=[education_levels, genders, lunch_values, test_values])),
])

# output = ordinal_transformer.fit_transform(x_train[["parental level of education"]])
# for i, j in zip(x_train[["parental level of education"]].values, output):
#     print("Before {}. After {}".format(i, j))
    
# Nominal (unordered) categorical columns: impute the most frequent value,
# then one-hot encode into a dense array.
nominal_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": a category that was absent when the encoder was
    # fitted (e.g. missing from the training split or from one CV fold) is
    # encoded as all zeros instead of raising during transform/predict.
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# output = ordinal_transformer.fit_transform(x_train[["parental level of education", "gender", "lunch", "test preparation course"]])
# print(output)
# for i, j in zip(x_train[["race/ethnicity"]].values, output):
#     print("Before {}. After {}".format(i, j))
    
# Route each group of columns to its dedicated preprocessing pipeline.
numeric_columns = ["reading score", "math score"]
# Order matters here: it has to line up with the category lists passed to
# the OrdinalEncoder inside ordinal_transformer.
ordinal_columns = ["parental level of education", "gender", "lunch", "test preparation course"]
nominal_columns = ["race/ethnicity"]

transformers = ColumnTransformer(
    transformers=[
        ("num_features", num_transformer, numeric_columns),
        ("ord_features", ordinal_transformer, ordinal_columns),
        ("nom_features", nominal_transformer, nominal_columns),
    ]
)

# Preprocessed training matrix, kept for manual inspection (see the
# commented-out loop that follows).
output = transformers.fit_transform(x_train)
# for i, j in zip(x_train.values, output):
#     print("Before {}. After {}".format(i, j))

# Step3: Train model
# reg = Pipeline(steps=[
#     ("preprocessor", transformers),
#     ("regressor", RandomForestRegressor())
# ])

# params = {
#     "regressor__n_estimators": [50, 100, 200],
#     "regressor__criterion": ["squared_error", "absolute_error", "friedman_mse"],
#     "preprocessor__num_features__imputer__strategy": ["mean", "median"],
# }

# model = GridSearchCV(
#     estimator=reg,
#     param_grid=params,   # for RandomizedSearchCV use param_distributions=params instead
#     # n_iter=30,         # number of combinations to try (RandomizedSearchCV only)
#     scoring="r2",
#     cv=6,
#     verbose=2,
#     n_jobs=-1,
# )

# Full model: the column preprocessing followed by an ordinary least-squares
# regressor, fitted on the training split. Pipeline.fit returns the pipeline
# itself, so `model` is the fitted estimator.
model = Pipeline(
    steps=[
        ("preprocessor", transformers),
        ("regressor", LinearRegression()),
    ]
).fit(x_train, y_train)

# print(model.best_params_)
# print(model.best_score_)

# Step 4: Evaluate on the held-out test split.
y_predict = model.predict(x_test)

# Optional side-by-side dump of predictions and ground truth:
# for i, j in zip(y_predict, y_test):
#     print("Prediction: {}. Actual value: {}".format(i, j))

# Report each regression metric as "<label>: <value>", one per line.
for label, metric in (
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("RMSE", root_mean_squared_error),
    ("R2", r2_score),
):
    print(f"{label}: {metric(y_test, y_predict)}")

# LinearRegression
# MAE: 2.9679477014885602
# MSE: 13.450248621196536
# RMSE: 3.6674580599096887
# R2: 0.9451155368159437

# RandomForest
# MAE: 3.1993066666666667
# MSE: 16.608628982222225
# RMSE: 4.075368570107767
# R2: 0.9322275958173825

# SVC
# MAE: 5.345
# MSE: 49.475
# RMSE: 7.033846742714829
# R2: 0.7981146005173531

MAE: 5.345
MSE: 49.475
RMSE: 7.033846742714829
R2: 0.7981146005173531
