In [17]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, r2_score

import numpy as np
import os

In [11]:
data = pd.read_csv("Week6_7/csgo.csv")

# 'map', 'day', 'month', 'year', 'date', 'wait_time_s', 'match_time_s', 'team_a_rounds', 'team_b_rounds', 'ping', 'kills', 'assists', 'deaths', 'mvps', 'hs_percent', 'points', 'result'

In [20]:
#classification

target = "result"
redundant = "date"
x = data.drop([target, redundant], axis=1)
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OrdinalEncoder()),
])
					 		
preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, [
        'wait_time_s', 'match_time_s', 'ping', 
        'kills', 'assists', 'deaths', 
        'mvps', 'hs_percent', 'points'
    ]),
    ("nom_features", nom_transformer, [
        'map'
    ]),
    ("ord_features", ord_transformer, [
        'team_a_rounds', 
        'team_b_rounds'
    ]),
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=100))
])

params = {
    "classifier__n_estimators": [50, 100, 200, 500],
    "classifier__criterion": ["gini", "entropy", "log_loss"],
    "classifier__max_depth": [None, 2, 5, 10]
}

model = GridSearchCV(pipeline, param_grid=params, scoring="accuracy", cv=6, verbose=1, n_jobs=4)
model.fit(x_train, y_train)

print("Best score: {}".format(model.best_score_))
print("Best param: {}".format(model.best_params_))

y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))


Fitting 6 folds for each of 48 candidates, totalling 288 fits
Best score: 0.8145695364238411
Best param: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__n_estimators': 200}
              precision    recall  f1-score   support

        Lost       0.79      0.85      0.82       105
         Tie       1.00      1.00      1.00        26
         Win       0.82      0.76      0.79        96

    accuracy                           0.83       227
   macro avg       0.87      0.87      0.87       227
weighted avg       0.83      0.83      0.83       227



In [21]:
# regression

data2 = pd.read_csv("Week6_7/csgo.csv")

# Tạo target là tỉ lệ thắng
y = data2['points']

redundant_columns = ['date']
x = data2.drop(redundant_columns, axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, [
        'day', 'month', 'year', 
        'wait_time_s', 'match_time_s', 'ping', 
        'kills', 'assists', 'deaths', 
        'mvps', 'hs_percent', 'points'
    ]),
    ("nom_features", nom_transformer, ['map'])
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=100))
])

params = {
    "regressor__n_estimators": [100, 200, 500],
    "regressor__max_depth": [None, 2, 5],
    "regressor__min_samples_split": [2, 5]
}

scoring = {
    'neg_mean_squared_error': 'neg_mean_squared_error',
    'neg_root_mean_squared_error': 'neg_root_mean_squared_error',
    'neg_mean_absolute_error': 'neg_mean_absolute_error',
    'r2': 'r2'
}

model = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring=scoring,
    refit='neg_root_mean_squared_error',  # chọn metric chính để optimize
    cv=6,
    verbose=1,
    n_jobs=4
)

model.fit(x_train, y_train)

print("\nBest parameters:", model.best_params_)
print("Best score:", model.best_score_)

y_pred = model.predict(x_test)

print("\nTest set evaluation:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")


Fitting 6 folds for each of 18 candidates, totalling 108 fits

Best parameters: {'regressor__max_depth': None, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best score: -0.2769184532271449

Test set evaluation:
MSE: 0.2603
RMSE: 0.5102
MAE: 0.0901
R2 Score: 0.9990
