In [17]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, r2_score

import numpy as np
import os

In [11]:
data = pd.read_csv("Week6_7/csgo.csv")

# 'map', 'day', 'month', 'year', 'date', 'wait_time_s', 'match_time_s', 'team_a_rounds', 'team_b_rounds', 'ping', 'kills', 'assists', 'deaths', 'mvps', 'hs_percent', 'points', 'result'

In [16]:
#classification

target = "result"
redundant = "date"
x = data.drop([target, redundant], axis=1)
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", OrdinalEncoder()),
])
					 		
preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, [
        'wait_time_s', 'match_time_s', 'ping', 
        'kills', 'assists', 'deaths', 
        'mvps', 'hs_percent', 'points'
    ]),
    ("nom_features", nom_transformer, [
        'map'
    ]),
    ("ord_features", ord_transformer, [
        'team_a_rounds', 
        'team_b_rounds'
    ]),
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=100))
])

params = {
    "classifier__n_estimators": [50, 100, 200, 500],
    "classifier__criterion": ["gini", "entropy", "log_loss"],
    "classifier__max_depth": [None, 2, 5, 10]
}

model = GridSearchCV(pipeline, param_grid=params, scoring="recall", cv=6, verbose=1, n_jobs=4)
model.fit(x_train, y_train)

print("Best score: {}".format(model.best_score_))
print("Best param: {}".format(model.best_params_))

y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))


Fitting 6 folds for each of 48 candidates, totalling 288 fits


Traceback (most recent call last):
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(


Best score: nan
Best param: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__n_estimators': 50}
              precision    recall  f1-score   support

        Lost       0.77      0.83      0.80       105
         Tie       1.00      0.96      0.98        26
         Win       0.80      0.74      0.77        96

    accuracy                           0.81       227
   macro avg       0.86      0.84      0.85       227
weighted avg       0.81      0.81      0.81       227



Traceback (most recent call last):
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/tuantai229/Projects/VietAI-x-CoderSchool/venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(


In [19]:
# regression

data2 = pd.read_csv("Week6_7/csgo.csv")

# Tạo target là tỉ lệ thắng
y = data2['team_a_rounds']/(data2['team_a_rounds'] + data2['team_b_rounds'])

redundant_columns = ['date', 'result', 'team_a_rounds', 'team_b_rounds']
x = data2.drop(redundant_columns, axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

nom_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num_features", num_transformer, [
        'day', 'month', 'year', 
        'wait_time_s', 'match_time_s', 'ping', 
        'kills', 'assists', 'deaths', 
        'mvps', 'hs_percent', 'points'
    ]),
    ("nom_features", nom_transformer, ['map'])
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=100))
])

params = {
    "regressor__n_estimators": [100, 200, 500],
    "regressor__max_depth": [None, 2, 5],
    "regressor__min_samples_split": [2, 5]
}

scoring = {
    'neg_mean_squared_error': 'neg_mean_squared_error',
    'neg_root_mean_squared_error': 'neg_root_mean_squared_error',
    'neg_mean_absolute_error': 'neg_mean_absolute_error',
    'r2': 'r2'
}

model = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring=scoring,
    refit='neg_root_mean_squared_error',  # chọn metric chính để optimize
    cv=6,
    verbose=1,
    n_jobs=4
)

model.fit(x_train, y_train)

print("\nBest parameters:", model.best_params_)
print("Best score:", model.best_score_)

y_pred = model.predict(x_test)

print("\nTest set evaluation:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")


Fitting 6 folds for each of 18 candidates, totalling 108 fits

Best parameters: {'regressor__max_depth': 2, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 500}
Best score: -0.19880616064877346

Test set evaluation:
MSE: 0.0395
RMSE: 0.1986
MAE: 0.1570
R2 Score: -0.0147
