In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load the cleaned dataset
data = pd.read_csv('cleaned_data.csv')

# Create 'is_paid' feature: 0 for free, 1 for paid
data['is_paid'] = data['Pricing'].apply(lambda x: 0 if x == 0.0 else 1)

# Extract 'Month' from 'Release Date'
data['Release Date'] = pd.to_datetime(data['Release Date'], errors='coerce')
data['Release Month'] = data['Release Date'].dt.month

# Filter only free games (is_paid == 0)
data_free = data[data['is_paid'] == 0]

# Select features and target, excluding 'Pricing'
features = ['Game Genre', 'Developer', 'Release Month']
target = 'Rating'

X = data_free[features]
y = data_free[target]

# Reset the index of y to align with X
y = y.reset_index(drop=True)

# Define categorical and numerical features
categorical_features = ['Game Genre', 'Developer']
numerical_features = ['Release Month']

# Preprocessing pipelines for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Apply preprocessing to the features
X_preprocessed = preprocessor.fit_transform(X)

# Convert the preprocessed features to a DataFrame
encoded_cat_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
encoded_num_features = numerical_features
all_features = list(encoded_cat_features) + encoded_num_features

X_preprocessed_df = pd.DataFrame(X_preprocessed.toarray(), columns=all_features)

# Reset the index of X_preprocessed_df to align with y
X_preprocessed_df = X_preprocessed_df.reset_index(drop=True)

# Split data into training and temporary sets (80% train, 20% temp)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_preprocessed_df, y, test_size=0.2, random_state=18
)

# Split temporary set into validation and test sets (50% each of temp -> 10% each of original)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=18
)

# Display the sizes of the splits
print(f'Training set size: {X_train.shape[0]} samples')
print(f'Validation set size: {X_valid.shape[0]} samples')
print(f'Test set size: {X_test.shape[0]} samples')


Training set size: 16585 samples
Validation set size: 2073 samples
Test set size: 2074 samples


In [17]:
# Cell 3: Define Parameter Grids for Regression Models
import itertools
# Parameter grids for each model
param_grids = {
    'Ridge': {
        'fit_intercept': [True, False],
        'normalize': [True, False],
        'alpha': [0.1, 1]
    },
    'Lasso': {
        'alpha': [0.01, 0.1, 1, 10]
    },
    'xgboost':{
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [1, 10]
    },
    'decision_tree': {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [ None, 'sqrt', 'log2']
    }
}

In [18]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Hàm cross-validation cho hồi quy
def cross_validate_regression(model, X, y, k=5):
    fold_size = len(X) // k
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    scores = {'mse': [], 'rmse': [], 'r2': []}
    
    for fold in range(k):
        start = fold * fold_size
        end = start + fold_size if fold != k-1 else len(X)
        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        
        # Convert X và y thành numpy arrays nếu là pandas DataFrame hoặc Series
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.values
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.values
        
        X_train_cv, y_train_cv = X[train_indices], y[train_indices]
        X_val_cv, y_val_cv = X[val_indices], y[val_indices]
        
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_val_cv)
        
        mse = mean_squared_error(y_val_cv, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_val_cv, y_pred)
        
        scores['mse'].append(mse)
        scores['rmse'].append(rmse)
        scores['r2'].append(r2)
        
    avg_scores = {metric: np.mean(values) for metric, values in scores.items()}
    return avg_scores

In [19]:
# Import necessary libraries
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import itertools

# Initialize variables to store the best results and all tuning outcomes for Ridge
ridge_best_score = -np.inf
ridge_best_mse = np.inf
ridge_best_params = {}
ridge_results = []

# Quá trình tuning tham số Ridge Regression
for fit_intercept in param_grids['Ridge']['fit_intercept']:
    for normalize in [True, False]:  # Giờ sử dụng chuẩn hóa với StandardScaler
        # Sử dụng pipeline với StandardScaler và Ridge
        if normalize:
            ridge_model = make_pipeline(StandardScaler(), Ridge(fit_intercept=fit_intercept))
        else:
            ridge_model = Ridge(fit_intercept=fit_intercept)
        
        # Đánh giá mô hình với k-fold cross-validation
        scores = cross_validate_regression(ridge_model, X_train, y_train, k=5)
        
        avg_r2 = scores['r2']
        avg_mse = scores['mse']
        
        # Lưu kết quả vào danh sách
        ridge_results.append({
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'R2': avg_r2,
            'MSE': avg_mse
        })
        
        # Cập nhật tham số tốt nhất
        if avg_r2 > ridge_best_score and avg_mse < ridge_best_mse:
            ridge_best_score = avg_r2
            ridge_best_mse = avg_mse
            ridge_best_params = {
                'fit_intercept': fit_intercept,
                'normalize': normalize
            }

# In tất cả các kết quả tuning
print("\nAll Ridge parameter tuning results:")
for result in ridge_results:
    print(f"fit_intercept = {result['fit_intercept']}, normalize = {result['normalize']}, R^2 = {result['R2']}, MSE = {result['MSE']}")

# In báo cáo quá trình tuning
print("\nFine-Tuning Ridge Completed!")
print(f"Best parameter: {ridge_best_params}")
print(f"Best R^2: {ridge_best_score}")
print(f"Best MSE: {ridge_best_mse}")

# Huấn luyện lại mô hình với tham số tốt nhất trên toàn bộ tập huấn luyện
if ridge_best_params['normalize']:
    ridge_best_model = make_pipeline(StandardScaler(), Ridge(fit_intercept=ridge_best_params['fit_intercept']))
else:
    ridge_best_model = Ridge(fit_intercept=ridge_best_params['fit_intercept'])

ridge_best_model.fit(X_train, y_train)

# Dự đoán trên tập kiểm tra và tính toán các chỉ số R², MSE, RMSE
ridge_y_pred_test = ridge_best_model.predict(X_test)
ridge_test_r2 = r2_score(y_test, ridge_y_pred_test)
ridge_test_mse = mean_squared_error(y_test, ridge_y_pred_test)
ridge_test_rmse = np.sqrt(ridge_test_mse)

# In kết quả trên tập kiểm tra
print("\nRidge Test Results:")
print(f"R^2: {ridge_test_r2}")
print(f"MSE: {ridge_test_mse}")
print(f"RMSE: {ridge_test_rmse}")



Tất cả các kết quả tuning tham số Ridge Regression:
fit_intercept = True, normalize = True, R² = -0.040943208198915504, MSE = 279.9311825045913
fit_intercept = True, normalize = False, R² = 0.1715295725415565, MSE = 222.7177978116731
fit_intercept = False, normalize = True, R² = -19.44473509642173, MSE = 5481.023646644213
fit_intercept = False, normalize = False, R² = -0.6372386033474914, MSE = 439.9111926524962

Quá trình tuning tham số Ridge Regression đã hoàn thành!
Tham số tốt nhất: {'fit_intercept': True, 'normalize': False}
Điểm R² tốt nhất (CV): 0.1715295725415565
MSE tốt nhất (CV): 222.7177978116731

Kết quả trên tập kiểm tra với tham số tốt nhất:
Điểm R² trên tập kiểm tra: 0.1812880254403204
MSE trên tập kiểm tra: 224.76617575542244
RMSE trên tập kiểm tra: 14.992203832506496


In [None]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Initialize variables to store the best results and all tuning outcomes
lasso_best_score = -np.inf
lasso_best_mse = np.inf
lasso_best_params = {}
lasso_results = []

# Fine-tuning Lasso Regression
for alpha in param_grids['Lasso']['alpha']:
    # Sử dụng pipeline với StandardScaler và Lasso
    model = make_pipeline(StandardScaler(), Lasso(alpha=alpha, random_state=18))
    
    # Đánh giá mô hình với k-fold cross-validation
    scores = cross_validate_regression(model, X_train, y_train, k=5)
    
    avg_r2 = scores['r2']
    avg_mse = scores['mse']
    
    # Lưu kết quả vào danh sách
    lasso_results.append({
        'alpha': alpha,
        'R2': avg_r2,
        'MSE': avg_mse
    })
    
    # Cập nhật tham số tốt nhất
    if avg_r2 > lasso_best_score and avg_mse < lasso_best_mse:
        lasso_best_score = avg_r2
        lasso_best_mse = avg_mse
        lasso_best_params = {'alpha': alpha}

# In tất cả các kết quả tuning
print("\nAll Lasso parameter tuning results:")
for result in lasso_results:
    print(f"alpha = {result['alpha']}, R^2 = {result['R2']}, MSE = {result['MSE']}")

# In báo cáo quá trình tuning
print("\nFine-Tuning Lasso Completed!")
print(f"Best parameter: {lasso_best_params}")
print(f"Best R^2: {lasso_best_score}")
print(f"Best MSE: {lasso_best_mse}")

# Huấn luyện lại mô hình với tham số tốt nhất trên toàn bộ tập huấn luyện
best_lasso_model = make_pipeline(StandardScaler(), Lasso(alpha=lasso_best_params['alpha'], random_state=18))
best_lasso_model.fit(X_train, y_train)

# Dự đoán trên tập kiểm tra và tính toán các chỉ số R², MSE, RMSE
lasso_y_pred_test = best_lasso_model.predict(X_test)
lasso_test_r2 = r2_score(y_test, lasso_y_pred_test)
lasso_test_mse = mean_squared_error(y_test, lasso_y_pred_test)
lasso_test_rmse = np.sqrt(lasso_test_mse)

# In kết quả trên tập kiểm tra
print("\nLasso Test Results:")
print(f"R^2: {lasso_test_r2}")
print(f"MSE: {lasso_test_mse}")
print(f"RMSE: {lasso_test_rmse}")


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Initialize variables to store the best results and all tuning outcomes
xgb_best_score = -np.inf
xgb_best_mse = np.inf
xgb_best_params = {}
xgb_results = []

# Fine-tuning XGBoost with parameter grid
for n_estimators, learning_rate, max_depth, subsample, colsample_bytree, reg_alpha, reg_lambda in itertools.product(
    param_grids['xgboost']['n_estimators'],
    param_grids['xgboost']['learning_rate'],
    param_grids['xgboost']['max_depth'],
    param_grids['xgboost']['subsample'],
    param_grids['xgboost']['colsample_bytree'],
    param_grids['xgboost']['reg_alpha'],
    param_grids['xgboost']['reg_lambda']
):
    # Create XGBoost Regressor model with the parameters
    model = make_pipeline(
        StandardScaler(),  # Normalize data
        XGBRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            objective='reg:squarederror',
            verbosity=0
        )
    )
    
    # Cross-validate the model
    scores = cross_validate_regression(model, X_train, y_train, k=5)
    avg_r2 = scores['r2']
    avg_mse = scores['mse']
    
    # Append results to the list
    xgb_results.append({
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'R2': avg_r2,
        'MSE': avg_mse
    })
    
    # Update the best model parameters if current R² is higher
    if avg_r2 > xgb_best_score and avg_mse < xgb_best_mse:
        xgb_best_score = avg_r2
        xgb_best_mse = avg_mse
        xgb_best_params = {
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda
        }

# Print all tuning results
print("\nAll XGBoost parameter tuning results:")
for result in xgb_results:
    print(f"n_estimators = {result['n_estimators']}, learning_rate = {result['learning_rate']}, "
          f"max_depth = {result['max_depth']}, subsample = {result['subsample']}, "
          f"colsample_bytree = {result['colsample_bytree']}, reg_alpha = {result['reg_alpha']}, "
          f"reg_lambda = {result['reg_lambda']}, R² = {result['R2']}, MSE = {result['MSE']}")

# Fine-tuning report
print("\nFine-Tuning XGBoost Completed!")
print(f"Best Parameters: {xgb_best_params}")
print(f"Best R²: {xgb_best_score}")
print(f"Best MSE: {xgb_best_mse}")

# Train the best XGBoost model on the full training data
best_xgb_model = make_pipeline(
    StandardScaler(),
    XGBRegressor(
        n_estimators=xgb_best_params['n_estimators'],
        learning_rate=xgb_best_params['learning_rate'],
        max_depth=xgb_best_params['max_depth'],
        subsample=xgb_best_params['subsample'],
        colsample_bytree=xgb_best_params['colsample_bytree'],
        reg_alpha=xgb_best_params['reg_alpha'],
        reg_lambda=xgb_best_params['reg_lambda'],
        objective='reg:squarederror',
        verbosity=0
    )
)

best_xgb_model.fit(X_train, y_train)

# Predict on the test set and calculate R², MSE
xgb_y_pred_test = best_xgb_model.predict(X_test)
xgb_test_r2 = r2_score(y_test, xgb_y_pred_test)
xgb_test_mse = mean_squared_error(y_test, xgb_y_pred_test)
xgb_test_rmse = np.sqrt(xgb_test_mse)

# Print test results
print("\nXGBoost Test Results:")
print(f"R²: {xgb_test_r2}")
print(f"MSE: {xgb_test_mse}")
print(f"RMSE: {xgb_test_rmse}")


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import itertools

# Initialize variables to store the best results and all tuning outcomes
dt_best_score = -np.inf
dt_best_mse = np.inf
dt_best_params = {}
dt_results = []

# Fine-tuning Decision Tree with parameter grid
for max_depth, min_samples_split, min_samples_leaf, max_features in itertools.product(
    param_grids['decision_tree']['max_depth'],
    param_grids['decision_tree']['min_samples_split'],
    param_grids['decision_tree']['min_samples_leaf'],
    param_grids['decision_tree']['max_features']
):
    # Create Decision Tree Regressor model
    model = make_pipeline(
        StandardScaler(),  # Normalize data
        DecisionTreeRegressor(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=18
        )
    )

    # Cross-validate the model
    scores = cross_validate_regression(model, X_train, y_train, k=5)
    avg_r2 = scores['r2']
    avg_mse = scores['mse']

    # Append results to the list
    dt_results.append({
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features,
        'R2': avg_r2,
        'MSE': avg_mse
    })

    # Update the best model parameters if current R² is higher
    if avg_r2 > dt_best_score and avg_mse < dt_best_mse:
        dt_best_score = avg_r2
        dt_best_mse = avg_mse
        dt_best_params = {
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'max_features': max_features
        }

# Print all tuning results
print("\nAll Decision Tree parameter tuning results:")
for result in dt_results:
    print(f"max_depth = {result['max_depth']}, min_samples_split = {result['min_samples_split']}, "
          f"min_samples_leaf = {result['min_samples_leaf']}, max_features = {result['max_features']}, "
          f"R² = {result['R2']}, MSE = {result['MSE']}")

# Fine-tuning report
print("\nFine-Tuning Decision Tree Completed!")
print(f"Best Parameters: {dt_best_params}")
print(f"Best R²: {dt_best_score}")
print(f"Best MSE: {dt_best_mse}")

# Train the best Decision Tree model on the full training data
best_dt_model = make_pipeline(
    StandardScaler(),
    DecisionTreeRegressor(
        max_depth=dt_best_params['max_depth'],
        min_samples_split=dt_best_params['min_samples_split'],
        min_samples_leaf=dt_best_params['min_samples_leaf'],
        max_features=dt_best_params['max_features'],
        random_state=18
    )
)

best_dt_model.fit(X_train, y_train)

# Predict on the test set and calculate R², MSE
dt_y_pred_test = best_dt_model.predict(X_test)
dt_test_r2 = r2_score(y_test, dt_y_pred_test)
dt_test_mse = mean_squared_error(y_test, dt_y_pred_test)
dt_test_rmse = np.sqrt(dt_test_mse)

# Print test results
print("\nDecision Tree Test Results:")
print(f"R²: {dt_test_r2}")
print(f"MSE: {dt_test_mse}")
print(f"RMSE: {dt_test_rmse}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Lưu các kết quả của 3 mô hình vào một dictionary
results = {
    'Model': ['Ridge', 'Lasso', 'xgboost', 'decision_tree'],
    'R²': [ridge_test_r2, lasso_test_r2, xgb_test_r2, dt_test_r2],
    'MSE': [ridge_test_mse, lasso_test_mse, xgb_test_mse, dt_test_mse],
    'RMSE': [ridge_test_rmse, lasso_test_rmse, xgb_test_rmse, dt_test_rmse]
}

# Chọn mô hình tốt nhất (Dựa trên MSE hoặc R² cao nhất)
best_model_index = np.argmin(results['MSE'])  # Chọn mô hình có MSE thấp nhất
best_model_name = results['Model'][best_model_index]
best_model_r2 = results['R²'][best_model_index]
best_model_mse = results['MSE'][best_model_index]
best_model_rmse = results['RMSE'][best_model_index]

# In kết quả mô hình tốt nhất
print(f"Best Model: {best_model_name}")
print(f"Best Model R²: {best_model_r2}")
print(f"Best Model MSE: {best_model_mse}")
print(f"Best Model RMSE: {best_model_rmse}")

# Vẽ biểu đồ so sánh MSE, RMSE và R² của các mô hình
fig, ax = plt.subplots(1, 3, figsize=(20, 6))

# Màu sắc để làm nổi bật mô hình tốt nhất
colors = ['lightblue' if i != best_model_index else 'lightgreen' for i in range(len(results['Model']))]

# Biểu đồ MSE
sns.barplot(x=results['Model'], y=results['MSE'], ax=ax[0], palette=colors)
ax[0].set_title('Mean Squared Error (MSE)', fontsize=14)
ax[0].set_xlabel('Model', fontsize=12)
ax[0].set_ylabel('MSE', fontsize=12)
ax[0].bar_label(ax[0].containers[0], fmt='%.2f', fontsize=10)

# Biểu đồ RMSE
sns.barplot(x=results['Model'], y=results['RMSE'], ax=ax[1], palette=colors)
ax[1].set_title('Root Mean Squared Error (RMSE)', fontsize=14)
ax[1].set_xlabel('Model', fontsize=12)
ax[1].set_ylabel('RMSE', fontsize=12)
ax[1].bar_label(ax[1].containers[0], fmt='%.2f', fontsize=10)

# Biểu đồ R²
sns.barplot(x=results['Model'], y=results['R²'], ax=ax[2], palette=colors)
ax[2].set_title('R² Score', fontsize=14)
ax[2].set_xlabel('Model', fontsize=12)
ax[2].set_ylabel('R²', fontsize=12)
ax[2].bar_label(ax[2].containers[0], fmt='%.2f', fontsize=10)

# Làm nổi bật mô hình tốt nhất
for a in ax:
    a.axhline(0, color='black', linewidth=0.8, linestyle='--')
    a.tick_params(axis='x', labelrotation=15)

plt.suptitle('Comparison of Model Performance', fontsize=16, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# In bảng kết quả để so sánh
print("\nComparison of Models:")
for model, mse, rmse, r2 in zip(results['Model'], results['MSE'], results['RMSE'], results['R²']):
    print(f"{model} - MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")
