In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load the cleaned dataset
data = pd.read_csv('cleaned_data.csv')

# Flag paid games: 0 when 'Pricing' equals 0.0, 1 for every other value.
# A missing price compares unequal to 0.0 and is therefore flagged as paid.
data['is_paid'] = data['Pricing'].ne(0.0).astype(int)

# Extract the release month. Unparseable dates are coerced to NaT, so their
# month is NaN and gets filled in by the numerical imputer below.
data['Release Date'] = pd.to_datetime(data['Release Date'], errors='coerce')
data['Release Month'] = data['Release Date'].dt.month

# Keep only free games (is_paid == 0)
data_free = data[data['is_paid'] == 0]

# Select features and target, excluding 'Pricing'
features = ['Game Genre', 'Developer', 'Release Month']
target = 'Rating'

X = data_free[features]
# Drop the original row labels so y lines up with the 0..n-1 index of the
# preprocessed feature frame built below.
y = data_free[target].reset_index(drop=True)

# Define categorical and numerical features
categorical_features = ['Game Genre', 'Developer']
numerical_features = ['Release Month']

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen at fit time are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical columns: fill gaps with the column mean.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Decide split membership BEFORE fitting any preprocessing, so the imputers and
# the encoder never see validation/test rows. Splitting row positions with the
# same sizes and seeds selects exactly the rows that splitting the transformed
# frame would select: 80% train, then the remaining 20% halved into
# validation and test (10% each of the original).
row_positions = np.arange(len(X))
train_pos, temp_pos = train_test_split(
    row_positions, test_size=0.2, random_state=18
)
valid_pos, test_pos = train_test_split(
    temp_pos, test_size=0.5, random_state=18
)

# Learn imputation values and one-hot categories from the training rows only,
# then apply that fitted transformation to every row.
preprocessor.fit(X.iloc[train_pos])
X_preprocessed = preprocessor.transform(X)

# Recover column names for the transformed matrix
encoded_cat_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
encoded_num_features = numerical_features
all_features = list(encoded_cat_features) + encoded_num_features

# ColumnTransformer hands back a sparse matrix only when the stacked output is
# sparse enough; accept a dense result as well instead of assuming `.toarray()`.
if hasattr(X_preprocessed, 'toarray'):
    X_dense = X_preprocessed.toarray()
else:
    X_dense = np.asarray(X_preprocessed)
X_preprocessed_df = pd.DataFrame(X_dense, columns=all_features)

# Materialise the splits; X_* and y_* share the same positional index labels.
X_train, y_train = X_preprocessed_df.iloc[train_pos], y.iloc[train_pos]
X_temp, y_temp = X_preprocessed_df.iloc[temp_pos], y.iloc[temp_pos]
X_valid, y_valid = X_preprocessed_df.iloc[valid_pos], y.iloc[valid_pos]
X_test, y_test = X_preprocessed_df.iloc[test_pos], y.iloc[test_pos]

# Every row must land in exactly one of the three final splits.
assert len(X_train) + len(X_valid) + len(X_test) == len(X_preprocessed_df)

# Display the sizes of the splits
print(f'Training set size: {X_train.shape[0]} samples')
print(f'Validation set size: {X_valid.shape[0]} samples')
print(f'Test set size: {X_test.shape[0]} samples')


Training set size: 16585 samples
Validation set size: 2073 samples
Test set size: 2074 samples


In [10]:
# Cell 3: Define Parameter Grids for Regression Models
import itertools
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Hyper-parameter search space, keyed by model name. Each inner mapping goes
# from a parameter name to the list of candidate values to try.
param_grids = dict(
    LinearRegression=dict(
        fit_intercept=[True, False],
    ),
    # Keys follow the `<pipeline step>__<parameter>` naming pattern.
    PolynomialRegression=dict(
        polynomialfeatures__degree=[2, 3, 4],
        polynomialfeatures__include_bias=[False],
        linearregression__fit_intercept=[True, False],
        linearregression__normalize=[True, False],
    ),
    RandomForestRegressor=dict(
        n_estimators=[100, 200],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5],
    ),
    MLPRegressor=dict(
        hidden_layer_sizes=[(50,), (100,), (100, 50)],
        activation=['tanh', 'relu'],
        solver=['adam', 'sgd'],
        alpha=[0.0001, 0.001],
        learning_rate=['constant', 'adaptive'],
        max_iter=[200, 300, 500],
    ),
)

In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import pandas as pd

# Cross-validation helper for regression models
def cross_validate_regression(model, X, y, k=5, random_state=42):
    """Score a regressor with shuffled k-fold cross-validation.

    The samples are shuffled once, cut into ``k`` folds of near-equal size, and
    each fold is used once as the validation set while ``model`` is fitted on
    the remaining folds.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last fold.
    X : pandas.DataFrame, pandas.Series or numpy.ndarray
        Feature matrix, one row per sample.
    y : pandas.DataFrame, pandas.Series or numpy.ndarray
        Target values, same length as ``X``.
    k : int, default 5
        Number of folds; must satisfy ``2 <= k <= len(X)``.
    random_state : int or None, default 42
        Seed for the shuffle. A fixed seed gives every call the same folds, so
        different models or hyper-parameters are compared on identical data
        partitions. Pass ``None`` for a fresh, non-reproducible shuffle.

    Returns
    -------
    dict
        ``{'mse': ..., 'rmse': ..., 'r2': ...}``, each the mean over the folds.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or ``k`` is outside
        ``[2, len(X)]``.
    """
    # Convert pandas containers to plain arrays once, so that the positional
    # fancy indexing below works regardless of the original index labels.
    X_arr = X.values if isinstance(X, (pd.DataFrame, pd.Series)) else X
    y_arr = y.values if isinstance(y, (pd.DataFrame, pd.Series)) else y

    n_samples = len(X_arr)
    if len(y_arr) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y_arr)}"
        )
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n_samples}), got {k}"
        )

    # Seeded shuffle, then split into k folds whose sizes differ by at most one.
    rng = np.random.default_rng(random_state)
    folds = np.array_split(rng.permutation(n_samples), k)

    scores = {'mse': [], 'rmse': [], 'r2': []}
    for fold_number, val_indices in enumerate(folds):
        # Train on every fold except the one held out for validation.
        train_indices = np.concatenate(folds[:fold_number] + folds[fold_number + 1:])

        model.fit(X_arr[train_indices], y_arr[train_indices])
        y_pred = model.predict(X_arr[val_indices])
        y_true = y_arr[val_indices]

        mse = mean_squared_error(y_true, y_pred)
        scores['mse'].append(mse)
        scores['rmse'].append(np.sqrt(mse))
        scores['r2'].append(r2_score(y_true, y_pred))

    return {metric: np.mean(values) for metric, values in scores.items()}

In [None]:
# Cell 5: Hyperparameter Tuning for Linear Regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
import itertools
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score


# Containers for the tuning results
best_lr_score = -np.inf
best_lr_mse = np.inf
best_lr_params = {}
lr_results = []


def _build_linear_model(fit_intercept, normalize):
    """Return a LinearRegression, preceded by a StandardScaler when `normalize` is true."""
    regressor = LinearRegression(fit_intercept=fit_intercept)
    if normalize:
        return make_pipeline(StandardScaler(), regressor)
    return regressor


# Grid search over fit_intercept x normalize. "normalize" means standardising
# the features with StandardScaler before the regression.
for fit_intercept, normalize in itertools.product(
    param_grids['LinearRegression']['fit_intercept'],
    [True, False]
):
    model = _build_linear_model(fit_intercept, normalize)

    # Score the candidate with k-fold cross-validation
    scores = cross_validate_regression(model, X_train, y_train, k=5)

    avg_r2 = scores['r2']
    avg_mse = scores['mse']

    # Record every candidate for the report below
    lr_results.append({
        'fit_intercept': fit_intercept,
        'normalize': normalize,
        'R2': avg_r2,
        'MSE': avg_mse
    })

    # Select on R² alone, like the other tuning cells. Requiring R² AND MSE to
    # improve together could skip a candidate with a higher R², leaving a
    # "best" entry that is not actually the best on the reported metric.
    if avg_r2 > best_lr_score:
        best_lr_score = avg_r2
        best_lr_mse = avg_mse
        best_lr_params = {
            'fit_intercept': fit_intercept,
            'normalize': normalize
        }

# Nothing is selected when every candidate scored NaN; fail with a clear
# message instead of a KeyError further down.
if not best_lr_params:
    raise RuntimeError(
        "LinearRegression tuning selected no candidate; check the CV scores in lr_results"
    )

# Print every tuning result
print("\nTất cả các kết quả tuning tham số LinearRegression:")
for result in lr_results:
    print(f"fit_intercept = {result['fit_intercept']}, normalize = {result['normalize']}, R² = {result['R2']}, MSE = {result['MSE']}")

# Print the tuning summary
print("\nQuá trình tuning tham số LinearRegression đã hoàn thành!")
print(f"Tham số tốt nhất: {best_lr_params}")
print(f"Điểm R² tốt nhất (CV): {best_lr_score}")
print(f"MSE tốt nhất (CV): {best_lr_mse}")

# Refit the best configuration on the whole training set
best_model = _build_linear_model(
    best_lr_params['fit_intercept'],
    best_lr_params['normalize']
)
best_model.fit(X_train, y_train)

# Predict on the test set and compute R², MSE and RMSE
y_pred_test = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = np.sqrt(test_mse)

# Print the test-set results
print("\nKết quả trên tập kiểm tra với tham số tốt nhất:")
print(f"Điểm R² trên tập kiểm tra: {test_r2}")
print(f"MSE trên tập kiểm tra: {test_mse}")
print(f"RMSE trên tập kiểm tra: {test_rmse}")

In [None]:
# Cell 6: Hyperparameter Tuning for Polynomial Regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Grid search for polynomial regression: PolynomialFeatures -> (optional
# StandardScaler) -> LinearRegression, scored by mean cross-validated R².
best_pr_score = -np.inf
best_pr_params = {}
pr_results = []

pr_grid = param_grids['PolynomialRegression']

# NOTE(review): the polynomial expansion grows quickly with the number of input
# columns; if X_train is a wide one-hot matrix, degrees 3-4 may be very slow or
# run out of memory. Confirm the feature count before running.
for degree, include_bias, fit_intercept, normalize in itertools.product(
    pr_grid['polynomialfeatures__degree'],
    pr_grid['polynomialfeatures__include_bias'],
    pr_grid['linearregression__fit_intercept'],
    pr_grid['linearregression__normalize']
):
    steps = [
        ('polynomialfeatures', PolynomialFeatures(
            degree=degree,
            include_bias=include_bias
        ))
    ]
    # LinearRegression no longer accepts `normalize` (removed in
    # scikit-learn 1.2); standardise with an explicit pipeline step instead,
    # the same way the LinearRegression tuning cell does.
    if normalize:
        steps.append(('standardscaler', StandardScaler()))
    steps.append(('linearregression', LinearRegression(
        fit_intercept=fit_intercept
    )))
    pipeline = Pipeline(steps)

    scores = cross_validate_regression(pipeline, X_train, y_train, k=5)
    avg_r2 = scores['r2']
    pr_results.append({
        'degree': degree,
        'include_bias': include_bias,
        'fit_intercept': fit_intercept,
        'normalize': normalize,
        'R2': avg_r2
    })
    # Keep the first candidate that reaches the highest R²
    if avg_r2 > best_pr_score:
        best_pr_score = avg_r2
        best_pr_params = {
            'degree': degree,
            'include_bias': include_bias,
            'fit_intercept': fit_intercept,
            'normalize': normalize
        }

print('Best PolynomialRegression Params:', best_pr_params)
print('Best PolynomialRegression CV R2:', best_pr_score)

In [None]:
# Cell 7: Hyperparameter Tuning for Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Exhaustive search over the RandomForestRegressor grid, ranked by the mean
# cross-validated R² of each candidate.
best_rf_score = -np.inf
best_rf_params = {}
rf_results = []

rf_grid = param_grids['RandomForestRegressor']
rf_param_names = ('n_estimators', 'max_depth', 'min_samples_split')

for n_estimators, max_depth, min_samples_split in itertools.product(
    *(rf_grid[name] for name in rf_param_names)
):
    # One dict per candidate, reused for the model, the log and the winner
    candidate = dict(zip(
        rf_param_names,
        (n_estimators, max_depth, min_samples_split)
    ))
    model = RandomForestRegressor(random_state=42, **candidate)

    scores = cross_validate_regression(model, X_train, y_train, k=5)
    avg_r2 = scores['r2']
    rf_results.append({**candidate, 'R2': avg_r2})

    # Strict comparison: on ties the earliest candidate is kept
    if avg_r2 > best_rf_score:
        best_rf_score, best_rf_params = avg_r2, candidate

print('Best RandomForestRegressor Params:', best_rf_params)
print('Best RandomForestRegressor CV R2:', best_rf_score)

In [None]:
# Cell 8: Hyperparameter Tuning for MLP Regressor
from sklearn.neural_network import MLPRegressor

# Exhaustive search over the MLPRegressor grid, ranked by the mean
# cross-validated R². max_iter is fixed at 500 for every candidate.
best_mlp_score = -np.inf
best_mlp_params = {}
mlp_results = []

mlp_grid = param_grids['MLPRegressor']
mlp_param_names = ('hidden_layer_sizes', 'activation', 'solver', 'alpha', 'learning_rate')

for hidden_layer_sizes, activation, solver, alpha, learning_rate in itertools.product(
    *(mlp_grid[name] for name in mlp_param_names)
):
    # One dict per candidate, reused for the model, the log and the winner
    candidate = dict(zip(
        mlp_param_names,
        (hidden_layer_sizes, activation, solver, alpha, learning_rate)
    ))
    model = MLPRegressor(max_iter=500, random_state=42, **candidate)

    # Best-effort search: a candidate that fails is reported and skipped so
    # the remaining combinations still get evaluated.
    try:
        scores = cross_validate_regression(model, X_train, y_train, k=5)
        avg_r2 = scores['r2']
        mlp_results.append({**candidate, 'R2': avg_r2})
        # Strict comparison: on ties the earliest candidate is kept
        if avg_r2 > best_mlp_score:
            best_mlp_score, best_mlp_params = avg_r2, candidate
    except Exception as e:
        print(f'Error with params {hidden_layer_sizes, activation, solver, alpha, learning_rate}: {e}')
        continue

print('Best MLPRegressor Params:', best_mlp_params)
print('Best MLPRegressor CV R2:', best_mlp_score)

In [None]:
# Cell 9: Train Best Models on Training Set and Evaluate on Test Set
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

def _fit_and_score(label, estimator):
    """Fit `estimator` on the training split and report its test-split metrics.

    Prints MSE, RMSE and R² prefixed with `label`, and returns
    ``(predictions, mse, rmse, r2)``.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    print(f'{label} Test MSE:', mse)
    print(f'{label} Test RMSE:', rmse)
    print(f'{label} Test R2:', r2)
    return predictions, mse, rmse, r2


# Linear Regression
# LinearRegression no longer accepts `normalize` (removed in scikit-learn 1.2).
# Rebuild the model the same way the tuning cell evaluated it: a StandardScaler
# step in front of the regressor when 'normalize' was selected.
# StandardScaler and make_pipeline are imported by that tuning cell, which must
# already have run to produce best_lr_params.
if best_lr_params['normalize']:
    best_lr = make_pipeline(
        StandardScaler(),
        LinearRegression(fit_intercept=best_lr_params['fit_intercept'])
    )
else:
    best_lr = LinearRegression(fit_intercept=best_lr_params['fit_intercept'])
lr_pred, lr_mse, lr_rmse, lr_r2 = _fit_and_score('LinearRegression', best_lr)

# Polynomial Regression
# Same replacement for `normalize`: scale the expanded features before the
# regression step.
pr_steps = [
    ('polynomialfeatures', PolynomialFeatures(
        degree=best_pr_params['degree'],
        include_bias=best_pr_params['include_bias']
    ))
]
if best_pr_params['normalize']:
    pr_steps.append(('standardscaler', StandardScaler()))
pr_steps.append(('linearregression', LinearRegression(
    fit_intercept=best_pr_params['fit_intercept']
)))
best_pr = Pipeline(pr_steps)
pr_pred, pr_mse, pr_rmse, pr_r2 = _fit_and_score('PolynomialRegression', best_pr)

# Random Forest Regressor
best_rf = RandomForestRegressor(
    n_estimators=best_rf_params['n_estimators'],
    max_depth=best_rf_params['max_depth'],
    min_samples_split=best_rf_params['min_samples_split'],
    random_state=42
)
rf_pred, rf_mse, rf_rmse, rf_r2 = _fit_and_score('RandomForestRegressor', best_rf)

# MLP Regressor (max_iter matches the value used during tuning)
best_mlp = MLPRegressor(
    hidden_layer_sizes=best_mlp_params['hidden_layer_sizes'],
    activation=best_mlp_params['activation'],
    solver=best_mlp_params['solver'],
    alpha=best_mlp_params['alpha'],
    learning_rate=best_mlp_params['learning_rate'],
    max_iter=500,
    random_state=42
)
mlp_pred, mlp_mse, mlp_rmse, mlp_r2 = _fit_and_score('MLPRegressor', best_mlp)

In [None]:
# Cell 10: Compare Model Performances
import pandas as pd

# One row per model: (name, MSE, RMSE, R²) measured on the test set
metric_rows = [
    ('LinearRegression', lr_mse, lr_rmse, lr_r2),
    ('PolynomialRegression', pr_mse, pr_rmse, pr_r2),
    ('RandomForestRegressor', rf_mse, rf_rmse, rf_r2),
    ('MLPRegressor', mlp_mse, mlp_rmse, mlp_r2),
]
performance = pd.DataFrame(metric_rows, columns=['Model', 'MSE', 'RMSE', 'R2'])

print('Model Comparison:')
print(performance)

In [None]:
# Cell 11: Report Fine-Tuning Process and Model Performances

print("=== Hyperparameter Tuning Results ===\n")

# (display name, best parameters, best cross-validated R²) for each model,
# in report order
tuning_summary = [
    ("Linear Regression", best_lr_params, best_lr_score),
    ("Polynomial Regression", best_pr_params, best_pr_score),
    ("Random Forest Regressor", best_rf_params, best_rf_score),
    ("MLP Regressor", best_mlp_params, best_mlp_score),
]
for position, (title, chosen_params, cv_r2) in enumerate(tuning_summary, start=1):
    print(f"{position}. **{title}**")
    print(f"   - Best Parameters: {chosen_params}")
    print(f"   - Best CV R² Score: {cv_r2}\n")

print("=== Model Performance on Test Set ===\n")

# Test-set metrics, one row per model, rendered as a Markdown table
test_rows = [
    ('Linear Regression', lr_mse, lr_rmse, lr_r2),
    ('Polynomial Regression', pr_mse, pr_rmse, pr_r2),
    ('Random Forest Regressor', rf_mse, rf_rmse, rf_r2),
    ('MLP Regressor', mlp_mse, mlp_rmse, mlp_r2),
]
performance = pd.DataFrame(test_rows, columns=['Model', 'MSE', 'RMSE', 'R²'])

print(performance.to_markdown(index=False))