In [8]:
# Cell 1,2: Data Preprocessing

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Load the cleaned dataset
data = pd.read_csv('cleaned_data.csv')

# Create 'is_paid' feature: 0 for free, 1 for paid
data['is_paid'] = data['Pricing'].apply(lambda x: 0 if x == 0.0 else 1)

# Extract 'Month' from 'Release Date'
data['Release Date'] = pd.to_datetime(data['Release Date'], errors='coerce')
data['Release Month'] = data['Release Date'].dt.month

# Filter to only paid games (is_paid == 1)
data_paid = data[data['is_paid'] == 1].reset_index(drop=True)

# Select features and target
features = ['Game Genre', 'Developer', 'Release Month', 'Pricing']
target = 'Rating'

X = data_paid[features]
y = data_paid[target]

# Define categorical and numerical features
categorical_features = ['Game Genre', 'Developer']
numerical_features = ['Release Month', 'Pricing']

# Preprocessing pipelines for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

# Apply preprocessing to the features
X_preprocessed = preprocessor.fit_transform(X)

# Convert the preprocessed features to a DataFrame
encoded_cat_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
encoded_num_features = numerical_features
all_features = list(encoded_cat_features) + encoded_num_features

X_preprocessed_df = pd.DataFrame(X_preprocessed.toarray(), columns=all_features)

# Cell 4: Split Paid Games Data

from sklearn.model_selection import train_test_split

# Split data into training and temporary sets (80% train, 20% temp)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_preprocessed_df, y, test_size=0.2, random_state=18
)

# Split temporary set into validation and test sets (50% each of temp -> 10% each of original)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=18
)

# Display the sizes of the splits
print('=== Paid Games Splits ===')
print(f'Training set size: {X_train.shape[0]} samples')
print(f'Validation set size: {X_valid.shape[0]} samples')
print(f'Test set size: {X_test.shape[0]} samples')


=== Paid Games Splits ===
Training set size: 32808 samples
Validation set size: 4101 samples
Test set size: 4101 samples


In [9]:
# Cell 3: Define Parameter Grids for Regression Models
import itertools
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Parameter grids for each model
param_grids = {
    'LinearRegression': {
        'fit_intercept': [True, False],
        'normalize': [True, False]
    },
    'PolynomialRegression': {
        'polynomialfeatures__degree': [2, 3, 4],
        'polynomialfeatures__include_bias': [False],
        'linearregression__fit_intercept': [True, False],
        'linearregression__normalize': [True, False]
    },
    'RandomForestRegressor': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    },
    'MLPRegressor': {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['constant', 'adaptive']
    }
}

In [10]:
# Cell 4: Define Cross-Validation Function for Regression
from sklearn.metrics import mean_squared_error, r2_score

def cross_validate_regression(model, X, y, k=5):
    fold_size = len(X) // k
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    scores = {'mse': [], 'rmse': [], 'r2': []}
    
    for fold in range(k):
        start = fold * fold_size
        end = start + fold_size if fold != k-1 else len(X)
        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        
        X_train_cv, y_train_cv = X[train_indices], y[train_indices]
        X_val_cv, y_val_cv = X[val_indices], y[val_indices]
        
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_val_cv)
        
        mse = mean_squared_error(y_val_cv, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_val_cv, y_pred)
        
        scores['mse'].append(mse)
        scores['rmse'].append(rmse)
        scores['r2'].append(r2)
        
    avg_scores = {metric: np.mean(values) for metric, values in scores.items()}
    return avg_scores

In [11]:
# Cell 5: Hyperparameter Tuning for Linear Regression

from sklearn.linear_model import LinearRegression
import itertools

best_lr_score = -np.inf
best_lr_params = {}
lr_results = []

# Updated to remove 'normalize' as it is deprecated
for fit_intercept in param_grids['LinearRegression']['fit_intercept']:
    model = LinearRegression(
        fit_intercept=fit_intercept
        # 'normalize' parameter removed
    )
    scores = cross_validate_regression(model, X_train, y_train, k=5)
    avg_r2 = scores['r2']
    lr_results.append({
        'fit_intercept': fit_intercept,
        'R2': avg_r2
    })
    if avg_r2 > best_lr_score:
        best_lr_score = avg_r2
        best_lr_params = {
            'fit_intercept': fit_intercept
        }

print('Best LinearRegression Params:', best_lr_params)
print('Best LinearRegression CV R2:', best_lr_score)

KeyError: "None of [Index([22360, 14755,  4851, 29398, 27381, 23040,  1611, 29109, 30680, 17118,\n       ...\n        7823,   657, 12932,  5455, 22938,  9876, 25378, 12481, 25907, 16966],\n      dtype='int32', length=26247)] are in the [columns]"

In [None]:
# Cell 6: Hyperparameter Tuning for Polynomial Regression
from sklearn.linear_model import LinearRegression

best_pr_score = -np.inf
best_pr_params = {}
pr_results = []

for degree, include_bias, fit_intercept, normalize in itertools.product(
    param_grids['PolynomialRegression']['polynomialfeatures__degree'],
    param_grids['PolynomialRegression']['polynomialfeatures__include_bias'],
    param_grids['PolynomialRegression']['linearregression__fit_intercept'],
    param_grids['PolynomialRegression']['linearregression__normalize']
):
    pipeline = Pipeline([
        ('polynomialfeatures', PolynomialFeatures(
            degree=degree,
            include_bias=include_bias
        )),
        ('linearregression', LinearRegression(
            fit_intercept=fit_intercept,
            normalize=normalize
        ))
    ])
    scores = cross_validate_regression(pipeline, X_train, y_train, k=5)
    avg_r2 = scores['r2']
    pr_results.append({
        'degree': degree,
        'include_bias': include_bias,
        'fit_intercept': fit_intercept,
        'normalize': normalize,
        'R2': avg_r2
    })
    if avg_r2 > best_pr_score:
        best_pr_score = avg_r2
        best_pr_params = {
            'degree': degree,
            'include_bias': include_bias,
            'fit_intercept': fit_intercept,
            'normalize': normalize
        }

print('Best PolynomialRegression Params:', best_pr_params)
print('Best PolynomialRegression CV R2:', best_pr_score)

In [None]:
# Cell 7: Hyperparameter Tuning for Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

best_rf_score = -np.inf
best_rf_params = {}
rf_results = []

for n_estimators, max_depth, min_samples_split in itertools.product(
    param_grids['RandomForestRegressor']['n_estimators'],
    param_grids['RandomForestRegressor']['max_depth'],
    param_grids['RandomForestRegressor']['min_samples_split']
):
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    scores = cross_validate_regression(model, X_train, y_train, k=5)
    avg_r2 = scores['r2']
    rf_results.append({
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'R2': avg_r2
    })
    if avg_r2 > best_rf_score:
        best_rf_score = avg_r2
        best_rf_params = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split
        }

print('Best RandomForestRegressor Params:', best_rf_params)
print('Best RandomForestRegressor CV R2:', best_rf_score)

In [None]:
# Cell 8: Hyperparameter Tuning for MLP Regressor
from sklearn.neural_network import MLPRegressor

best_mlp_score = -np.inf
best_mlp_params = {}
mlp_results = []

for hidden_layer_sizes, activation, solver, alpha, learning_rate in itertools.product(
    param_grids['MLPRegressor']['hidden_layer_sizes'],
    param_grids['MLPRegressor']['activation'],
    param_grids['MLPRegressor']['solver'],
    param_grids['MLPRegressor']['alpha'],
    param_grids['MLPRegressor']['learning_rate']
):
    model = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=500,
        random_state=42
    )
    try:
        scores = cross_validate_regression(model, X_train, y_train, k=5)
        avg_r2 = scores['r2']
        mlp_results.append({
            'hidden_layer_sizes': hidden_layer_sizes,
            'activation': activation,
            'solver': solver,
            'alpha': alpha,
            'learning_rate': learning_rate,
            'R2': avg_r2
        })
        if avg_r2 > best_mlp_score:
            best_mlp_score = avg_r2
            best_mlp_params = {
                'hidden_layer_sizes': hidden_layer_sizes,
                'activation': activation,
                'solver': solver,
                'alpha': alpha,
                'learning_rate': learning_rate
            }
    except Exception as e:
        print(f'Error with params {hidden_layer_sizes, activation, solver, alpha, learning_rate}: {e}')
        continue

print('Best MLPRegressor Params:', best_mlp_params)
print('Best MLPRegressor CV R2:', best_mlp_score)

In [None]:
# Cell 9: Train Best Models on Training Set and Evaluate on Test Set
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression
best_lr = LinearRegression(
    fit_intercept=best_lr_params['fit_intercept'],
    normalize=best_lr_params['normalize']
)
best_lr.fit(X_train, y_train)
lr_pred = best_lr.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_pred)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, lr_pred)

print('LinearRegression Test MSE:', lr_mse)
print('LinearRegression Test RMSE:', lr_rmse)
print('LinearRegression Test R2:', lr_r2)

# Polynomial Regression
best_pr = Pipeline([
    ('polynomialfeatures', PolynomialFeatures(
        degree=best_pr_params['degree'],
        include_bias=best_pr_params['include_bias']
    )),
    ('linearregression', LinearRegression(
        fit_intercept=best_pr_params['fit_intercept'],
        normalize=best_pr_params['normalize']
    ))
])
best_pr.fit(X_train, y_train)
pr_pred = best_pr.predict(X_test)
pr_mse = mean_squared_error(y_test, pr_pred)
pr_rmse = np.sqrt(pr_mse)
pr_r2 = r2_score(y_test, pr_pred)

print('PolynomialRegression Test MSE:', pr_mse)
print('PolynomialRegression Test RMSE:', pr_rmse)
print('PolynomialRegression Test R2:', pr_r2)

# Random Forest Regressor
best_rf = RandomForestRegressor(
    n_estimators=best_rf_params['n_estimators'],
    max_depth=best_rf_params['max_depth'],
    min_samples_split=best_rf_params['min_samples_split'],
    random_state=42
)
best_rf.fit(X_train, y_train)
rf_pred = best_rf.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_pred)

print('RandomForestRegressor Test MSE:', rf_mse)
print('RandomForestRegressor Test RMSE:', rf_rmse)
print('RandomForestRegressor Test R2:', rf_r2)

# MLP Regressor
best_mlp = MLPRegressor(
    hidden_layer_sizes=best_mlp_params['hidden_layer_sizes'],
    activation=best_mlp_params['activation'],
    solver=best_mlp_params['solver'],
    alpha=best_mlp_params['alpha'],
    learning_rate=best_mlp_params['learning_rate'],
    max_iter=500,
    random_state=42
)
best_mlp.fit(X_train, y_train)
mlp_pred = best_mlp.predict(X_test)
mlp_mse = mean_squared_error(y_test, mlp_pred)
mlp_rmse = np.sqrt(mlp_mse)
mlp_r2 = r2_score(y_test, mlp_pred)

print('MLPRegressor Test MSE:', mlp_mse)
print('MLPRegressor Test RMSE:', mlp_rmse)
print('MLPRegressor Test R2:', mlp_r2)

In [None]:
# Cell 10: Compare Model Performances
import pandas as pd

performance = pd.DataFrame({
    'Model': ['LinearRegression', 'PolynomialRegression', 'RandomForestRegressor', 'MLPRegressor'],
    'MSE': [lr_mse, pr_mse, rf_mse, mlp_mse],
    'RMSE': [lr_rmse, pr_rmse, rf_rmse, mlp_rmse],
    'R2': [lr_r2, pr_r2, rf_r2, mlp_r2]
})

print('Model Comparison:')
print(performance)

In [None]:
# Cell 11: Report Fine-Tuning Process and Model Performances

print("=== Hyperparameter Tuning Results ===\n")

print("1. **Linear Regression**")
print(f"   - Best Parameters: {best_lr_params}")
print(f"   - Best CV R² Score: {best_lr_score}\n")

print("2. **Polynomial Regression**")
print(f"   - Best Parameters: {best_pr_params}")
print(f"   - Best CV R² Score: {best_pr_score}\n")

print("3. **Random Forest Regressor**")
print(f"   - Best Parameters: {best_rf_params}")
print(f"   - Best CV R² Score: {best_rf_score}\n")

print("4. **MLP Regressor**")
print(f"   - Best Parameters: {best_mlp_params}")
print(f"   - Best CV R² Score: {best_mlp_score}\n")

print("=== Model Performance on Test Set ===\n")

performance = pd.DataFrame({
    'Model': ['Linear Regression', 'Polynomial Regression', 'Random Forest Regressor', 'MLP Regressor'],
    'MSE': [lr_mse, pr_mse, rf_mse, mlp_mse],
    'RMSE': [lr_rmse, pr_rmse, rf_rmse, mlp_rmse],
    'R²': [lr_r2, pr_r2, rf_r2, mlp_r2]
})

print(performance.to_markdown(index=False))