<a href="https://colab.research.google.com/github/zeno1406/MachineLearning/blob/main/Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [62]:
#import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import randint as sp_randint, uniform as sp_uniform
import warnings
warnings.filterwarnings('ignore')

In [34]:
df = pd.read_csv('/content/drive/MyDrive/student-mat.csv')

In [35]:
features = df[['failures', 'freetime', 'Medu', 'higher', 'studytime', 'schoolsup','famrel', 'Fjob', 'Mjob', 'traveltime','higher', 'Walc', 'Dalc']].copy()
target = df.copy()['G1']

In [63]:
# Tạo ColumnTransformer để áp dụng OneHotEncoder chỉ cho cột 'higher'
categorical_features = ['failures', 'freetime', 'Medu', 'higher', 'studytime', 'schoolsup','famrel', 'Fjob', 'Mjob', 'traveltime','higher', 'Walc', 'Dalc'] # Danh sách các cột cần one-hot encoding
features_encoded = pd.get_dummies(features, columns=categorical_features)
# Check for duplicate columns
if features_encoded.columns.duplicated().any():
    print("Warning: Duplicate columns detected")
    features_encoded = features_encoded.loc[:, ~features_encoded.columns.duplicated()]
features_encoded = features_encoded.astype(int)
# Feature scaling
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_encoded)
print(features_encoded.head())

   failures_0  failures_1  failures_2  failures_3  freetime_1  freetime_2  \
0           1           0           0           0           0           0   
1           1           0           0           0           0           0   
2           0           0           0           1           0           0   
3           1           0           0           0           0           1   
4           1           0           0           0           0           0   

   freetime_3  freetime_4  freetime_5  Medu_0  ...  Walc_1  Walc_2  Walc_3  \
0           1           0           0       0  ...       1       0       0   
1           1           0           0       0  ...       1       0       0   
2           1           0           0       0  ...       0       0       1   
3           0           0           0       0  ...       1       0       0   
4           1           0           0       0  ...       0       1       0   

   Walc_4  Walc_5  Dalc_1  Dalc_2  Dalc_3  Dalc_4  Dalc_5  
0       

In [66]:
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

In [67]:
# Initializing the models
rf = RandomForestRegressor(random_state=42)
xgbr = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
catbr = CatBoostRegressor(verbose=0, random_state=42)

# Defining hyperparameter grids for RandomizedSearchCV
param_dist_rf = {
    'n_estimators': sp_randint(100, 500),
    'max_depth': sp_randint(10, 50),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2']
}

param_dist_xgb = {
    'n_estimators': sp_randint(100, 500),
    'max_depth': sp_randint(3, 20),
    'learning_rate': sp_uniform(0.01, 0.3),
    'subsample': sp_uniform(0.6, 0.4),
    'colsample_bytree': sp_uniform(0.6, 0.4)
}

param_dist_cat = {
    'iterations': sp_randint(500, 2000),
    'depth': sp_randint(4, 10),
    'learning_rate': sp_uniform(0.01, 0.3),
    'l2_leaf_reg': sp_uniform(1, 10)
}

# RandomizedSearchCV for RandomForest
random_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_dist_rf, n_iter=100, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
random_rf.fit(X_train, y_train)
best_rf = random_rf.best_estimator_

# RandomizedSearchCV for XGBoost
random_xgb = RandomizedSearchCV(estimator=xgbr, param_distributions=param_dist_xgb, n_iter=100, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
random_xgb.fit(X_train, y_train)
best_xgb = random_xgb.best_estimator_

# RandomizedSearchCV for CatBoost
random_cat = RandomizedSearchCV(estimator=catbr, param_distributions=param_dist_cat, n_iter=100, cv=3, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
random_cat.fit(X_train, y_train)
best_cat = random_cat.best_estimator_

# Making predictions
y_pred_rf = best_rf.predict(X_test)
y_pred_xgb = best_xgb.predict(X_test)
y_pred_cat = best_cat.predict(X_test)

# Evaluating the models
mse_rf = mean_squared_error(y_test, y_pred_rf)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
mse_cat = mean_squared_error(y_test, y_pred_cat)

print(f"RandomForest Mean Squared Error: {mse_rf}")
print(f"XGBoost Mean Squared Error: {mse_xgb}")
print(f"CatBoost Mean Squared Error: {mse_cat}")

RandomForest Mean Squared Error: 11.060507147798235
XGBoost Mean Squared Error: 11.221929391356307
CatBoost Mean Squared Error: 10.445787749600305


In [68]:
# Function to plot learning curves on the test set
def plot_learning_curve_on_test(estimator, X_train, y_train, X_test, y_test, title):
    train_sizes = np.linspace(0.1, 1.0, 5)
    train_scores = []
    test_scores = []

    for train_size in train_sizes:
        X_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, train_size=train_size, random_state=42)
        estimator.fit(X_train_subset, y_train_subset)
        train_score = mean_squared_error(y_train_subset, estimator.predict(X_train_subset))
        test_score = mean_squared_error(y_test, estimator.predict(X_test))
        train_scores.append(train_score)
        test_scores.append(test_score)

    plt.figure()
    plt.plot(train_sizes, train_scores, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores, 'o-', color="g", label="Test score")
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Mean Squared Error")
    plt.legend(loc="best")
    plt.grid()
    plt.show()

# Plotting learning curves on the test set
plot_learning_curve_on_test(best_rf, X_train, y_train, X_test, y_test, "Learning Curve (RandomForest)")
plot_learning_curve_on_test(best_xgb, X_train, y_train, X_test, y_test, "Learning Curve (XGBoost)")
plot_learning_curve_on_test(best_cat, X_train, y_train, X_test, y_test, "Learning Curve (CatBoost)")

# Visualization for model comparison
models = ['RandomForest', 'XGBoost', 'CatBoost']
mses = [mse_rf, mse_xgb, mse_cat]

plt.figure(figsize=(10, 6))
plt.bar(models, mses, color=['blue', 'orange', 'green'])
plt.xlabel('Model')
plt.ylabel('Mean Squared Error')
plt.title('Model Comparison')
plt.show()

ValueError: train_size=1.0 should be either positive and smaller than the number of samples 316 or a float in the (0, 1) range