In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import pandas as pd
import numpy as np

# Load your dataset
DATA_PATH = '/content/Mathbert (1).xlsx'
TARGET_COLUMN = 'putting marks'
df = pd.read_excel(DATA_PATH)

# Assume your dataset has all the embeddings as features and TARGET_COLUMN as the target
X = df.drop(columns=[TARGET_COLUMN])  # Features
y = df[TARGET_COLUMN]  # Target variable

# Split the data into training and testing sets; the test split is only
# touched once, after cross-validation, by the final model.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features.
# NOTE(review): the scaler is fitted on the whole training split, so each
# validation fold shares scaling statistics with its training folds. Fit the
# scaler inside the loop if strictly leakage-free CV estimates are required.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Define the Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)

# Set up k-fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)


def regression_metrics(y_true, y_pred):
    """Return ``(MSE, RMSE, R2, MAPE)`` for one set of predictions.

    MAPE is expressed in percent. It is a relative error, so targets at or
    near zero can make it extremely large.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred) * 100,
    )


def report_fold_scores(title, scores_by_metric):
    """Print the per-fold values, mean and standard deviation of each metric.

    ``scores_by_metric`` maps a metric label (e.g. ``"MSE"``) to the list of
    values collected across the folds.
    """
    print(title)
    for metric_name, fold_values in scores_by_metric.items():
        print(f"{metric_name} scores for each fold: {fold_values}")
        print(f"Average {metric_name}: {np.mean(fold_values)}")
        print(f"Standard Deviation of {metric_name}: {np.std(fold_values)}\n")


# Per-fold metrics on the part of the data each fold model was fitted on
mse_scores_train = []
rmse_scores_train = []
r2_scores_train = []
mape_scores_train = []

# Per-fold metrics on the part of the data each fold model did NOT see
mse_scores_val = []
rmse_scores_val = []
r2_scores_val = []
mape_scores_val = []

# Hold-out test metrics; filled exactly once, by the final model
mse_scores_test = []
rmse_scores_test = []
r2_scores_test = []
mape_scores_test = []

# Perform k-fold cross-validation manually to compute metrics
for train_index, test_index in kf.split(X_train_full_scaled):
    # `*_test_fold` is the fold's held-out (validation) part of the training
    # split, not the final hold-out test set.
    X_train_fold, X_test_fold = X_train_full_scaled[train_index], X_train_full_scaled[test_index]
    y_train_fold, y_test_fold = y_train_full.iloc[train_index], y_train_full.iloc[test_index]

    # Fit the Gradient Boosting model on the training fold
    model.fit(X_train_fold, y_train_fold)

    # Predict each part once and reuse the predictions for every metric
    y_train_pred = model.predict(X_train_fold)
    y_test_pred = model.predict(X_test_fold)

    mse_train, rmse_train, r2_train, mape_train = regression_metrics(y_train_fold, y_train_pred)
    mse_val, rmse_val, r2_val, mape_val = regression_metrics(y_test_fold, y_test_pred)

    # Store the metrics
    mse_scores_train.append(mse_train)
    rmse_scores_train.append(rmse_train)
    r2_scores_train.append(r2_train)
    mape_scores_train.append(mape_train)

    mse_scores_val.append(mse_val)
    rmse_scores_val.append(rmse_val)
    r2_scores_val.append(r2_val)
    mape_scores_val.append(mape_val)

# Final model: refit on the complete training split (fit() starts from
# scratch because warm_start is left at its default) and score the hold-out
# test set exactly once, instead of reporting whichever fold happened to run
# last.
model.fit(X_train_full_scaled, y_train_full)
mse_test, rmse_test, r2_test, mape_test = regression_metrics(y_test, model.predict(X_test_scaled))

mse_scores_test.append(mse_test)
rmse_scores_test.append(rmse_test)
r2_scores_test.append(r2_test)
mape_scores_test.append(mape_test)

# Print the metrics for training folds
report_fold_scores("Training Set Metrics:", {
    "MSE": mse_scores_train,
    "RMSE": rmse_scores_train,
    "R2": r2_scores_train,
    "MAPE": mape_scores_train,
})

# Print the metrics for validation folds (the actual cross-validation estimate)
report_fold_scores("Validation Set Metrics:", {
    "MSE": mse_scores_val,
    "RMSE": rmse_scores_val,
    "R2": r2_scores_val,
    "MAPE": mape_scores_val,
})

# Print the metrics for the test set
print("Test Set Metrics:")
print(f"MSE: {mse_scores_test[-1]}")
print(f"RMSE: {rmse_scores_test[-1]}")
print(f"R2: {r2_scores_test[-1]}")
print(f"MAPE: {mape_scores_test[-1]}\n")

Training Set Metrics:
MSE scores for each fold: [0.1696674658683861, 0.1669805218789437, 0.20196922282223534, 0.17114507781120517, 0.16384988497534103, 0.1896200554742468, 0.14911592862894651, 0.18251846144226644, 0.1724747867481165, 0.1776755916346666]
Average MSE: 0.17450169972843543
Standard Deviation of MSE: 0.013822385161781449

RMSE scores for each fold: [0.41190710830038624, 0.40863250223023584, 0.4494098606197191, 0.41369684288281094, 0.40478375087859075, 0.4354538499935978, 0.3861553167171812, 0.42722179420327616, 0.41530083884831787, 0.4215158260785312]
Average RMSE: 0.41740776907526467
Standard Deviation of RMSE: 0.01650618199481346

R2 scores for each fold: [0.9540019352293204, 0.953993013136911, 0.943919538660406, 0.9536600721201043, 0.9562561930445139, 0.947964856487976, 0.9593466169529085, 0.9503579477875352, 0.952633511923874, 0.9514507087626743]
Average R2: 0.9523584394106225
Standard Deviation of R2: 0.004080618697662833

MAPE scores for each fold: [2.9325741608797936

In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import pandas as pd
import numpy as np

# Load your dataset
DATA_PATH = '/content/Codebert (1).xlsx'
TARGET_COLUMN = 'putting marks'
df = pd.read_excel(DATA_PATH)

# Assume your dataset has all the embeddings as features and TARGET_COLUMN as the target
X = df.drop(columns=[TARGET_COLUMN])  # Features
y = df[TARGET_COLUMN]  # Target variable

# Split the data into training and testing sets; the test split is only
# touched once, after cross-validation, by the final model.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features.
# NOTE(review): the scaler is fitted on the whole training split, so each
# validation fold shares scaling statistics with its training folds. Fit the
# scaler inside the loop if strictly leakage-free CV estimates are required.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Define the Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)

# Set up k-fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)


def regression_metrics(y_true, y_pred):
    """Return ``(MSE, RMSE, R2, MAPE)`` for one set of predictions.

    MAPE is expressed in percent. It is a relative error, so targets at or
    near zero can make it extremely large.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred) * 100,
    )


def report_fold_scores(title, scores_by_metric):
    """Print the per-fold values, mean and standard deviation of each metric.

    ``scores_by_metric`` maps a metric label (e.g. ``"MSE"``) to the list of
    values collected across the folds.
    """
    print(title)
    for metric_name, fold_values in scores_by_metric.items():
        print(f"{metric_name} scores for each fold: {fold_values}")
        print(f"Average {metric_name}: {np.mean(fold_values)}")
        print(f"Standard Deviation of {metric_name}: {np.std(fold_values)}\n")


# Per-fold metrics on the part of the data each fold model was fitted on
mse_scores_train = []
rmse_scores_train = []
r2_scores_train = []
mape_scores_train = []

# Per-fold metrics on the part of the data each fold model did NOT see
mse_scores_val = []
rmse_scores_val = []
r2_scores_val = []
mape_scores_val = []

# Hold-out test metrics; filled exactly once, by the final model
mse_scores_test = []
rmse_scores_test = []
r2_scores_test = []
mape_scores_test = []

# Perform k-fold cross-validation manually to compute metrics
for train_index, test_index in kf.split(X_train_full_scaled):
    # `*_test_fold` is the fold's held-out (validation) part of the training
    # split, not the final hold-out test set.
    X_train_fold, X_test_fold = X_train_full_scaled[train_index], X_train_full_scaled[test_index]
    y_train_fold, y_test_fold = y_train_full.iloc[train_index], y_train_full.iloc[test_index]

    # Fit the Gradient Boosting model on the training fold
    model.fit(X_train_fold, y_train_fold)

    # Predict each part once and reuse the predictions for every metric
    y_train_pred = model.predict(X_train_fold)
    y_test_pred = model.predict(X_test_fold)

    mse_train, rmse_train, r2_train, mape_train = regression_metrics(y_train_fold, y_train_pred)
    mse_val, rmse_val, r2_val, mape_val = regression_metrics(y_test_fold, y_test_pred)

    # Store the metrics
    mse_scores_train.append(mse_train)
    rmse_scores_train.append(rmse_train)
    r2_scores_train.append(r2_train)
    mape_scores_train.append(mape_train)

    mse_scores_val.append(mse_val)
    rmse_scores_val.append(rmse_val)
    r2_scores_val.append(r2_val)
    mape_scores_val.append(mape_val)

# Final model: refit on the complete training split (fit() starts from
# scratch because warm_start is left at its default) and score the hold-out
# test set exactly once, instead of reporting whichever fold happened to run
# last.
model.fit(X_train_full_scaled, y_train_full)
mse_test, rmse_test, r2_test, mape_test = regression_metrics(y_test, model.predict(X_test_scaled))

mse_scores_test.append(mse_test)
rmse_scores_test.append(rmse_test)
r2_scores_test.append(r2_test)
mape_scores_test.append(mape_test)

# Print the metrics for training folds
report_fold_scores("Training Set Metrics:", {
    "MSE": mse_scores_train,
    "RMSE": rmse_scores_train,
    "R2": r2_scores_train,
    "MAPE": mape_scores_train,
})

# Print the metrics for validation folds (the actual cross-validation estimate)
report_fold_scores("Validation Set Metrics:", {
    "MSE": mse_scores_val,
    "RMSE": rmse_scores_val,
    "R2": r2_scores_val,
    "MAPE": mape_scores_val,
})

# Print the metrics for the test set
print("Test Set Metrics:")
print(f"MSE: {mse_scores_test[-1]}")
print(f"RMSE: {rmse_scores_test[-1]}")
print(f"R2: {r2_scores_test[-1]}")
print(f"MAPE: {mape_scores_test[-1]}\n")

Training Set Metrics:
MSE scores for each fold: [0.15904549979698232, 0.1673228898610105, 0.1913634816379817, 0.1802277952106982, 0.15847890542125748, 0.18511063687368032, 0.1585136151538357, 0.18651086780377332, 0.1626022041457808, 0.1763707263483753]
Average MSE: 0.17255466222533758
Standard Deviation of MSE: 0.012181286640999406

RMSE scores for each fold: [0.3988050899837944, 0.40905120689347746, 0.4374511191413067, 0.4245324430602427, 0.3980940911659673, 0.43024485688231107, 0.3981376836646284, 0.43186904010796295, 0.40323963612941227, 0.41996514896878684]
Average RMSE: 0.41513903159978904
Standard Deviation of RMSE: 0.014637167339583063

R2 scores for each fold: [0.9568816262817194, 0.9538986828576901, 0.9468644173411779, 0.9512007991183398, 0.9576901097836746, 0.9492023218154141, 0.9567845315099544, 0.9492720782076298, 0.9553448042520508, 0.9518072590587887]
Average R2: 0.9528946630226439
Standard Deviation of R2: 0.003580593526319406

MAPE scores for each fold: [2.8353498292093

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import pandas as pd
import numpy as np

# Load your dataset
DATA_PATH = '/content/drive/MyDrive/T5.xlsx'
TARGET_COLUMN = 'putting marks'
df = pd.read_excel(DATA_PATH)

# Assume your dataset has all the embeddings as features and TARGET_COLUMN as the target
X = df.drop(columns=[TARGET_COLUMN])  # Features
y = df[TARGET_COLUMN]  # Target variable

# Split the data into training and testing sets; the test split is only
# touched once, after cross-validation, by the final model.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features.
# NOTE(review): the scaler is fitted on the whole training split, so each
# validation fold shares scaling statistics with its training folds. Fit the
# scaler inside the loop if strictly leakage-free CV estimates are required.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Define the Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)

# Set up k-fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)


def regression_metrics(y_true, y_pred):
    """Return ``(MSE, RMSE, R2, MAPE)`` for one set of predictions.

    MAPE is expressed in percent. It is a relative error, so targets at or
    near zero can make it extremely large.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred) * 100,
    )


def report_fold_scores(title, scores_by_metric):
    """Print the per-fold values, mean and standard deviation of each metric.

    ``scores_by_metric`` maps a metric label (e.g. ``"MSE"``) to the list of
    values collected across the folds.
    """
    print(title)
    for metric_name, fold_values in scores_by_metric.items():
        print(f"{metric_name} scores for each fold: {fold_values}")
        print(f"Average {metric_name}: {np.mean(fold_values)}")
        print(f"Standard Deviation of {metric_name}: {np.std(fold_values)}\n")


# Per-fold metrics on the part of the data each fold model was fitted on
mse_scores_train = []
rmse_scores_train = []
r2_scores_train = []
mape_scores_train = []

# Per-fold metrics on the part of the data each fold model did NOT see
mse_scores_val = []
rmse_scores_val = []
r2_scores_val = []
mape_scores_val = []

# Hold-out test metrics; filled exactly once, by the final model
mse_scores_test = []
rmse_scores_test = []
r2_scores_test = []
mape_scores_test = []

# Perform k-fold cross-validation manually to compute metrics
for train_index, test_index in kf.split(X_train_full_scaled):
    # `*_test_fold` is the fold's held-out (validation) part of the training
    # split, not the final hold-out test set.
    X_train_fold, X_test_fold = X_train_full_scaled[train_index], X_train_full_scaled[test_index]
    y_train_fold, y_test_fold = y_train_full.iloc[train_index], y_train_full.iloc[test_index]

    # Fit the Gradient Boosting model on the training fold
    model.fit(X_train_fold, y_train_fold)

    # Predict each part once and reuse the predictions for every metric
    y_train_pred = model.predict(X_train_fold)
    y_test_pred = model.predict(X_test_fold)

    mse_train, rmse_train, r2_train, mape_train = regression_metrics(y_train_fold, y_train_pred)
    mse_val, rmse_val, r2_val, mape_val = regression_metrics(y_test_fold, y_test_pred)

    # Store the metrics
    mse_scores_train.append(mse_train)
    rmse_scores_train.append(rmse_train)
    r2_scores_train.append(r2_train)
    mape_scores_train.append(mape_train)

    mse_scores_val.append(mse_val)
    rmse_scores_val.append(rmse_val)
    r2_scores_val.append(r2_val)
    mape_scores_val.append(mape_val)

# Final model: refit on the complete training split (fit() starts from
# scratch because warm_start is left at its default) and score the hold-out
# test set exactly once, instead of reporting whichever fold happened to run
# last.
model.fit(X_train_full_scaled, y_train_full)
mse_test, rmse_test, r2_test, mape_test = regression_metrics(y_test, model.predict(X_test_scaled))

mse_scores_test.append(mse_test)
rmse_scores_test.append(rmse_test)
r2_scores_test.append(r2_test)
mape_scores_test.append(mape_test)

# Print the metrics for training folds
report_fold_scores("Training Set Metrics:", {
    "MSE": mse_scores_train,
    "RMSE": rmse_scores_train,
    "R2": r2_scores_train,
    "MAPE": mape_scores_train,
})

# Print the metrics for validation folds (the actual cross-validation estimate)
report_fold_scores("Validation Set Metrics:", {
    "MSE": mse_scores_val,
    "RMSE": rmse_scores_val,
    "R2": r2_scores_val,
    "MAPE": mape_scores_val,
})

# Print the metrics for the test set
print("Test Set Metrics:")
print(f"MSE: {mse_scores_test[-1]}")
print(f"RMSE: {rmse_scores_test[-1]}")
print(f"R2: {r2_scores_test[-1]}")
print(f"MAPE: {mape_scores_test[-1]}\n")

Training Set Metrics:
MSE scores for each fold: [0.17841163956750616, 0.2101339532174764, 0.19549727506136855, 0.21571766509174617, 0.19607361066487997, 0.2278686857669623, 0.1859783782318628, 0.22355517205755254, 0.2111420094324182, 0.2093979046341578]
Average MSE: 0.20537762937259307
Standard Deviation of MSE: 0.015175249711486895

RMSE scores for each fold: [0.4223880201515026, 0.4584037011385013, 0.44215073794054505, 0.46445415822419567, 0.44280199939124026, 0.477355931949067, 0.4312521051912243, 0.4728162138268447, 0.4595019145035396, 0.4576001580355472]
Average RMSE: 0.4528724940352208
Standard Deviation of RMSE: 0.016856260525751683

R2 scores for each fold: [0.9516313271335397, 0.9421032470351592, 0.9457165937320821, 0.9415914195686174, 0.9476532670420827, 0.9374687464565657, 0.9492968302028258, 0.9391966301055087, 0.9420143914324385, 0.942782687464056]
Average R2: 0.9439455140172877
Standard Deviation of R2: 0.004279135295887209

MAPE scores for each fold: [3.3817433942792024e