In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import pandas as pd
import numpy as np

# Load the dataset: every column except 'putting marks' is treated as a feature,
# and 'putting marks' is the regression target.
df = pd.read_excel('/content/drive/MyDrive/ML Files/Mathbert (1).xlsx')

X = df.drop(columns=['putting marks'])  # Features
y = df['putting marks']  # Target variable

# Hold out 20% of the rows as a test set that is never used for fitting.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features with statistics learned from the training split only.
# NOTE(review): the scaler is fitted before the k-fold split below, so each
# validation fold's rows contribute to the scaling statistics.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Define the Random Forest model
model = RandomForestRegressor(random_state=42)

# Set up k-fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold metrics on the rows each fold model was fitted on (in-sample).
mse_scores_train = []
rmse_scores_train = []
r2_scores_train = []
mape_scores_train = []

# Per-fold metrics on the rows each fold model did NOT see (out-of-fold).
mse_scores_val = []
rmse_scores_val = []
r2_scores_val = []
mape_scores_val = []

# Held-out test metrics. Filled exactly once, after cross-validation, by the
# model refitted on the whole training split; kept as lists so `[-1]` still works.
mse_scores_test = []
rmse_scores_test = []
r2_scores_test = []
mape_scores_test = []

# NOTE(review): scikit-learn's MAPE divides by max(|y_true|, eps), so any zero in
# 'putting marks' inflates it to astronomically large values (the stored output of
# this notebook suggests that is happening). Confirm whether the target contains
# zeros; if it does, MAPE is not a meaningful metric for this data.

# Perform k-fold cross-validation manually to compute metrics
for train_index, val_index in kf.split(X_train_full_scaled):
    X_train_fold, X_val_fold = X_train_full_scaled[train_index], X_train_full_scaled[val_index]
    y_train_fold, y_val_fold = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    # Fit the Random Forest model on the training part of this fold
    model.fit(X_train_fold, y_train_fold)

    # Predict once for each partition of the fold
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    # In-sample metrics
    mse_train = mean_squared_error(y_train_fold, y_train_pred)
    mse_scores_train.append(mse_train)
    rmse_scores_train.append(np.sqrt(mse_train))
    r2_scores_train.append(r2_score(y_train_fold, y_train_pred))
    mape_scores_train.append(mean_absolute_percentage_error(y_train_fold, y_train_pred) * 100)

    # Out-of-fold metrics: this is the actual cross-validation estimate
    mse_val = mean_squared_error(y_val_fold, y_val_pred)
    mse_scores_val.append(mse_val)
    rmse_scores_val.append(np.sqrt(mse_val))
    r2_scores_val.append(r2_score(y_val_fold, y_val_pred))
    mape_scores_val.append(mean_absolute_percentage_error(y_val_fold, y_val_pred) * 100)

# Refit on the complete training split and score the held-out test set exactly once,
# so the reported test numbers do not depend on whichever fold happened to run last.
model.fit(X_train_full_scaled, y_train_full)
y_test_pred = model.predict(X_test_scaled)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100

mse_scores_test.append(mse_test)
rmse_scores_test.append(rmse_test)
r2_scores_test.append(r2_test)
mape_scores_test.append(mape_test)

# Print the metrics for training folds
print("Training Set Metrics:")
for label, scores in (
    ("MSE", mse_scores_train),
    ("RMSE", rmse_scores_train),
    ("R2", r2_scores_train),
    ("MAPE", mape_scores_train),
):
    print(f"{label} scores for each fold: {scores}")
    print(f"Average {label}: {np.mean(scores)}")
    print(f"Standard Deviation of {label}: {np.std(scores)}\n")

# Print the metrics for the validation folds (cross-validation estimate)
print("Validation Fold Metrics:")
for label, scores in (
    ("MSE", mse_scores_val),
    ("RMSE", rmse_scores_val),
    ("R2", r2_scores_val),
    ("MAPE", mape_scores_val),
):
    print(f"{label} scores for each fold: {scores}")
    print(f"Average {label}: {np.mean(scores)}")
    print(f"Standard Deviation of {label}: {np.std(scores)}\n")

# Print the metrics for the test set (model refitted on the full training split)
print("Test Set Metrics:")
print(f"MSE: {mse_scores_test[-1]}")
print(f"RMSE: {rmse_scores_test[-1]}")
print(f"R2: {r2_scores_test[-1]}")
print(f"MAPE: {mape_scores_test[-1]}\n")


Training Set Metrics:
MSE scores for each fold: [0.2644276134305941, 0.27374698919723217, 0.30241015959761613, 0.28022349266709295, 0.2775846884076372, 0.29533017604725453, 0.2667379573078781, 0.28728630143030603, 0.27751935532852295, 0.2872382272538658]
Average MSE: 0.28125049606679997
Standard Deviation of MSE: 0.011386802404504468

RMSE scores for each fold: [0.5142252555355427, 0.5232083611690778, 0.5499183208419375, 0.5293614008095915, 0.5268630641899631, 0.5434428912473274, 0.5164668017480679, 0.5359909527504229, 0.5268010585871321, 0.5359461048033335]
Average RMSE: 0.5302224211682396
Standard Deviation of RMSE: 0.010708882168083677

R2 scores for each fold: [0.928311780767847, 0.9245763877481606, 0.9160302692309639, 0.9241255629053482, 0.9258918550639821, 0.9189561037960072, 0.9272793962969634, 0.9218627997253841, 0.9237852819945219, 0.9215130664761485]
Average R2: 0.9232332504005326
Standard Deviation of R2: 0.003565810049309265

MAPE scores for each fold: [5.749257045010287e+1

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import pandas as pd
import numpy as np

# Load the dataset: every column except 'putting marks' is treated as a feature,
# and 'putting marks' is the regression target.
df = pd.read_excel('/content/drive/MyDrive/ML Files/T5.xlsx')

X = df.drop(columns=['putting marks'])  # Features
y = df['putting marks']  # Target variable

# Hold out 20% of the rows as a test set that is never used for fitting.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features with statistics learned from the training split only.
# NOTE(review): the scaler is fitted before the k-fold split below, so each
# validation fold's rows contribute to the scaling statistics.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Define the Random Forest model
model = RandomForestRegressor(random_state=42)

# Set up k-fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold metrics on the rows each fold model was fitted on (in-sample).
mse_scores_train = []
rmse_scores_train = []
r2_scores_train = []
mape_scores_train = []

# Per-fold metrics on the rows each fold model did NOT see (out-of-fold).
mse_scores_val = []
rmse_scores_val = []
r2_scores_val = []
mape_scores_val = []

# Held-out test metrics. Filled exactly once, after cross-validation, by the
# model refitted on the whole training split; kept as lists so `[-1]` still works.
mse_scores_test = []
rmse_scores_test = []
r2_scores_test = []
mape_scores_test = []

# NOTE(review): scikit-learn's MAPE divides by max(|y_true|, eps), so any zero in
# 'putting marks' inflates it to astronomically large values (the stored output of
# this cell shows training MAPE around 5.8e+16). Confirm whether the target contains
# zeros; if it does, MAPE is not a meaningful metric for this data.

# Perform k-fold cross-validation manually to compute metrics
for train_index, val_index in kf.split(X_train_full_scaled):
    X_train_fold, X_val_fold = X_train_full_scaled[train_index], X_train_full_scaled[val_index]
    y_train_fold, y_val_fold = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    # Fit the Random Forest model on the training part of this fold
    model.fit(X_train_fold, y_train_fold)

    # Predict once for each partition of the fold
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    # In-sample metrics
    mse_train = mean_squared_error(y_train_fold, y_train_pred)
    mse_scores_train.append(mse_train)
    rmse_scores_train.append(np.sqrt(mse_train))
    r2_scores_train.append(r2_score(y_train_fold, y_train_pred))
    mape_scores_train.append(mean_absolute_percentage_error(y_train_fold, y_train_pred) * 100)

    # Out-of-fold metrics: this is the actual cross-validation estimate
    mse_val = mean_squared_error(y_val_fold, y_val_pred)
    mse_scores_val.append(mse_val)
    rmse_scores_val.append(np.sqrt(mse_val))
    r2_scores_val.append(r2_score(y_val_fold, y_val_pred))
    mape_scores_val.append(mean_absolute_percentage_error(y_val_fold, y_val_pred) * 100)

# Refit on the complete training split and score the held-out test set exactly once,
# so the reported test numbers do not depend on whichever fold happened to run last.
model.fit(X_train_full_scaled, y_train_full)
y_test_pred = model.predict(X_test_scaled)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100

mse_scores_test.append(mse_test)
rmse_scores_test.append(rmse_test)
r2_scores_test.append(r2_test)
mape_scores_test.append(mape_test)

# Print the metrics for training folds
print("Training Set Metrics:")
for label, scores in (
    ("MSE", mse_scores_train),
    ("RMSE", rmse_scores_train),
    ("R2", r2_scores_train),
    ("MAPE", mape_scores_train),
):
    print(f"{label} scores for each fold: {scores}")
    print(f"Average {label}: {np.mean(scores)}")
    print(f"Standard Deviation of {label}: {np.std(scores)}\n")

# Print the metrics for the validation folds (cross-validation estimate)
print("Validation Fold Metrics:")
for label, scores in (
    ("MSE", mse_scores_val),
    ("RMSE", rmse_scores_val),
    ("R2", r2_scores_val),
    ("MAPE", mape_scores_val),
):
    print(f"{label} scores for each fold: {scores}")
    print(f"Average {label}: {np.mean(scores)}")
    print(f"Standard Deviation of {label}: {np.std(scores)}\n")

# Print the metrics for the test set (model refitted on the full training split)
print("Test Set Metrics:")
print(f"MSE: {mse_scores_test[-1]}")
print(f"RMSE: {rmse_scores_test[-1]}")
print(f"R2: {r2_scores_test[-1]}")
print(f"MAPE: {mape_scores_test[-1]}\n")


Training Set Metrics:
MSE scores for each fold: [0.2701532861652726, 0.3036061271061406, 0.2958722107205643, 0.30901175904524675, 0.29967364934846247, 0.3308034351708933, 0.2839618454117675, 0.3194369067572028, 0.31820834561985306, 0.29723554827312304]
Average MSE: 0.30279631136185264
Standard Deviation of MSE: 0.016873195506661377

RMSE scores for each fold: [0.5197627210230382, 0.5510046525267646, 0.5439413669878072, 0.5558882612947019, 0.5474245604176546, 0.5751551400890835, 0.5328807046720377, 0.5651874969929915, 0.5640995883883032, 0.5451931293341131]
Average RMSE: 0.5500537621726496
Standard Deviation of RMSE: 0.015400327320122684

R2 scores for each fold: [0.9267595098951875, 0.9163495062528793, 0.9178456507238285, 0.9163307364060288, 0.9199946568853679, 0.909221605381685, 0.9225836583762345, 0.9131183581182811, 0.9126109265434929, 0.9187813302523291]
Average R2: 0.9173595938835316
Standard Deviation of R2: 0.004834190907873833

MAPE scores for each fold: [5.801499155334782e+16,

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import pandas as pd
import numpy as np

# Load the dataset: every column except 'putting marks' is treated as a feature,
# and 'putting marks' is the regression target.
df = pd.read_excel('/content/drive/MyDrive/ML Files/Codebert (1).xlsx')

X = df.drop(columns=['putting marks'])  # Features
y = df['putting marks']  # Target variable

# Hold out 20% of the rows as a test set that is never used for fitting.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features with statistics learned from the training split only.
# NOTE(review): the scaler is fitted before the k-fold split below, so each
# validation fold's rows contribute to the scaling statistics.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Define the Random Forest model
model = RandomForestRegressor(random_state=42)

# Set up k-fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold metrics on the rows each fold model was fitted on (in-sample).
mse_scores_train = []
rmse_scores_train = []
r2_scores_train = []
mape_scores_train = []

# Per-fold metrics on the rows each fold model did NOT see (out-of-fold).
mse_scores_val = []
rmse_scores_val = []
r2_scores_val = []
mape_scores_val = []

# Held-out test metrics. Filled exactly once, after cross-validation, by the
# model refitted on the whole training split; kept as lists so `[-1]` still works.
mse_scores_test = []
rmse_scores_test = []
r2_scores_test = []
mape_scores_test = []

# NOTE(review): scikit-learn's MAPE divides by max(|y_true|, eps), so any zero in
# 'putting marks' inflates it to astronomically large values (the stored output of
# this cell shows training MAPE around 5.6e+16). Confirm whether the target contains
# zeros; if it does, MAPE is not a meaningful metric for this data.

# Perform k-fold cross-validation manually to compute metrics
for train_index, val_index in kf.split(X_train_full_scaled):
    X_train_fold, X_val_fold = X_train_full_scaled[train_index], X_train_full_scaled[val_index]
    y_train_fold, y_val_fold = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

    # Fit the Random Forest model on the training part of this fold
    model.fit(X_train_fold, y_train_fold)

    # Predict once for each partition of the fold
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    # In-sample metrics
    mse_train = mean_squared_error(y_train_fold, y_train_pred)
    mse_scores_train.append(mse_train)
    rmse_scores_train.append(np.sqrt(mse_train))
    r2_scores_train.append(r2_score(y_train_fold, y_train_pred))
    mape_scores_train.append(mean_absolute_percentage_error(y_train_fold, y_train_pred) * 100)

    # Out-of-fold metrics: this is the actual cross-validation estimate
    mse_val = mean_squared_error(y_val_fold, y_val_pred)
    mse_scores_val.append(mse_val)
    rmse_scores_val.append(np.sqrt(mse_val))
    r2_scores_val.append(r2_score(y_val_fold, y_val_pred))
    mape_scores_val.append(mean_absolute_percentage_error(y_val_fold, y_val_pred) * 100)

# Refit on the complete training split and score the held-out test set exactly once,
# so the reported test numbers do not depend on whichever fold happened to run last.
model.fit(X_train_full_scaled, y_train_full)
y_test_pred = model.predict(X_test_scaled)

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100

mse_scores_test.append(mse_test)
rmse_scores_test.append(rmse_test)
r2_scores_test.append(r2_test)
mape_scores_test.append(mape_test)

# Print the metrics for training folds
print("Training Set Metrics:")
for label, scores in (
    ("MSE", mse_scores_train),
    ("RMSE", rmse_scores_train),
    ("R2", r2_scores_train),
    ("MAPE", mape_scores_train),
):
    print(f"{label} scores for each fold: {scores}")
    print(f"Average {label}: {np.mean(scores)}")
    print(f"Standard Deviation of {label}: {np.std(scores)}\n")

# Print the metrics for the validation folds (cross-validation estimate)
print("Validation Fold Metrics:")
for label, scores in (
    ("MSE", mse_scores_val),
    ("RMSE", rmse_scores_val),
    ("R2", r2_scores_val),
    ("MAPE", mape_scores_val),
):
    print(f"{label} scores for each fold: {scores}")
    print(f"Average {label}: {np.mean(scores)}")
    print(f"Standard Deviation of {label}: {np.std(scores)}\n")

# Print the metrics for the test set (model refitted on the full training split)
print("Test Set Metrics:")
print(f"MSE: {mse_scores_test[-1]}")
print(f"RMSE: {rmse_scores_test[-1]}")
print(f"R2: {r2_scores_test[-1]}")
print(f"MAPE: {mape_scores_test[-1]}\n")


Training Set Metrics:
MSE scores for each fold: [0.2773872866349418, 0.2806490519902451, 0.3090657181506512, 0.29901270028418825, 0.28373172041093236, 0.30899914622121305, 0.27070089469410386, 0.29251850280335057, 0.2941433307039695, 0.29016049009229905]
Average MSE: 0.29063688419858946
Standard Deviation of MSE: 0.012179676029399542

RMSE scores for each fold: [0.5266756939853422, 0.5297632036959957, 0.5559367933053642, 0.5468205375479127, 0.5326647354677541, 0.5558769164313383, 0.5202892413783932, 0.5408497968968377, 0.5423498231805459, 0.5386654714127304]
Average RMSE: 0.5389892213302214
Standard Deviation of RMSE: 0.011291744259902608

R2 scores for each fold: [0.9247983205743481, 0.9226747101831775, 0.9141822312531409, 0.9190381216710928, 0.9242507517983813, 0.9152050932666378, 0.9261989831376505, 0.9204397260719336, 0.9192198649486737, 0.920714567434666]
Average R2: 0.9206722370339703
Standard Deviation of R2: 0.00375806547868231

MAPE scores for each fold: [5.642647550079113e+16