In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\Sandesh\Desktop\New folder\Dataset_corr.csv")
df

# Assuming the target column is in the phenotype dataset
X = df.drop('Phenotype', axis=1)
y = df['Phenotype']


# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Step 4: Define base learners
base_learners = [
    ('linear', LinearRegression()),
    ('tree', DecisionTreeRegressor(max_depth=5, random_state=42))
]

# Step 5: Ridge hyperparameter tuning using GridSearchCV
ridge_params = {'alpha': np.logspace(-3, 3, 50)}  # Standard range of Ridge alphas
ridge = Ridge()

grid_search = GridSearchCV(estimator=ridge, param_grid=ridge_params, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Best Ridge model after tuning
best_ridge = grid_search.best_estimator_
print(f"Best Ridge alpha: {grid_search.best_params_['alpha']}")

# Step 6: Create Stacking Regressor with tuned Ridge as meta-model
stacked_regressor = StackingRegressor(estimators=base_learners, final_estimator=best_ridge, cv=5)
stacked_regressor.fit(X_train, y_train)

# Step 7: Evaluate the model
y_pred_train = stacked_regressor.predict(X_train)
y_pred_test = stacked_regressor.predict(X_test)

# Compute metrics
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test)

# Step 8: Calculate Adjusted R²
def adjusted_r2(r2, n, p):
    return 1 - ((1 - r2) * (n - 1) / (n - p - 1))

n_train, p_train = X_train.shape
n_test, p_test = X_test.shape

adj_r2_train = adjusted_r2(r2_train, n_train, p_train)
adj_r2_test = adjusted_r2(r2_test, n_test, p_test)

# Print the results
print(f"R² (Train): {r2_train:.4f}")
print(f"R² (Test): {r2_test:.4f}")

print(f"RMSE (Train): {rmse_train:.4f}")
print(f"RMSE (Test): {rmse_test:.4f}")


Best Ridge alpha: 0.001
R² (Train): 0.9538
R² (Test): 0.9443
RMSE (Train): 0.2030
RMSE (Test): 0.0719


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\Sandesh\Desktop\New folder\Dataset_rfe.csv")
df

# Assuming the target column is in the phenotype dataset
X = df.drop('Phenotype', axis=1)
y = df['Phenotype']


# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Step 4: Define base learners
base_learners = [
    ('linear', LinearRegression()),
    ('tree', DecisionTreeRegressor(max_depth=5, random_state=42))
]

# Step 5: Ridge hyperparameter tuning using GridSearchCV
ridge_params = {'alpha': np.logspace(-3, 3, 50)}  # Standard range of Ridge alphas
ridge = Ridge()

grid_search = GridSearchCV(estimator=ridge, param_grid=ridge_params, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Best Ridge model after tuning
best_ridge = grid_search.best_estimator_
print(f"Best Ridge alpha: {grid_search.best_params_['alpha']}")

# Step 6: Create Stacking Regressor with tuned Ridge as meta-model
stacked_regressor = StackingRegressor(estimators=base_learners, final_estimator=best_ridge, cv=5)
stacked_regressor.fit(X_train, y_train)

# Step 7: Evaluate the model
y_pred_train = stacked_regressor.predict(X_train)
y_pred_test = stacked_regressor.predict(X_test)

# Compute metrics
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test)

# Step 8: Calculate Adjusted R²
def adjusted_r2(r2, n, p):
    return 1 - ((1 - r2) * (n - 1) / (n - p - 1))

n_train, p_train = X_train.shape
n_test, p_test = X_test.shape

adj_r2_train = adjusted_r2(r2_train, n_train, p_train)
adj_r2_test = adjusted_r2(r2_test, n_test, p_test)

# Print the results
print(f"R² (Train): {r2_train:.4f}")
print(f"R² (Test): {r2_test:.4f}")

print(f"RMSE (Train): {rmse_train:.4f}")
print(f"RMSE (Test): {rmse_test:.4f}")


Best Ridge alpha: 0.001
R² (Train): 0.8593
R² (Test): 0.9807
RMSE (Train): 0.3543
RMSE (Test): 0.0248


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the dataset
genotype_data = pd.read_excel(r"C:\Users\Sandesh\Desktop\New folder\k_feature_top_200_selected_features.xlsx")
phenotype_data = pd.read_csv(r"C:\Users\Sandesh\Desktop\New folder\phenotype.csv")

# Standardize the genotype dataset (optional, improves numerical stability)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
genotype_scaled = scaler.fit_transform(genotype_data)
phenotype_scaled = scaler.fit_transform(phenotype_data)


X = genotype_scaled
y = phenotype_scaled

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Assuming the target column is in the phenotype dataset
X = df.drop('Phenotype', axis=1)
y = df['Phenotype']


# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Step 4: Define base learners
base_learners = [
    ('linear', LinearRegression()),
    ('tree', DecisionTreeRegressor(max_depth=5, random_state=42))
]

# Step 5: Ridge hyperparameter tuning using GridSearchCV
ridge_params = {'alpha': np.logspace(-3, 3, 50)}  # Standard range of Ridge alphas
ridge = Ridge()

grid_search = GridSearchCV(estimator=ridge, param_grid=ridge_params, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Best Ridge model after tuning
best_ridge = grid_search.best_estimator_
print(f"Best Ridge alpha: {grid_search.best_params_['alpha']}")

# Step 6: Create Stacking Regressor with tuned Ridge as meta-model
stacked_regressor = StackingRegressor(estimators=base_learners, final_estimator=best_ridge, cv=5)
stacked_regressor.fit(X_train, y_train)

# Step 7: Evaluate the model
y_pred_train = stacked_regressor.predict(X_train)
y_pred_test = stacked_regressor.predict(X_test)

# Compute metrics
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test)

# Step 8: Calculate Adjusted R²
def adjusted_r2(r2, n, p):
    return 1 - ((1 - r2) * (n - 1) / (n - p - 1))

n_train, p_train = X_train.shape
n_test, p_test = X_test.shape

adj_r2_train = adjusted_r2(r2_train, n_train, p_train)
adj_r2_test = adjusted_r2(r2_test, n_test, p_test)

# Print the results
print(f"R² (Train): {r2_train:.4f}")
print(f"R² (Test): {r2_test:.4f}")

print(f"RMSE (Train): {rmse_train:.4f}")
print(f"RMSE (Test): {rmse_test:.4f}")


Best Ridge alpha: 0.001
R² (Train): 0.8593
R² (Test): 0.9807
RMSE (Train): 0.3543
RMSE (Test): 0.0248
