In [2]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.preprocessing import StandardScaler

# --- Configuration -----------------------------------------------------------
# NOTE(review): absolute, machine-specific folder. Point DATA_DIR at a
# project-relative directory before sharing this notebook.
DATA_DIR = Path(r"C:\Users\Sandesh\Desktop\New folder")
# Number of genes RFE keeps; used for the selector and for the report below.
N_FEATURES_TO_SELECT = 200

# Load genotype and phenotype datasets
genotype_data = pd.read_csv(DATA_DIR / "gene.csv")
phenotype_data = pd.read_csv(DATA_DIR / "phenotype.csv")

# Step 1: Standardize the genotype dataset (optional, improves numerical stability)
scaler = StandardScaler()
genotype_scaled = scaler.fit_transform(genotype_data)

# Step 2: Define the model for RFE (e.g., Linear Regression)
model = LinearRegression()

# Step 3: Perform RFE to keep N_FEATURES_TO_SELECT features
# NOTE(review): the scaler and RFE are fitted on every sample, using every
# phenotype value, while the train/test split only happens in a later cell.
# The held-out samples therefore influence which genes are kept, so the test
# scores reported further down are likely optimistic. Consider splitting first
# and fitting scaler + RFE on the training part only.
rfe = RFE(estimator=model, n_features_to_select=N_FEATURES_TO_SELECT)
genotype_selected = rfe.fit_transform(genotype_scaled, phenotype_data)

# Step 4: Retrieve the (sorted, 0-based) column indices of the selected features
selected_gene_indices = rfe.get_support(indices=True)

# Step 5: Print results
print("Shape of the reduced genotype dataset:", genotype_selected.shape)
print(f"Indices of the top {N_FEATURES_TO_SELECT} selected genes:", selected_gene_indices)


Shape of the reduced genotype dataset: (50, 200)
Indices of the top 200 selected genes: [   0    1    4   22  122  139  141  232  237  251  271  272  289  304
  338  359  380  429  452  471  542  558  560  561  563  574  596  670
  713  714  727  756  775  781  784  785  811  826  829  862  872  874
  877  880  885  888  898  904  917  921  925  930  936  967  981  998
 1004 1010 1011 1016 1021 1025 1031 1041 1042 1043 1045 1054 1060 1066
 1068 1086 1091 1097 1102 1123 1132 1134 1156 1157 1182 1187 1194 1197
 1199 1210 1227 1232 1236 1240 1242 1243 1247 1248 1249 1254 1258 1263
 1265 1303 1306 1311 1313 1317 1349 1353 1365 1376 1381 1404 1406 1408
 1411 1419 1433 1435 1436 1439 1445 1454 1459 1460 1461 1463 1469 1480
 1489 1493 1495 1504 1520 1527 1532 1535 1538 1541 1550 1556 1561 1569
 1570 1571 1590 1599 1608 1609 1610 1613 1622 1627 1633 1637 1638 1665
 1670 1672 1697 1703 1716 1726 1729 1730 1732 1737 1740 1744 1748 1765
 1771 1777 1794 1799 1800 1803 1804 1809 1819 1823 1832 1863

In [3]:
# Keep only the RFE-selected columns of the standardized genotype matrix and
# wrap them in a labelled DataFrame. Labels are 1-based: column index i -> "X{i+1}".
selected_gene_names = [f"X{i+1}" for i in selected_gene_indices]
genotype_final = pd.DataFrame(
    genotype_scaled[:, selected_gene_indices],
    columns=selected_gene_names,
)
genotype_final

# NOTE(review): the export below is disabled, and this frame has no Phenotype
# column, yet a later cell loads a Dataset_rfe.csv that does contain one.
# The step that produced that file is not shown here - TODO document/restore it.
# genotype_final.to_csv('Dataset_rfe.csv')

Unnamed: 0,X1,X2,X5,X23,X123,X140,X142,X233,X238,X252,...,X1918,X1920,X1927,X1930,X1963,X1974,X1977,X1982,X1989,X1997
0,-0.649031,0.119273,2.353556,0.908668,-0.454665,-0.740131,1.249409,0.801956,0.649709,0.26295,...,1.423379,0.720419,2.03709,0.141332,1.29657,-1.183504,0.442679,-0.413977,-0.602507,0.208378
1,-0.288666,-0.195187,1.408439,0.796089,0.086003,1.243292,1.870702,0.257438,0.861677,-0.876364,...,-0.850299,-0.297017,-0.425125,0.274113,-0.58304,0.04586,-1.100593,1.040791,-0.885349,-0.474962
2,1.663063,-0.211167,-0.273625,-0.89907,-1.028738,0.672719,-0.417953,-0.29985,-0.12173,0.434746,...,-1.620022,0.446556,0.982061,-0.775336,0.028276,-1.077046,0.522352,-0.442893,-0.314378,0.854413
3,0.039391,1.363528,0.588263,-1.352057,-0.981674,1.954665,0.016771,-1.125915,1.520529,0.808002,...,0.757219,0.752082,-0.81021,0.738596,-1.162766,1.510314,0.801964,0.263006,0.591696,-0.247326
4,0.103522,-0.415218,-0.432703,0.171238,-1.207863,0.156121,1.843358,0.701698,-0.695573,0.224966,...,0.101769,0.290557,-1.819871,-0.566678,0.182514,-0.100316,-0.033742,0.531822,1.785798,0.09565
5,1.833653,1.528495,-0.498711,-0.252496,0.32397,0.049837,-0.264955,0.875053,0.650891,2.1902,...,1.348671,1.195885,1.068708,0.29135,-0.797106,0.664587,1.230574,-0.029376,-0.807512,0.224586
6,0.465338,-1.891188,-0.831759,-0.946856,-2.071102,-0.609105,-0.87001,0.981235,-1.754213,-0.677879,...,-1.157614,1.079472,1.073554,0.157062,0.555005,0.225416,0.051247,-0.984781,-0.968501,0.660066
7,-1.417756,0.488879,-0.624923,0.096738,1.389318,-1.226313,0.484004,-1.581982,0.227847,0.686603,...,1.003838,-0.007859,0.373293,-0.286521,0.440929,-0.431658,-1.498035,0.829847,-0.809103,0.026859
8,-0.786913,-0.025162,1.769357,-1.273221,-0.260418,-0.277592,2.505378,0.875743,0.11375,-1.750412,...,-0.505517,-0.279675,-0.099817,-3.042619,-2.325575,-1.26279,-1.081312,1.308432,0.553487,0.17024
9,-0.523766,0.077574,-0.048522,0.172164,0.315686,-0.584802,-0.612856,-1.494527,-0.332617,2.764921,...,-0.124618,-1.921436,-0.720714,0.189774,-0.044217,0.799435,1.467394,-0.195531,-0.35632,-0.90601


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error

# Read the RFE-reduced dataset back from disk and display it.
# NOTE(review): absolute, machine-specific path - replace with a configurable
# project-relative location before sharing.
dataset_path = r"C:\Users\Sandesh\Desktop\New folder\Dataset_rfe.csv"
df = pd.read_csv(dataset_path)
df

Unnamed: 0,X1,X2,X5,X23,X123,X140,X142,X233,X238,X252,...,X1920,X1927,X1930,X1963,X1974,X1977,X1982,X1989,X1997,Phenotype
0,-0.649031,0.119273,2.353556,0.908668,-0.454665,-0.740131,1.249409,0.801956,0.649709,0.26295,...,0.720419,2.03709,0.141332,1.29657,-1.183504,0.442679,-0.413977,-0.602507,0.208378,-1.01344
1,-0.288666,-0.195187,1.408439,0.796089,0.086003,1.243292,1.870702,0.257438,0.861677,-0.876364,...,-0.297017,-0.425125,0.274113,-0.58304,0.04586,-1.100593,1.040791,-0.885349,-0.474962,-0.132362
2,1.663063,-0.211167,-0.273625,-0.89907,-1.028738,0.672719,-0.417953,-0.29985,-0.12173,0.434746,...,0.446556,0.982061,-0.775336,0.028276,-1.077046,0.522352,-0.442893,-0.314378,0.854413,1.168459
3,0.039391,1.363528,0.588263,-1.352057,-0.981674,1.954665,0.016771,-1.125915,1.520529,0.808002,...,0.752082,-0.81021,0.738596,-1.162766,1.510314,0.801964,0.263006,0.591696,-0.247326,-1.466054
4,0.103522,-0.415218,-0.432703,0.171238,-1.207863,0.156121,1.843358,0.701698,-0.695573,0.224966,...,0.290557,-1.819871,-0.566678,0.182514,-0.100316,-0.033742,0.531822,1.785798,0.09565,0.286618
5,1.833653,1.528495,-0.498711,-0.252496,0.32397,0.049837,-0.264955,0.875053,0.650891,2.1902,...,1.195885,1.068708,0.29135,-0.797106,0.664587,1.230574,-0.029376,-0.807512,0.224586,0.016954
6,0.465338,-1.891188,-0.831759,-0.946856,-2.071102,-0.609105,-0.87001,0.981235,-1.754213,-0.677879,...,1.079472,1.073554,0.157062,0.555005,0.225416,0.051247,-0.984781,-0.968501,0.660066,1.840866
7,-1.417756,0.488879,-0.624923,0.096738,1.389318,-1.226313,0.484004,-1.581982,0.227847,0.686603,...,-0.007859,0.373293,-0.286521,0.440929,-0.431658,-1.498035,0.829847,-0.809103,0.026859,-1.217927
8,-0.786913,-0.025162,1.769357,-1.273221,-0.260418,-0.277592,2.505378,0.875743,0.11375,-1.750412,...,-0.279675,-0.099817,-3.042619,-2.325575,-1.26279,-1.081312,1.308432,0.553487,0.17024,-0.658392
9,-0.523766,0.077574,-0.048522,0.172164,0.315686,-0.584802,-0.612856,-1.494527,-0.332617,2.764921,...,-1.921436,-0.720714,0.189774,-0.044217,0.799435,1.467394,-0.195531,-0.35632,-0.90601,-0.241915


In [5]:
# Print a structural summary of the loaded frame (index range, column count,
# dtypes, memory usage) as a sanity check before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Columns: 201 entries, X1 to Phenotype
dtypes: float64(201)
memory usage: 78.6 KB


In [6]:
# Target vector: the Phenotype column. Feature matrix: every remaining column.
y = df['Phenotype']
X = df.drop(columns='Phenotype')

In [7]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Ridge Regression baseline: library-default hyperparameters, no tuning.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the trailing expression displays the fitted model.
ridge = Ridge().fit(X_train, y_train)
ridge

In [10]:
# Evaluate the untuned Ridge model on both splits.
# Each metric is stored under a split-specific name (the same `_test` / `_train`
# suffixes the other evaluation cells use), so the training values no longer
# overwrite the test values.

# Testing Data Evaluation
y_predict_test = ridge.predict(X_test)

Mean_squared_error_test = mean_squared_error(y_test, y_predict_test)
print('Mean_squared_error (Test) =', Mean_squared_error_test)

Root_Mean_squared_error_test = np.sqrt(Mean_squared_error_test)
print('Root_Mean_squared_error (Test) =', Root_Mean_squared_error_test)

R2_score_test = r2_score(y_test, y_predict_test)
print('R2_score (Test) =', R2_score_test)


# Training Data Evaluation
y_predict_train = ridge.predict(X_train)

Mean_squared_error_train = mean_squared_error(y_train, y_predict_train)
print('Mean_squared_error (Train) =', Mean_squared_error_train)

Root_Mean_squared_error_train = np.sqrt(Mean_squared_error_train)
print('Root_Mean_squared_error (Train) =', Root_Mean_squared_error_train)

R2_score_train = r2_score(y_train, y_predict_train)
print('R2_score (Train) =', R2_score_train)


Mean_squared_error (Test) = 0.12354407213994298
Root_Mean_squared_error (Test) = 0.35148836700514424
R2_score (Test) = 0.9042535683917088
Mean_squared_error (Train) = 2.7192200584264077e-06
Root_Mean_squared_error (Train) = 0.0016490057787728967
R2_score (Train) = 0.9999969530577218


In [11]:
# Lasso Regression baseline: library-default hyperparameters, no tuning.
# Construction and fitting are chained; the trailing expression displays the fitted model.
lasso = Lasso().fit(X_train, y_train)
lasso

In [12]:
# Score the untuned Lasso model on the held-out and the training split.
# All predictions and metrics are computed first; the report is printed afterwards
# in the order Test (MSE, RMSE, R2) then Train (MSE, RMSE, R2).
# NOTE(review): the recorded output shows a train R2 of exactly 0.0, which suggests
# the default penalty shrinks the model to a constant prediction - confirm.

# Held-out split
y_predict_test = lasso.predict(X_test)
Mean_squared_error_test = mean_squared_error(y_test, y_predict_test)
Root_Mean_squared_error_test = np.sqrt(Mean_squared_error_test)
R2_score_test = r2_score(y_test, y_predict_test)

# Training split
y_predict_train = lasso.predict(X_train)
Mean_squared_error_train = mean_squared_error(y_train, y_predict_train)
Root_Mean_squared_error_train = np.sqrt(Mean_squared_error_train)
R2_score_train = r2_score(y_train, y_predict_train)

# Report
print('Mean_squared_error (Test) =', Mean_squared_error_test)
print('Root_Mean_squared_error (Test) =', Root_Mean_squared_error_test)
print('R2_score (Test) =', R2_score_test)
print('Mean_squared_error (Train) =', Mean_squared_error_train)
print('Root_Mean_squared_error (Train) =', Root_Mean_squared_error_train)
print('R2_score (Train) =', R2_score_train)

Mean_squared_error (Test) = 1.3116773154684755
Root_Mean_squared_error (Test) = 1.145284818492097
R2_score (Test) = -0.016547538075243784
Mean_squared_error (Train) = 0.8924422618557198
Root_Mean_squared_error (Train) = 0.9446916226238697
R2_score (Train) = 0.0


In [13]:
# Elastic Net Regression baseline: library-default hyperparameters, no tuning.
# Construction and fitting are chained; the trailing expression displays the fitted model.
elastic_net = ElasticNet().fit(X_train, y_train)
elastic_net

In [15]:
# Score the untuned Elastic Net model on the held-out and the training split.
# Metrics are derived per split, then printed from a single ordered report table.

# Held-out split
y_predict_test = elastic_net.predict(X_test)
Mean_squared_error_test = mean_squared_error(y_test, y_predict_test)
Root_Mean_squared_error_test = np.sqrt(Mean_squared_error_test)
R2_score_test = r2_score(y_test, y_predict_test)

# Training split
y_predict_train = elastic_net.predict(X_train)
Mean_squared_error_train = mean_squared_error(y_train, y_predict_train)
Root_Mean_squared_error_train = np.sqrt(Mean_squared_error_train)
R2_score_train = r2_score(y_train, y_predict_train)

# Report: Test block first, then Train block.
for caption, value in (
    ('Mean_squared_error (Test) =', Mean_squared_error_test),
    ('Root_Mean_squared_error (Test) =', Root_Mean_squared_error_test),
    ('R2_score (Test) =', R2_score_test),
    ('Mean_squared_error (Train) =', Mean_squared_error_train),
    ('Root_Mean_squared_error (Train) =', Root_Mean_squared_error_train),
    ('R2_score (Train) =', R2_score_train),
):
    print(caption, value)

Mean_squared_error (Test) = 1.1419669424124927
Root_Mean_squared_error (Test) = 1.0686285334074197
R2_score (Test) = 0.11497769292585358
Mean_squared_error (Train) = 0.7410410833842384
Root_Mean_squared_error (Train) = 0.8608374314493058
R2_score (Train) = 0.1696481497376222


In [17]:
# Hyperparameter tuning for Ridge Regression: 5-fold cross-validated grid search
# over 50 log-spaced alpha values between 1e-3 and 1e3, scored by R2.
# NOTE(review): the recorded best alpha (0.001) is the smallest value in this grid,
# so the optimum may lie below the searched range - consider widening it.
ridge = Ridge()
params = {"alpha": np.logspace(-3, 3, num=50)}
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring="r2",
    cv=5,
).fit(X_train, y_train)
ridge_cv

In [18]:
# Take the best estimator found by the grid search and report its alpha.
best_alpha = ridge_cv.best_params_["alpha"]
ridge_best = ridge_cv.best_estimator_
print("Best Alpha:", best_alpha)

Best Alpha: 0.001


In [19]:
# Score the grid-searched Ridge model (`ridge_best`) on the held-out and training split.
# All predictions and metrics are computed first; the report is printed afterwards
# in the order Test (MSE, RMSE, R2) then Train (MSE, RMSE, R2).

# Held-out split
y_predict_test = ridge_best.predict(X_test)
Mean_squared_error_test = mean_squared_error(y_test, y_predict_test)
Root_Mean_squared_error_test = np.sqrt(Mean_squared_error_test)
R2_score_test = r2_score(y_test, y_predict_test)

# Training split
y_predict_train = ridge_best.predict(X_train)
Mean_squared_error_train = mean_squared_error(y_train, y_predict_train)
Root_Mean_squared_error_train = np.sqrt(Mean_squared_error_train)
R2_score_train = r2_score(y_train, y_predict_train)

# Report
print('Mean_squared_error (Test) =', Mean_squared_error_test)
print('Root_Mean_squared_error (Test) =', Root_Mean_squared_error_test)
print('R2_score (Test) =', R2_score_test)
print('Mean_squared_error (Train) =', Mean_squared_error_train)
print('Root_Mean_squared_error (Train) =', Root_Mean_squared_error_train)
print('R2_score (Train) =', R2_score_train)

Mean_squared_error (Test) = 0.1224458993462506
Root_Mean_squared_error (Test) = 0.3499227048167218
R2_score (Test) = 0.9051046503130352
Mean_squared_error (Train) = 2.733327337238426e-12
Root_Mean_squared_error (Train) = 1.653277755623182e-06
R2_score (Train) = 0.9999999999969372


In [20]:
# Second Ridge grid search: 5-fold CV, R2 scoring, alpha stepped from 0.1 in
# increments of 0.1 (upper bound 1 excluded). This rebinds `ridge`, `params`
# and `ridge_cv` from the previous search.
# NOTE(review): the recorded best alpha (0.1) is again the smallest value in the grid.
ridge = Ridge()
alpha_grid = np.arange(0.1, 1, 0.1)
params = {'alpha': alpha_grid}
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring="r2",
    cv=5,
).fit(X_train, y_train)
ridge_cv

In [21]:
# Take the best estimator found by the second grid search and report its alpha.
best_alpha = ridge_cv.best_params_["alpha"]
ridge_best = ridge_cv.best_estimator_
print("Best Alpha:", best_alpha)

Best Alpha: 0.1


In [24]:
# Score the Ridge model from the second grid search on the held-out and training split.
# Metrics are derived per split, then printed from a single ordered report table.

# Held-out split
y_predict_test = ridge_best.predict(X_test)
Mean_squared_error_test = mean_squared_error(y_test, y_predict_test)
Root_Mean_squared_error_test = np.sqrt(Mean_squared_error_test)
R2_score_test = r2_score(y_test, y_predict_test)

# Training split
y_predict_train = ridge_best.predict(X_train)
Mean_squared_error_train = mean_squared_error(y_train, y_predict_train)
Root_Mean_squared_error_train = np.sqrt(Mean_squared_error_train)
R2_score_train = r2_score(y_train, y_predict_train)

# Report: Test block first, then Train block.
for caption, value in (
    ('Mean_squared_error (Test) =', Mean_squared_error_test),
    ('Root_Mean_squared_error (Test) =', Root_Mean_squared_error_test),
    ('R2_score (Test) =', R2_score_test),
    ('Mean_squared_error (Train) =', Mean_squared_error_train),
    ('Root_Mean_squared_error (Train) =', Root_Mean_squared_error_train),
    ('R2_score (Train) =', R2_score_train),
):
    print(caption, value)

Mean_squared_error (Test) = 0.12255478062468009
Root_Mean_squared_error (Test) = 0.35007824928818426
R2_score (Test) = 0.9050202675199316
Mean_squared_error (Train) = 2.731921055620855e-08
Root_Mean_squared_error (Train) = 0.00016528523998291122
R2_score (Train) = 0.99999996938826


In [25]:
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
# Bayesian Ridge Regression with the estimator's default settings, fitted on the training split.
# Construction and fitting are chained; the trailing expression displays the fitted model.
bayesian_ridge = BayesianRidge().fit(X_train, y_train)
bayesian_ridge

In [27]:
# Score the default Bayesian Ridge model on the held-out and the training split.
# All predictions and metrics are computed first; the report is printed afterwards
# in the order Test (MSE, RMSE, R2) then Train (MSE, RMSE, R2).

# Held-out split
y_predict_test = bayesian_ridge.predict(X_test)
Mean_squared_error_test = mean_squared_error(y_test, y_predict_test)
Root_Mean_squared_error_test = np.sqrt(Mean_squared_error_test)
R2_score_test = r2_score(y_test, y_predict_test)

# Training split
y_predict_train = bayesian_ridge.predict(X_train)
Mean_squared_error_train = mean_squared_error(y_train, y_predict_train)
Root_Mean_squared_error_train = np.sqrt(Mean_squared_error_train)
R2_score_train = r2_score(y_train, y_predict_train)

# Report
print('Mean_squared_error (Test) =', Mean_squared_error_test)
print('Root_Mean_squared_error (Test) =', Root_Mean_squared_error_test)
print('R2_score (Test) =', R2_score_test)
print('Mean_squared_error (Train) =', Mean_squared_error_train)
print('Root_Mean_squared_error (Train) =', Root_Mean_squared_error_train)
print('R2_score (Train) =', R2_score_train)


Mean_squared_error (Test) = 0.12244626168979855
Root_Mean_squared_error (Test) = 0.3499232225643199
R2_score (Test) = 0.9051043694974434
Mean_squared_error (Train) = 4.830919742636229e-12
Root_Mean_squared_error (Train) = 2.1979353363182066e-06
R2_score (Train) = 0.9999999999945869


In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import BayesianRidge
import numpy as np

# Define the model
bayesian_ridge = BayesianRidge()

# The iteration-cap hyperparameter is named `max_iter` from scikit-learn 1.3 on;
# the older name `n_iter` was deprecated in 1.3 and removed in 1.5. Use whichever
# name the installed estimator exposes so the grid is valid on old and new releases.
iteration_param = "max_iter" if "max_iter" in bayesian_ridge.get_params() else "n_iter"

# Define hyperparameters to tune
# Same candidate values for the four Gamma-prior hyperparameters.
prior_candidates = [1e-12, 1e-10, 1e-8, 1e-6, 1e-4, 1e-2]
param_grid = {
    'alpha_1': prior_candidates,
    'alpha_2': prior_candidates,
    'lambda_1': prior_candidates,
    'lambda_2': prior_candidates,
    iteration_param: [50, 100, 200, 300, 400, 500]
}
# Cost note: 6**5 = 7776 candidates x 5 folds = 38,880 fits.

# Define scoring metric
# greater_is_better=False makes the scorer return negated MSE, so "higher is better"
# still holds for the search; the sign is flipped back when reporting below.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=bayesian_ridge,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Display best parameters and corresponding score (cross-validated MSE)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_predict_test = best_model.predict(X_test)


Best Parameters: {'alpha_1': 0.01, 'alpha_2': 1e-12, 'lambda_1': 1e-08, 'lambda_2': 0.01, 'n_iter': 50}
Best Score: 0.08982110554156317




In [29]:
# Evaluate the grid-searched Bayesian Ridge model (`best_model`) on both splits.

# Testing Data Evaluation
# Recompute the test predictions here instead of relying on a `y_predict_test`
# left in the kernel: every evaluation cell rebinds that name, so re-running any
# of them before this cell would silently score a different model.
y_predict_test = best_model.predict(X_test)

Mean_squared_error_test = mean_squared_error(y_test, y_predict_test)
print('Mean_squared_error (Test) =', Mean_squared_error_test)

Root_Mean_squared_error_test = np.sqrt(Mean_squared_error_test)
print('Root_Mean_squared_error (Test) =', Root_Mean_squared_error_test)

R2_score_test = r2_score(y_test, y_predict_test)
print('R2_score (Test) =', R2_score_test)


# Training Data Evaluation
y_predict_train = best_model.predict(X_train)

Mean_squared_error_train = mean_squared_error(y_train, y_predict_train)
print('Mean_squared_error (Train) =', Mean_squared_error_train)

Root_Mean_squared_error_train = np.sqrt(Mean_squared_error_train)
print('Root_Mean_squared_error (Train) =', Root_Mean_squared_error_train)

R2_score_train = r2_score(y_train, y_predict_train)
print('R2_score (Train) =', R2_score_train)

Mean_squared_error (Test) = 0.12244479947677026
Root_Mean_squared_error (Test) = 0.3499211332240027
R2_score (Test) = 0.9051055027098857
Mean_squared_error (Train) = 3.1124535723732913e-24
Root_Mean_squared_error (Train) = 1.764214718330309e-12
R2_score (Train) = 1.0
