### MODEL PIPELINE

In [3]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingRegressor, VotingClassifier, StackingRegressor
)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from xgboost import XGBClassifier, XGBRegressor
import joblib

In [4]:
# ---------------------------------------------
# 1. Load and Copy Data
# ---------------------------------------------
df = pd.read_csv('gwas_main.csv')

In [None]:

input_cols = ['SNPS', 'LOCATION', 'RISK_ALLELE', 'MAPPED_GENE', 'RISK ALLELE FREQUENCY']
target_class = 'DISEASE/TRAIT'
target_reg1 = 'OR or BETA'
target_reg2 = 'polygenic_score'


In [None]:

class_counts = df[target_class].value_counts()
valid_classes = class_counts[class_counts > 2].index
df = df[df[target_class].isin(valid_classes)].reset_index(drop=True)


In [None]:

label_encoders = {}
for col in ['SNPS', 'LOCATION', 'RISK_ALLELE', 'MAPPED_GENE']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Encode Target Class
le_target = LabelEncoder()
df[target_class] = le_target.fit_transform(df[target_class].astype(str))
label_encoders['DISEASE/TRAIT'] = le_target


In [None]:

X = df[input_cols]
y_class = df[target_class]
y_reg1 = df[target_reg1]
y_reg2 = df[target_reg2]


In [9]:
# ---------------------------------------------
# 6. Stratified Train-Test Split
# ---------------------------------------------
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in sss.split(X, y_class):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_class_train, y_class_test = y_class.iloc[train_idx], y_class.iloc[test_idx]
    y_reg1_train, y_reg1_test = y_reg1.iloc[train_idx], y_reg1.iloc[test_idx]
    y_reg2_train, y_reg2_test = y_reg2.iloc[train_idx], y_reg2.iloc[test_idx]

In [10]:
# ---------------------------------------------
# 7. Classification Models
# ---------------------------------------------
print("\nðŸŽ¯ Classification Models:")

models_class = {
    "rf": RandomForestClassifier(n_estimators=100, random_state=42),
    "xgb": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "logreg": LogisticRegression(max_iter=500),
}

for name, model in models_class.items():
    model.fit(X_train, y_class_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_class_test, pred)
    print(f"{name.upper()} Accuracy: {acc:.4f}")
    joblib.dump(model, f"{name}_classifier.pkl")

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[(name, model) for name, model in models_class.items()],
    voting='soft'
)
voting_clf.fit(X_train, y_class_train)
voting_acc = accuracy_score(y_class_test, voting_clf.predict(X_test))
print(f"VotingClassifier Accuracy: {voting_acc:.4f}")
joblib.dump(voting_clf, "voting_classifier.pkl")


ðŸŽ¯ Classification Models:
RF Accuracy: 0.7596


Parameters: { "use_label_encoder" } are not used.



XGB Accuracy: 0.7760
LOGREG Accuracy: 0.7322


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



VotingClassifier Accuracy: 0.7650


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['voting_classifier.pkl']

In [11]:
# ---------------------------------------------
# 8. Regression Models (OR or BETA)
# ---------------------------------------------
print("\nðŸ“ˆ Regression Models (OR or BETA):")

models_reg = {
    "rf": RandomForestRegressor(n_estimators=100, random_state=42),
    "xgb": XGBRegressor(random_state=42),
    "linreg": LinearRegression(),
    "gbr": GradientBoostingRegressor(random_state=42),
}

for name, model in models_reg.items():
    model.fit(X_train, y_reg1_train)
    pred = model.predict(X_test)
    mse = mean_squared_error(y_reg1_test, pred)
    print(f"{name.upper()} MSE (OR or BETA): {mse:.4f}")
    joblib.dump(model, f"{name}_or_beta.pkl")

# Stacking Regressor (OR or BETA)
stack_reg1 = StackingRegressor(
    estimators=[(name, model) for name, model in models_reg.items()],
    final_estimator=LinearRegression()
)
stack_reg1.fit(X_train, y_reg1_train)
stack_pred1 = stack_reg1.predict(X_test)
mse_stack1 = mean_squared_error(y_reg1_test, stack_pred1)
print(f"StackingRegressor MSE (OR or BETA): {mse_stack1:.4f}")
joblib.dump(stack_reg1, "stacking_or_beta.pkl")



ðŸ“ˆ Regression Models (OR or BETA):
RF MSE (OR or BETA): 0.0047
XGB MSE (OR or BETA): 0.0052
LINREG MSE (OR or BETA): 0.0059
GBR MSE (OR or BETA): 0.0048
StackingRegressor MSE (OR or BETA): 0.0046


['stacking_or_beta.pkl']

In [12]:
# ---------------------------------------------
# 9. Regression Models (Polygenic Score)
# ---------------------------------------------
print("\nðŸ“ˆ Regression Models (Polygenic Score):")

for name, model in models_reg.items():
    model.fit(X_train, y_reg2_train)
    pred = model.predict(X_test)
    mse = mean_squared_error(y_reg2_test, pred)
    print(f"{name.upper()} MSE (Polygenic Score): {mse:.4f}")
    joblib.dump(model, f"{name}_polygenic.pkl")

# Stacking Regressor (Polygenic Score)
stack_reg2 = StackingRegressor(
    estimators=[(name, model) for name, model in models_reg.items()],
    final_estimator=LinearRegression()
)
stack_reg2.fit(X_train, y_reg2_train)
stack_pred2 = stack_reg2.predict(X_test)
mse_stack2 = mean_squared_error(y_reg2_test, stack_pred2)
print(f"StackingRegressor MSE (Polygenic Score): {mse_stack2:.4f}")
joblib.dump(stack_reg2, "stacking_polygenic.pkl")


ðŸ“ˆ Regression Models (Polygenic Score):
RF MSE (Polygenic Score): 34.3853
XGB MSE (Polygenic Score): 28.9873
LINREG MSE (Polygenic Score): 39.2213
GBR MSE (Polygenic Score): 27.1424
StackingRegressor MSE (Polygenic Score): 34.9880


['stacking_polygenic.pkl']

In [14]:
# ---------------------------------------------
# 10. Save Encoders
# ---------------------------------------------
joblib.dump(label_encoders, "label_encoders.pkl")

print("\nâœ… All models and encoders saved successfully.")
# ---------------------------------------------
# End of Script
# ---------------------------------------------


âœ… All models and encoders saved successfully.


In [15]:
# ---------------------------------------------
# 11. Final Summary of All Scores
# ---------------------------------------------
print("\nðŸ“Š FINAL MODEL PERFORMANCE SUMMARY\n" + "-"*40)

print("ðŸŽ¯ Classification Accuracies:")
for name, model in models_class.items():
    acc = accuracy_score(y_class_test, model.predict(X_test))
    print(f"{name.upper():<10}: {acc:.4f}")
voting_acc = accuracy_score(y_class_test, voting_clf.predict(X_test))
print(f"{'VOTING':<10}: {voting_acc:.4f}")

print("\nðŸ“ˆ Regression MSE (OR or BETA):")
for name, model in models_reg.items():
    mse = mean_squared_error(y_reg1_test, model.predict(X_test))
    print(f"{name.upper():<10}: {mse:.4f}")
print(f"{'STACKING':<10}: {mse_stack1:.4f}")

print("\nðŸ“ˆ Regression MSE (Polygenic Score):")
for name, model in models_reg.items():
    mse = mean_squared_error(y_reg2_test, model.predict(X_test))
    print(f"{name.upper():<10}: {mse:.4f}")
print(f"{'STACKING':<10}: {mse_stack2:.4f}")

print("\nâœ… Training Complete. Models and scores ready.")



ðŸ“Š FINAL MODEL PERFORMANCE SUMMARY
----------------------------------------
ðŸŽ¯ Classification Accuracies:
RF        : 0.7596
XGB       : 0.7760
LOGREG    : 0.7322
VOTING    : 0.7650

ðŸ“ˆ Regression MSE (OR or BETA):
RF        : 10.0725
XGB       : 6.0702
LINREG    : 2.6098
GBR       : 7.1612
STACKING  : 0.0046

ðŸ“ˆ Regression MSE (Polygenic Score):
RF        : 34.3853
XGB       : 28.9873
LINREG    : 39.2213
GBR       : 27.1424
STACKING  : 34.9880

âœ… Training Complete. Models and scores ready.
