<a href="https://colab.research.google.com/github/umeshasewwandi39/project-01/blob/main/GWAS_SNP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Remote location of the summary-statistics table for study GCST000392.
# NOTE(review): the saved output of this cell is "HTTP Error 404: Not Found",
# so this path could not be fetched when the notebook was last executed —
# verify it against the FTP directory listing before running the later cells.
url = (
    'https://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/'
    'GCST000001-GCST001000/GCST000392/GCST000392_buildGRCh37.tsv.gz'
)

# Read the gzip-compressed, tab-separated file straight from the URL.
df = pd.read_csv(
    url,
    sep='\t',
    compression='gzip',
)

# Print the column index, then preview the first rows as the cell's output.
print(df.columns)
df.head()

HTTPError: HTTP Error 404: Not Found

In [None]:
# Step 2: Preprocessing
# Build the model inputs from the loaded table `df`: three per-variant summary
# statistics as features and a binary label derived from the p-value.
features = ['beta', 'standard_error', 'effect_allele_frequency']

# Rows with p_value below this cut-off are labelled 1, all others 0.
P_VALUE_THRESHOLD = 5e-8

# Fail early with one readable message if the table lacks a column we need.
required_columns = features + ['p_value']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Summary-statistics table is missing columns: {missing_columns}")

# Drop rows only when a column we actually use is missing. A bare dropna()
# removes every row that has a NaN in *any* column — including columns the
# models never read — which can silently discard most or all of the table.
# .copy() gives an independent frame so adding `label` cannot touch a view.
df = df.dropna(subset=required_columns).copy()
df['label'] = (df['p_value'] < P_VALUE_THRESHOLD).astype(int)

# NOTE(review): p_value is presumably computed from beta / standard_error, in
# which case the label is almost a deterministic function of the features —
# confirm this is the intended prediction task.
X = df[features]
y = df['label']

In [None]:
# Step 3: Classic ML Models
# Fit a random forest and a gradient-boosted classifier on an 80/20 split of
# (X, y) and report each model's ROC AUC on the held-out 20%.
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# One seed for the split and both models so Restart & Run All reproduces the scores.
SEED = 42

# stratify=y keeps the positive/negative ratio equal in both partitions; without
# it a rare positive class can be absent from one side, and roc_auc_score
# raises when y_test contains a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the positive class (label 1).
print("Random Forest AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))

# use_label_encoder is not passed: current XGBoost releases no longer use it
# and only emit a warning when it is supplied.
xgb = XGBClassifier(eval_metric='logloss', random_state=SEED)
xgb.fit(X_train, y_train)
print("XGBoost AUC:", roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1]))

In [None]:
# Step 4: Deep Learning - CNN Model
# Train a small 1-D convolutional network on the same train/test partitions
# used by the classic models.
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, Input

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat on a fresh run.
tf.keras.utils.set_random_seed(42)

# Conv1D expects (samples, steps, channels): each row becomes a sequence of
# n_features steps with a single channel.
n_features = X_train.shape[1]
X_train_cnn = X_train.values.reshape(-1, n_features, 1)
X_test_cnn = X_test.values.reshape(-1, X_test.shape[1], 1)

model = Sequential([
    # Explicit Input layer instead of input_shape= on the first layer.
    Input(shape=(n_features, 1)),
    Conv1D(32, 2, activation='relu'),
    Dropout(0.3),
    Flatten(),
    Dense(64, activation='relu'),
    # Single sigmoid unit: output is the predicted probability of label 1.
    Dense(1, activation='sigmoid')
])
# Naming the AUC metric pins the history keys to 'auc' / 'val_auc'; an unnamed
# AUC() gets an auto-incremented name (auc_1, auc_2, ...) each time this cell
# is re-run in the same kernel.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)
# NOTE(review): validation_data is the X_test / y_test split, so the val_*
# metrics are not an independent estimate if that split is also used for the
# final evaluation — confirm this is intended.
model.fit(X_train_cnn, y_train, epochs=10, validation_data=(X_test_cnn, y_test))

In [None]:
# Step 5: Feature Importance
import matplotlib.pyplot as plt

# One horizontal bar per entry in `features`; bar length is the matching value
# from the fitted random forest's feature_importances_ array.
fig, ax = plt.subplots()
ax.barh(features, rf.feature_importances_)
ax.set_title('Feature Importance (Random Forest)')
plt.show()