# In [7]:  (notebook cell marker, kept as a comment so the file parses as Python)
# Machine Learning for Genomic Data Classification
# -------------------------------------------------
# This script demonstrates how to classify genomic data
# using Random Forest and SVM with basic optimization.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# -----------------------------
# 1. Load the genomic dataset
# -----------------------------
# Example CSV format:
# Feature1,Feature2,Feature3,...,Label
# 0.12,0.56,0.33,...,Cancer
# 0.34,0.22,0.76,...,Normal

# Read the whole table into a DataFrame. The path is relative to the current
# working directory, so run the script from the folder that holds the CSV.
df = pd.read_csv("genomic_data.csv")   # replace with your file name

# Status line; the check-mark is written as a real Unicode character so it
# is not printed as garbled bytes.
print("✅ Data Loaded Successfully")

# Preview the first rows as a quick sanity check of the parsed columns.
print(df.head())

# -----------------------------
# 2. Split into features & labels
# -----------------------------
# The "Label" column is the prediction target; every remaining column is
# treated as an input feature.
y = df["Label"]
X = df.drop(columns=["Label"])

# -----------------------------
# 3. Train-test split
# -----------------------------
# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# -----------------------------
# 4. Feature Scaling
# -----------------------------
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both partitions so that no
# statistics from the test rows influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# 5. Random Forest Classifier
# -----------------------------
# Base estimator; the fixed seed makes forest construction reproducible.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 2 x 3 x 2 = 12 candidate settings.
rf_params = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}

# Exhaustive search with 3-fold cross-validation on the training split,
# ranked by accuracy; n_jobs=-1 runs the candidate fits on all CPU cores.
rf_grid = GridSearchCV(rf, rf_params, cv=3, n_jobs=-1, scoring="accuracy")
rf_grid.fit(X_train_scaled, y_train)

# best_estimator_ is the winning configuration refit on the whole training
# split; it is evaluated exactly once on the held-out test rows.
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test_scaled)

# Report. The heading emoji is written as a real Unicode character so it is
# not printed as garbled bytes.
print("\n🌲 Random Forest Results")
print("Best Parameters:", rf_grid.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# -----------------------------
# 6. Support Vector Machine (SVM)
# -----------------------------
# Base estimator and the hyperparameter grid to explore
# (3 x 2 x 2 = 12 candidate settings).
svm = SVC(random_state=42)
svm_params = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
}

# Tune the SVM inside a pipeline that owns its own scaler. During
# cross-validation the scaler is then re-fit on the training portion of each
# fold, so the validation portion never contributes to the scaling statistics
# used for model selection. (Fitting the search on data that was scaled once
# up front would leak that information into every fold.)
svm_pipeline = Pipeline([("scaler", StandardScaler()), ("svm", svm)])

# Pipeline parameters are addressed as "<step>__<param>"; derive the prefixed
# grid from svm_params so the plain grid above stays the single source of truth.
svm_pipeline_params = {
    f"svm__{name}": values for name, values in svm_params.items()
}

# Exhaustive search with 3-fold cross-validation on the *unscaled* training
# split (the pipeline does the scaling), ranked by accuracy.
svm_grid = GridSearchCV(
    svm_pipeline, svm_pipeline_params, cv=3, n_jobs=-1, scoring="accuracy"
)
svm_grid.fit(X_train, y_train)

# The refit pipeline scales with statistics from the whole training split and
# then classifies, so it is given the raw test features.
svm_best = svm_grid.best_estimator_
y_pred_svm = svm_best.predict(X_test)

# Strip the "svm__" step prefix so the report shows plain SVC parameter names.
svm_best_params = {
    name.split("__", 1)[1]: value
    for name, value in svm_grid.best_params_.items()
}

# Report. The heading emoji is written as a real Unicode character so it is
# not printed as garbled bytes.
print("\n⚙️ SVM Results")
print("Best Parameters:", svm_best_params)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

# -----------------------------
# 7. Compare Models
# -----------------------------
# Side-by-side test-set accuracy for the two tuned models, shown to three
# decimal places. The heading emoji is written as a real Unicode character so
# it is not printed as garbled bytes.
print("\n📊 Model Comparison:")
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.3f}")


# Stray notebook cell outputs (not produced by the script above), kept for reference:
# [8, 10]
# 0.21428571428571427
# [(0, 9)]
