Machine Learning for Genomic Data. Task:
Apply machine learning algorithms, such as random forests or
support vector machines, to classify genomic data based on
specific features or markers. Deliverable: A comprehensive
analysis report presenting the classification results, model
performance evaluation, and insights into the predictive

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
df_train = pd.read_csv(r"F:\BE\My Cl2\B5\trainbio.csv")
df_test = pd.read_csv(r"F:\BE\My Cl2\B5\testbio.csv")

In [3]:
df_train.shape

(38, 7131)

In [4]:
df_test.shape

(34, 7131)

In [5]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,...,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at,cancer
0,0,-214,-153,-58,88,-295,-558,199,-176,252,...,511,-125,389,-37,793,329,36,191,-37,0
1,1,-139,-73,-1,283,-264,-400,-330,-168,101,...,837,-36,442,-17,782,295,11,76,-14,0
2,2,-76,-49,-307,309,-376,-650,33,-367,206,...,1199,33,168,52,1138,777,41,228,-41,0
3,3,-135,-114,265,12,-419,-585,158,-253,49,...,835,218,174,-110,627,170,-50,126,-91,0
4,4,-106,-125,-76,168,-230,-284,4,-122,70,...,649,57,504,-26,250,314,14,56,-25,0


In [6]:
x_train = df_train.iloc[:, :-1]  # Features (all columns except the last)
y_train = df_train.iloc[:, -1]   # Target (last column)

x_test = df_test.iloc[:, :-1]    # Features (all columns except the last)
y_test = df_test.iloc[:, -1]      # Target (last column)

In [7]:
x_train.head()

Unnamed: 0.1,Unnamed: 0,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
0,0,-214,-153,-58,88,-295,-558,199,-176,252,...,185,511,-125,389,-37,793,329,36,191,-37
1,1,-139,-73,-1,283,-264,-400,-330,-168,101,...,169,837,-36,442,-17,782,295,11,76,-14
2,2,-76,-49,-307,309,-376,-650,33,-367,206,...,315,1199,33,168,52,1138,777,41,228,-41
3,3,-135,-114,265,12,-419,-585,158,-253,49,...,240,835,218,174,-110,627,170,-50,126,-91
4,4,-106,-125,-76,168,-230,-284,4,-122,70,...,156,649,57,504,-26,250,314,14,56,-25


In [8]:
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: cancer, dtype: int64

In [9]:
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(x_train, y_train)

In [10]:
rf_pred = rf_model.predict(x_test)


In [11]:
print("Random Forest Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, rf_pred)}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))

Random Forest Model Performance:
Accuracy: 0.6470588235294118
Confusion Matrix:
[[18  2]
 [10  4]]

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.90      0.75        20
           1       0.67      0.29      0.40        14

    accuracy                           0.65        34
   macro avg       0.65      0.59      0.57        34
weighted avg       0.65      0.65      0.61        34



In [12]:
svm_model = SVC(random_state=0)
svm_model.fit(x_train, y_train)

svm_pred = svm_model.predict(x_test)

In [13]:
print("Support Vector Machine (SVM) Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, svm_pred)}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm_pred))
print("\nClassification Report:")
print(classification_report(y_test, svm_pred))

Support Vector Machine (SVM) Model Performance:
Accuracy: 0.6176470588235294
Confusion Matrix:
[[20  0]
 [13  1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.61      1.00      0.75        20
           1       1.00      0.07      0.13        14

    accuracy                           0.62        34
   macro avg       0.80      0.54      0.44        34
weighted avg       0.77      0.62      0.50        34



In [14]:
print(rf_model.feature_importances_)

[0.01 0.   0.   ... 0.   0.   0.  ]
