In [1]:
import gzip
import pandas as pd
import numpy as np
import json
import glob
import os
from sklearn.model_selection import GroupShuffleSplit 
from pre_processing import pre_processing_entire_kmer, add_labels, add_features, train_test_split

In [5]:
json_df = pre_processing_entire_kmer()

In [22]:
labelled_df = add_labels(json_df)

In [29]:
result_df = add_features(labelled_df)

In [None]:
result_df = pd.read_csv("C:/Users/wanfe/Desktop/DSA4266/Project2/labelled_w_features.csv")

In [5]:
train_x, train_y, test_x, test_y = train_test_split(result_df, random_state=42)

In [54]:
#save for future use
#result_df.to_csv('labelled_w_features.csv', index=False)
#train_x_resampled.to_csv('C:/Users/wanfe/Desktop/DSA4266/Project2/train_x_resampled.csv', index=False)
#train_y_resampled.to_csv('C:/Users/wanfe/Desktop/DSA4266/Project2/train_y_resampled.csv', index=False)

In [15]:
#Oversampling numeric features
from imblearn.over_sampling import SMOTE

#initialise
smote = SMOTE(sampling_strategy='auto')

#select only numeric cols
train_x_prep = train_x[['AA_count', 'AC_count', 'AG_count', 'AT_count', 'A_count', 'CA_count',
       'CC_count', 'CG_count', 'CT_count', 'C_count', 'GA_count', 'GC_count',
       'GG_count', 'GT_count', 'G_count', 'TA_count', 'TC_count', 'TG_count',
       'TT_count', 'T_count', 'dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3']]

#apply smote
train_x_resampled, train_y_resampled = smote.fit_resample(train_x_prep, train_y)

In [None]:
#check to see if data balanced
train_y_resampled.value_counts()

In [16]:
#Normalise
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

#select only the signal features to normalise
signal_features = ['dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3']

#apply
train_x_resampled[signal_features] = scaler.fit_transform(train_x_resampled[signal_features])

In [62]:
#XGB Model Training
import xgboost as xgb

# Create and train the XGBoost model
xgb_model = xgb.XGBClassifier()

#selected numeric features for xgb
selected_cols = ['AA_count', 'AC_count', 'AG_count', 'AT_count', 'A_count', 'CA_count',
       'CC_count', 'CG_count', 'CT_count', 'C_count', 'GA_count', 'GC_count',
       'GG_count', 'GT_count', 'G_count', 'TA_count', 'TC_count', 'TG_count',
       'TT_count', 'T_count', 'dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3']

#convert data type from object to numeric so it can feed into xgb
train_x_resampled[selected_cols] = train_x_resampled[selected_cols].apply(pd.to_numeric, errors='coerce')

#fit the model
xgb_model.fit(train_x_resampled[selected_cols], train_y_resampled)

In [70]:
#XGB Model Evaluation using ROC
from sklearn.metrics import accuracy_score, roc_auc_score

#predictions using test set
test_x[selected_cols] = test_x[selected_cols].apply(pd.to_numeric, errors='coerce')
new_test_x = test_x[selected_cols]
y_pred = xgb_model.predict(new_test_x)

#calculate acc and roc auc
accuracy = accuracy_score(test_y, y_pred)
roc_auc = roc_auc_score(test_y, xgb_model.predict_proba(new_test_x)[:, 1])

print(f'Accuracy: {accuracy:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')


Accuracy: 0.93
ROC AUC: 0.70


In [17]:
#XGB Model Training Without Necleotide and Dincleotide Counts
import xgboost as xgb

# Create and train the XGBoost model
xgb_wo = xgb.XGBClassifier()

#selected numeric features for xgb
selected_cols = ['dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3']

#convert datatype from object to numeric
train_x_resampled[selected_cols] = train_x_resampled[selected_cols].apply(pd.to_numeric, errors='coerce')

#fit
xgb_wo.fit(train_x_resampled[selected_cols], train_y_resampled)

In [20]:
#XGB Model Evaluation 
from sklearn.metrics import accuracy_score, roc_auc_score

#predictions using test set
test_x[selected_cols] = test_x[selected_cols].apply(pd.to_numeric, errors='coerce')
test_x_wo = test_x[selected_cols]
y_pred = xgb_wo.predict(test_x_wo)

#calculate acc and roc auc
accuracy = accuracy_score(test_y, y_pred)
roc_auc = roc_auc_score(test_y, xgb_wo.predict_proba(test_x_wo)[:, 1])

print(f'Accuracy: {accuracy:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')

Accuracy: 0.55
ROC AUC: 0.51


In [7]:
#function to do one hot encoding
def one_hot_encode_sequence(sequence):
    encoding = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
    one_hot_sequence = [encoding[n] for n in sequence]
    return one_hot_sequence

#apply one-hot encoding to the kmer_sequence column
train_x_onehot, test_x_onehot = train_x, test_x
train_x_onehot['kmer_sequence'] = train_x_onehot['kmer_sequence'].apply(one_hot_encode_sequence)
test_x_onehot['kmer_sequence'] = test_x_onehot['kmer_sequence'].apply(one_hot_encode_sequence)

#flattening
train_x_onehot = pd.concat([train_x_onehot, train_x_onehot['kmer_sequence'].apply(pd.Series)], axis=1)
test_x_onehot = pd.concat([test_x_onehot, test_x_onehot['kmer_sequence'].apply(pd.Series)], axis=1)


In [14]:
test_x_onehot['kmer_sequence'].head(1)

14986    [[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1,...
Name: kmer_sequence, dtype: object

In [8]:
#Normalise
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
signal_features = ['dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3']
train_x_onehot[signal_features] = scaler.fit_transform(train_x_onehot[signal_features])

In [None]:
#ONEHOT XGB Model Training
import xgboost as xgb

#Create and train the XGBoost model
xbg_onehot_model = xgb.XGBClassifier()

#selected numeric features for xgb
selected_cols = ['AA_count', 'AC_count', 'AG_count', 'AT_count', 'A_count', 'CA_count',
       'CC_count', 'CG_count', 'CT_count', 'C_count', 'GA_count', 'GC_count',
       'GG_count', 'GT_count', 'G_count', 'TA_count', 'TC_count', 'TG_count',
       'TT_count', 'T_count', 'dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3']

onehot_training = train_x_onehot[['AA_count', 'AC_count', 'AG_count', 'AT_count', 'A_count', 'CA_count',
       'CC_count', 'CG_count', 'CT_count', 'C_count', 'GA_count', 'GC_count',
       'GG_count', 'GT_count', 'G_count', 'TA_count', 'TC_count', 'TG_count',
       'TT_count', 'T_count', 'dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3', 'kmer_sequence']]
onehot_training[selected_cols] = onehot_training[selected_cols].apply(pd.to_numeric, errors='coerce')

#error when fitting due to invalid data types
xbg_onehot_model.fit(train_x_onehot, train_y)

In [None]:
#embedding kmer sequence col
import gensim

#cols we want to select
selected_columns = ['AA_count', 'AC_count', 'AG_count', 'AT_count', 'A_count', 'CA_count', 'CC_count', 'CG_count', 'CT_count', 'C_count', 'GA_count', 'GC_count', 'GG_count', 'GT_count', 'G_count', 'TA_count', 'TC_count', 'TG_count', 'TT_count', 'T_count', 'dwelling_length1', 'dwelling_length2', 'dwelling_length3', 'kmer_sequence', 'mean_signal1', 'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2', 'sd_signal3']
train_x_embedded = train_x[selected_columns]

#train Word2Vec model on kmer_sequence
word2vec_model = gensim.models.Word2Vec(sentences=train_x_embedded['kmer_sequence'], vector_size=100, window=5, min_count=1, sg=0)

#function to get average word2vec
def average_word2vec(kmer_sequence, model, size):
    words = kmer_sequence.split()
    vectorized = [model.wv[word] for word in words if word in model.wv]
    if not vectorized:
        return [0.0] * size
    return np.mean(vectorized, axis=0)

#convert kmer_sequences to word embeddings and average it
train_x_embedded['kmer_sequence'] = train_x_embedded['kmer_sequence'].apply(lambda x: average_word2vec(x, word2vec_model, size=100))

#flatten into individual numeric features
train_x_embedded = pd.concat([train_x_embedded, train_x_embedded['kmer_sequence'].apply(pd.Series)], axis=1)

# Drop the original 'kmer_sequence' column
train_x_embedded = train_x_embedded.drop('kmer_sequence', axis=1)

# Apply SMOTE to oversample
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
train_x_embedded_resampled, train_y_embedded_resampled = smote.fit_resample(train_x_embedded, train_y)


In [18]:
#Normalise
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
signal_features = ['dwelling_length1', 'dwelling_length2',
       'dwelling_length3', 'mean_signal1',
       'mean_signal2', 'mean_signal3', 'sd_signal1', 'sd_signal2',
       'sd_signal3']
train_x_embedded_resampled[signal_features] = scaler.fit_transform(train_x_embedded_resampled[signal_features])