The task is to look at a window of 60 DNA sequence elements (called "nucleotides" or "base pairs") and decide whether it contains
	a) an "intron -> exon" boundary (IE) [these are sometimes called "acceptors"]
	b) an "exon -> intron" boundary (EI) [these are sometimes called "donors"]
	c) neither (N)
 
 Data Source is here:
 https://archive.ics.uci.edu/ml/datasets/Molecular+Biology+%28Splice-junction+Gene+Sequences%29

In [5]:
# -*- coding: utf-8 -*-


import re
from collections import Counter

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


#Load the splice-junction data; the columns used below are 'Class', 'Donor' and 'Sequence'
dataset = pd.read_csv('data.csv')


#Take a look at the data (count / unique / most frequent value / its frequency per column)
dataset.describe()

#The summary shows 3190 rows, of which 3178 donor names and 3092 sequences are unique,
#so some donor names and some sequences appear more than once


Unnamed: 0,Class,Donor,Sequence
count,3190,3190,3190
unique,3,3178,3092
top,N,HUMMYLCA-DONOR-2388,CAAATTGTGGACGTGATTCCCTTTCTCAGGGTGAG...
freq,1655,2,3


In [6]:
#Flag every repeated donor name: the first occurrence stays False, later ones are True
dataset['Duplicate'] = dataset.duplicated('Donor')

#Show the row labels of the flagged rows (boolean mask applied directly to the index)
print (dataset.index[dataset['Duplicate']].tolist())




[86, 559, 560, 561, 562, 563, 853, 1317, 1327, 1328, 1329, 1330]


In [7]:
#Spot-check one flagged row: take the donor name at position 86 ...
dup_donor = dataset['Donor'].iloc[86]

#... and list every row label carrying that same donor name
print (dataset.index[dataset['Donor'] == dup_donor].tolist())

#rows 84 and 86 carry the same donor string


[84, 86]


In [8]:
#Print the two sequences with all whitespace removed so they can be compared by eye.
#The pattern is a raw string: "\s" inside a plain string literal is an invalid escape
#sequence and triggers a warning on modern Python versions.
print (re.sub(r"\s+","",dataset.iloc[84]['Sequence']), re.sub(r"\s+","",dataset.iloc[86]['Sequence']))

#this pair has the same sequence, so the repeated donor rows are treated as redundant and skipped

GACGATAAGGAGACCTGCTTTGCCGAGGAGGTACTACAGTTCTCTTCATTTTAATATGTC GACGATAAGGAGACCTGCTTTGCCGAGGAGGTACTACAGTTCTCTTCATTTTAATATGTC


In [9]:

#Filter out duplicate donors (the first occurrence of each donor name is kept).
#The mask is recomputed here so this cell does not depend on the 'Duplicate'
#column having been created by an earlier cell.
data = dataset.loc[~dataset.duplicated('Donor')].reset_index(drop=True)

#Create target variable with encoded class.
#Labels are matched exactly (not by regex substring replacement) and encoded as
#integers 0..2 rather than the strings '0'..'2', which is the form classifiers
#such as XGBoost expect for multi-class targets.
CLASS_CODES = {'EI': 0, 'IE': 1, 'N': 2}
y = data['Class'].str.strip().map(CLASS_CODES)

#Fail loudly on a label that is not in CLASS_CODES instead of silently
#carrying a missing target into model training
unknown_classes = data.loc[y.isna(), 'Class'].unique().tolist()
if unknown_classes:
    raise ValueError("Unexpected class labels: {}".format(unknown_classes))

y = y.astype(int)

#Function to calculate letter frequency in a word
def letterCount(word):
    """Return a dict mapping each distinct character of `word` to its number of occurrences.

    Keys appear in order of first occurrence in `word`; an empty `word`
    yields an empty dict.
    """
    # Counter tallies everything in one pass, instead of rescanning the whole
    # word with word.count() for every character (repeated ones included).
    return dict(Counter(word))


#Build the feature matrix, one row per sequence:
#  - columns 0..len(seq)-1 : Unicode code point of the character at that position
#  - one column per letter : number of times that letter occurs in the sequence
#Rows are collected as dicts and the frame is created once at the end; assigning
#X.loc[i, j] one cell at a time enlarges the frame on every new row/column,
#which is very slow for thousands of rows.
records = []
for raw_seq in data['Sequence']:

    #Remove whitespace (raw string: "\s" is an invalid escape in a plain literal)
    seq = re.sub(r"\s+", "", raw_seq)

    #Convert char to unicode
    row = {pos: ord(ch) for pos, ch in enumerate(seq)}

    #Feature Engineering: frequency of each letter
    row.update(letterCount(seq))

    records.append(row)

#Same row labels as `data`; float dtype matches what cell-wise assignment produced
X = pd.DataFrame(records, index=data.index, dtype=float)

#Replace missing values of frequency counts with 0
#(a letter that never occurs in a sequence has no entry in that row's dict)
X = X.fillna(0)



In [14]:
#Summary statistics of the feature matrix: the integer-named columns hold
#per-position character codes, the letter-named columns hold per-sequence counts
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,C,G,A,T,N,D,R,S
count,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,...,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0,3178.0
mean,71.554122,71.535557,71.658276,71.551605,71.680302,71.640025,71.581183,71.650094,71.642542,72.000944,...,71.520138,71.645689,15.778792,15.751101,13.940214,14.511013,0.017621,0.000629,0.000315,0.000315
std,7.153631,7.271922,7.235089,7.249761,7.448685,7.229547,7.313983,7.409403,7.355018,7.526702,...,7.276049,7.102581,5.712102,5.487449,5.012061,5.243703,0.73301,0.025082,0.017739,0.017739
min,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,...,65.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,67.0,67.0,67.0,67.0,65.0,67.0,67.0,67.0,67.0,67.0,...,67.0,67.0,12.0,12.0,10.0,11.0,0.0,0.0,0.0,0.0
50%,71.0,67.0,71.0,67.0,67.0,67.0,67.0,67.0,67.0,71.0,...,67.0,71.0,15.0,15.0,13.0,14.0,0.0,0.0,0.0,0.0
75%,71.0,71.0,71.0,71.0,84.0,71.0,71.0,71.0,71.0,84.0,...,71.0,71.0,20.0,19.0,17.0,18.0,0.0,0.0,0.0,0.0
max,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,...,84.0,84.0,40.0,37.0,44.0,44.0,41.0,1.0,1.0,1.0


In [10]:



#Build the model
from sklearn.model_selection import KFold,cross_val_score
import xgboost as xgb


# train model
# `verbosity=0` replaces `silent=1`: XGBoost deprecated the `silent` flag in favour
# of `verbosity` and later dropped it from the scikit-learn wrapper.
model = xgb.XGBClassifier(
    max_depth=10,
    min_child_weight=3,
    subsample=0.5,
    objective='multi:softprob',
    verbosity=0,
)

# 10-fold cross-validation; shuffling is seeded so the fold assignment is repeatable
seed = 5
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# one score per fold (the classifier's default scorer, i.e. accuracy)
results = cross_val_score(model, X, y, cv=kfold)


print ("Average Accuracy = ",np.average(results))



Average Accuracy =  0.960668015793
