## Import modules

In [201]:
from Bio.Seq import Seq
from Bio import SeqIO
import doctest
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import svm

## Generate combinations

In [133]:
num_to_char = {
    0 : 'a',
    1 : 't',
    2 : 'c',
    3 : 'g'
}

char_to_num = {
    'a' : 0,
    't' : 1,
    'c' : 2,
    'g' : 3
}

combinations = set()

negative_data =[]
positive_data = []

In [134]:
def decode(n):
    '''
    Test cases:
    
    >>> decode(0)
    'aaaa'
    
    >>> decode(1)
    'taaa'
    
    >>> decode(2)
    'caaa'
    
    >>> decode(255)
    'gggg'
    '''
    ret = ''
    for i in range(0, 8, 2):
        tmp = n & ((1 << i) | (1 << (i+1)))
        tmp = tmp >> i
        ch = num_to_char[tmp]
        ret = ret + ch
    return ret

In [135]:
doctest.testmod(verbose=False)

TestResults(failed=0, attempted=4)

In [136]:
def generate_combinations():
    for i in range(255):
        combination = decode(i)
        assert len(combination) == 4
        
        seq = Seq(combination)
        seq_rev = seq.reverse_complement()
        string = str(seq_rev)
        if (not(string in combinations)):
            combinations.add(combination)
            
    assert len(combinations) == 136
        

In [137]:
generate_combinations()

## Read training data

In [138]:
def read_data():
    

    for record in SeqIO.parse("vista1500", "fasta"):
        positive_data.append(str(record.seq).lower())

    for record in SeqIO.parse("randoms1500", "fasta"):
        negative_data.append(str(record.seq).lower())

In [139]:
read_data()

## Compute frequencies

In [142]:
def process_sequence(seq):
    
    frequency_map = {}
    
    for combination in combinations:
        frequency_map[combination] = 0
        
    assert(len(frequency_map) == len(combinations))
    
    for i in range(len(seq) - 4):
        
        combination = seq[i:i+4]
        
        if (combination in frequency_map):
            frequency_map[combination] = frequency_map[combination] + 1
        
        combination_reverse_complement = str(Seq(combination).reverse_complement())
        
        if (combination_reverse_complement in frequency_map):
            frequency_map[combination_reverse_complement] = frequency_map[combination_reverse_complement] + 1
        
    ret_arr = []
    for key in frequency_map:
        ret_arr.append( frequency_map[key] / 1500 )
    return ret_arr
        

In [158]:
def generate_training_data():
    
    x = []
    y = []
    
    for sq in positive_data:
        x.append(process_sequence(sq))
        y.append(1)
    for sq in negative_data:
        x.append(process_sequence(sq))
        y.append(0)
    return x, y

In [184]:
x, y = generate_training_data()

### shuffle the arrays

In [185]:
x, y = shuffle(x, y)

### split into train data and test data

In [295]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5) # 50% training and 30% test

### create random forest classifyer

In [296]:
clf=RandomForestClassifier(n_estimators=1000)

In [297]:
clf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### evaluate the random forest classifyer

In [298]:
y_test_pred=clf.predict(x_test)
y_train_pred=clf.predict(x_train)

In [299]:
print("Test Accuracy:",metrics.accuracy_score(y_test, y_test_pred))
print("Train Accuracy:",metrics.accuracy_score(y_train, y_train_pred))

Test Accuracy: 0.803082191780822
Train Accuracy: 1.0


### create support vector machine classifyer

In [206]:
clf = svm.SVC()

In [210]:
clf.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

### evaluate support vector machine classifyer

In [208]:
y_test_pred=clf.predict(x_test)

In [209]:
print("Test Accuracy:",metrics.accuracy_score(y_test, y_test_pred))

Accuracy: 0.7232524964336662


In [301]:
%pwd

'C:\\Users\\Habbab\\Project 4'