## Build a machine learning  model for the dataset whether HIV-1 protease will cleave in the central position (between amino acids 4 and 5)

### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append('/home/admin1/PycharmProjects/Machine Learning using libraries/')
from ipynb.fs.full.ml_library import *
import os
os.chdir('/home/admin1/PycharmProjects/Machine Learning using libraries/Classification/Datasets & pickled objects/')

# reading dataset from different files & storing in pandas dataframe
hiv_data = pd.read_table('746Data.txt', sep=',',names=['octamer','cleaves'])
hiv_data = hiv_data.append(pd.read_table('1625Data.txt', sep=',',names=['octamer','cleaves']))
hiv_data = hiv_data.append(pd.read_table('schillingData.txt', sep=',',names=['octamer','cleaves']))
hiv_data = hiv_data.append(pd.read_table('impensData.txt', sep=',',names=['octamer','cleaves']))
hiv_data.head()

Unnamed: 0,octamer,cleaves
0,AAAKFERQ,-1
1,AAAMKRHG,-1
2,AAAMSSAI,-1
3,AAKFERQH,-1
4,AAKFESNF,-1


#### Using already done data preprocessing steps

In [2]:
import joblib
file = open('DataProcessingHIV.pkl', 'rb')
feature = joblib.load(file)
label = joblib.load(file)
one_hot_encode = joblib.load(file)
file.close()

In [3]:
# using custom library fn for dividing octamer column into 8 separate features
x_values = separate_feature_column(hiv_data, feature)
x_values[:10]

array([['A', 'A', 'A', 'K', 'F', 'E', 'R', 'Q'],
       ['A', 'A', 'A', 'M', 'K', 'R', 'H', 'G'],
       ['A', 'A', 'A', 'M', 'S', 'S', 'A', 'I'],
       ['A', 'A', 'K', 'F', 'E', 'R', 'Q', 'H'],
       ['A', 'A', 'K', 'F', 'E', 'S', 'N', 'F'],
       ['A', 'A', 'M', 'K', 'R', 'H', 'G', 'L'],
       ['A', 'A', 'S', 'S', 'S', 'N', 'Y', 'C'],
       ['A', 'A', 'V', 'L', 'A', 'E', 'A', 'M'],
       ['A', 'C', 'E', 'G', 'N', 'P', 'Y', 'V'],
       ['A', 'C', 'K', 'N', 'G', 'Q', 'T', 'N']], dtype='<U32')

In [4]:
x_values.shape[-1]

8

#### one hot encoding for each column & created dense feature matrix

In [5]:
x_values = one_hot_encode.transform(x_values)
# created dense matrix from sparse matrix
x_values = x_values.todense()

y_values = hiv_data[label].values

In [6]:
x_values[:10]

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]])

####  Getting prior probabilities

In [7]:
total = len(y_values)
prior_counts = np.array([len(y_values[y_values==-1]), len(y_values[y_values==1])])
prior_probabilities = prior_counts/total
prior_probabilities

array([0.79362671, 0.20637329])

#### Splitting dataset into train set & test set

In [20]:
from sklearn.model_selection import train_test_split
train_x_values, test_x_values, train_y_values, test_y_values = train_test_split(x_values, y_values, train_size=0.8, random_state=0)

### Building GaussianNB Classification model

In [31]:
from sklearn.naive_bayes import GaussianNB 

classifier = GaussianNB()
classifier.fit(train_x_values, train_y_values)

GaussianNB(priors=None, var_smoothing=1e-09)

#### Storing reusable objects into pickle file

In [32]:
import joblib

file = open('GaussianNBModelHIV.pkl', 'wb')
joblib.dump(classifier, file)
file.close()

#### Storing predictions for test set

In [33]:
test_prediction = classifier.predict(test_x_values)

### Evaluatinig model against test set

In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(f'Accuracy score: {accuracy_score(test_y_values, test_prediction)}')

Accuracy score: 0.8368740515933232


In [35]:
print(f'Confusion matrix:\n {confusion_matrix(test_y_values, test_prediction)}')

Confusion matrix:
 [[842 201]
 [ 14 261]]
