In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing.imputation import Imputer
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier
import matplotlib.pyplot as plt
import cPickle as pickle
import numpy as np

# Selecting Feature Group
group_features() allows us to select a particular group of features, i.e., protein features, ligand features or protein-ligand features.
The allowed values for "group_name" are 0, 1 or 2
- 0 - Protein Features
- 1 - Ligand Features
- 2 - Protein Ligand Features

In [None]:
# all_feature_names is a list of all the features obtained from the parser output file.
# group_name can be 0, 1 or 2.
# 0 - Protein Features
# 1 - Ligand Features
# 2 - Protein-Ligand Features
def group_features(all_feature_names, group_name):
    """Return the entries of ``all_feature_names`` that belong to one feature group.

    Parameters
    ----------
    all_feature_names : sequence of str
        All feature names obtained from the parser output file. Lists and
        NumPy arrays are both accepted.
    group_name : int
        Which group to select: 0 - protein features, 1 - ligand features,
        2 - protein-ligand features.

    Returns
    -------
    numpy.ndarray
        The names from ``all_feature_names`` that are listed under the
        ``'feat_name'`` key of the group's pickle file, in their original order.

    Raises
    ------
    ValueError
        If ``group_name`` is not 0, 1 or 2.
    """
    group_files = {
        0: 'data/protein_features.pkl',
        1: 'data/ligand_features.pkl',
        2: 'data/protein_ligand_features.pkl',
    }
    if group_name not in group_files:
        raise ValueError(
            "group_name must be 0, 1 or 2, got %r" % (group_name,))

    # NOTE: unpickling can execute arbitrary code; these files must come from
    # a trusted source.
    with open(group_files[group_name], 'rb') as handle:
        feature_names = pickle.load(handle)

    # Convert up front so that plain lists can be mask-indexed as well.
    names = np.asarray(all_feature_names)

    # Boolean mask of the names that belong to the requested group.
    in_group = np.isin(names, feature_names['feat_name'])
    return names[in_group]

# Hyper Parameter Tuning

The AdaBoost Classifier requires us to specify the number of weak learners to be used in training the model. In order to find the number of weak learners that yields the best result, I have implemented a function parameter_tuning() that trains the model with a varying number of weak learners, in increments of 5, and tests each setting with 5-fold cross validation (five stratified shuffle splits, each holding out 20% of the data). Taking into account the computing resources required for the process, I have restricted the maximum number of weak learners to 600.

In [None]:
def parameter_tuning(X, Y):
    """Plot the mean validation F-score of AdaBoost against ``n_estimators``.

    For every ensemble size in 5, 10, ..., 600 an ``AdaBoostClassifier`` is
    trained and scored on five stratified shuffle splits (20% of the samples
    held out in each), and the F-scores of the five splits are averaged. The
    averages are then drawn as a curve; nothing is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    Y : pandas.Series
        Binary labels aligned with ``X``.
    """
    # Candidate ensemble sizes: 5 to 600 in steps of 5.
    estimator_list = np.arange(5, 601, 5)

    # The fixed random_state makes every ensemble size see the same splits.
    splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    def split_f_score(n_estimators, train_index, test_index):
        # Train on one split and return the F-score on its held-out part.
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=0)
        model.fit(X.iloc[train_index], Y.iloc[train_index])
        held_out_labels = Y.iloc[test_index]
        predictions = model.predict(X.iloc[test_index])
        metrics = precision_recall_fscore_support(held_out_labels, predictions, average='binary')
        return metrics[2]

    final_score_list = []
    for n_estimators in estimator_list:
        split_scores = [
            split_f_score(n_estimators, train_index, test_index)
            for train_index, test_index in splitter.split(X, Y)
        ]
        final_score_list.append(sum(split_scores) / len(split_scores))

    # Mean F-score as a function of the number of base estimators.
    plt.plot(estimator_list, final_score_list, 'r', label='test f_score')
    plt.xlabel('n_estimators')
    plt.ylabel('f_score')
    plt.legend(loc='best')
    plt.show()


# AdaBoost Classifier
The AdaBoost Classifier is composed of a number of weak learners whose outputs are combined using a weighted sum to produce the final output of the boosted model. For more information on AdaBoost, here is a useful [link](http://mccormickml.com/2013/12/13/adaboost-tutorial/).

feature_selection()
- Uses a Random Forest (RandomForestRegressor) to determine the feature importances and returns the 10 most important features.

preprocessing()
- Does basic preprocessing on the data which includes accounting for missing values, imputation strategy, etc

sep_cluster()
- Used to separate individual protein clusters from the data, if needed
- Shuffles the data and takes an equal number of positive and negative samples (up to 4000 each) for training

k_cross_validation()
- Reports the average accuracy, precision, recall and F-score over k validation splits; it is used to find the hyperparameters that produce the best result.

adaboost()
- The classifier is trained on the features obtained from feature selection module.
- If the flag use_cross_val is set to 1, it tests on the cross validation set and if it is set to 0, then it tests on the blind test set.  


In [None]:
def feature_selection(X, Y, col_names):
    """Rank features with a random forest and return the ten most important.

    A ``RandomForestRegressor`` is fitted on ``(X, Y)`` and its
    ``feature_importances_`` (rounded to four decimals) are paired with
    ``col_names``. The ten best-scoring features are printed and their names
    returned.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``RandomForestRegressor.fit``.
    Y : array-like
        Targets passed to ``RandomForestRegressor.fit``.
    col_names : sequence of str
        Feature names, in the same order as the columns of ``X``.

    Returns
    -------
    list of str
        Up to ten feature names with surrounding whitespace stripped,
        highest importance first.
    """
    rf = RandomForestRegressor(n_jobs=-1, random_state=0)
    rf.fit(X, Y)

    # (score, name) tuples sorted in reverse: highest score first, with ties
    # broken by name in descending order.
    features = sorted(
        ((round(score, 4), name)
         for score, name in zip(rf.feature_importances_, col_names)),
        reverse=True)
    top_features = [(score, name.strip()) for score, name in features[:10]]

    # The header used to be printed with no listing after it; show the
    # features that are actually returned.
    print("Features sorted by their score:")
    for score, name in top_features:
        print("%s\t%s" % (score, name))

    return [name for _, name in top_features]

def preprocessing(df_cluster):
    """Replace ``'na'`` markers with NaN and impute missing values.

    Parameters
    ----------
    df_cluster : pandas.DataFrame
        Data whose missing entries are either NaN or the string ``'na'``.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels, built from the imputer's
        output array, so it carries a fresh default index rather than the
        index of ``df_cluster``.
    """
    # replace() without inplace=True returns a new frame, so the caller's
    # DataFrame (possibly a slice of a larger one) is left untouched.
    cleaned = df_cluster.replace(to_replace='na', value=np.nan)

    # imputes missing values by the mean of the respective column
    imputer = Imputer()
    return pd.DataFrame(imputer.fit_transform(cleaned), columns=cleaned.columns)

def sep_cluster(cluster_name):
    """Load the training data and return a shuffled positive/negative subset.

    Parameters
    ----------
    cluster_name
        A value of the ``cluster_number`` column to restrict the data to, or
        the string ``'all_cluster'`` to keep every row.

    Returns
    -------
    pandas.DataFrame
        Up to 4000 rows with ``label == 1`` followed by up to 4000 rows with
        ``label == 0``, without the ``mmgbsaEnergy`` and ``cluster_number``
        columns.
    """
    frame = pd.read_csv('data/ml_pro_features_labels_train_final.csv')

    # Keep only the requested protein cluster unless all clusters are wanted.
    if cluster_name != 'all_cluster':
        frame = frame[frame['cluster_number'] == cluster_name]

    frame = preprocessing(frame).drop(['mmgbsaEnergy', 'cluster_number'], axis=1)

    # Shuffle so that the rows taken per class below are a random selection.
    shuffled = frame.sample(frac=1).reset_index(drop=True)

    # Positives first, then negatives, at most 4000 of each.
    per_class = [shuffled[shuffled['label'] == label].head(4000) for label in (1, 0)]
    return pd.concat(per_class)

def k_cross_validation(X, Y, k):
    """Print AdaBoost metrics averaged over ``k`` stratified shuffle splits.

    For each split an ``AdaBoostClassifier`` with 1000 estimators is trained
    on 80% of the samples and evaluated on the remaining 20%. The mean
    accuracy, precision, recall and F-score over all splits are printed;
    nothing is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    Y : pandas.Series
        Binary labels aligned with ``X``.
    k : int
        Number of shuffle splits to evaluate.
    """
    acc_list = []
    precision_list = []
    recall_list = []
    f_score_list = []

    # k random stratified 80/20 splits (not disjoint folds)
    ss = StratifiedShuffleSplit(n_splits=k, test_size=0.2, random_state=0)
    for train_index, test_index in ss.split(X, Y):
        model = AdaBoostClassifier(n_estimators=1000, random_state=0)
        model.fit(X.iloc[train_index], Y.iloc[train_index])

        Y_test = Y.iloc[test_index]
        predicted_labels = model.predict(X.iloc[test_index])

        acc_list.append(accuracy_score(Y_test, predicted_labels))
        precision, recall, f_score, _ = precision_recall_fscore_support(
            Y_test, predicted_labels, average='binary')
        precision_list.append(precision)
        recall_list.append(recall)
        f_score_list.append(f_score)

    # %-formatting produces the same text under Python 2 and Python 3,
    # unlike the two-argument print statement.
    report = (
        ("Accuracy", acc_list),
        ("Precision", precision_list),
        ("Recall", recall_list),
        ("fscore", f_score_list),
    )
    for label, values in report:
        print("%s %s" % (label, sum(values) / len(values)))
    
def adaboost(use_cross_val):
    """Train AdaBoost on the selected features and print its metrics.

    Parameters
    ----------
    use_cross_val : int
        If 1, evaluate with ``k_cross_validation`` on five splits of the
        training data. Any other value trains on the full training sample
        and evaluates on the blind test set read from
        ``data/ml_pro_features_labels_test_final.csv``.
    """
    pos_neg_comb = sep_cluster('all_cluster')
    # sep_cluster returns positives followed by negatives; mix them.
    pos_neg_comb = pos_neg_comb.sample(frac=1).reset_index(drop=True)
    Y = pos_neg_comb['label']
    X = pos_neg_comb.drop(['label'], axis=1)

    # Features identified after feature selection
    feat_names = ['H-049', 'P_VSA_MR_8', 'F03[N-Cl]', 'F01[C-N]', 'B03[N-Cl]', 'T(N..N)', 'PCR', 'SaaN', 'SsssCH', 'Eig02_EA(dm)']

    # To train on a single feature group instead, replace feat_names with
    # group_features(X.columns.values, <group>) for group 0, 1 or 2.
    X = X[feat_names]

    if use_cross_val == 1:
        k_cross_validation(X, Y, 5)
        return

    test_set = pd.read_csv('data/ml_pro_features_labels_test_final.csv')
    Y_test = test_set['label']
    X_test = test_set.drop(['label', 'moleculeName'], axis=1)
    X_test = X_test[feat_names]

    # preprocessing the test data
    # NOTE(review): preprocessing() fits a fresh imputer on whatever it is
    # given, so the test set is imputed with its own column means rather
    # than the training means - confirm this is intended.
    X_test = preprocessing(X_test)

    # Named `model` so the local does not shadow this function's own name.
    model = AdaBoostClassifier(n_estimators=1000, random_state=0)
    model.fit(X, Y)
    predicted_labels = model.predict(X_test)

    acc = accuracy_score(Y_test, predicted_labels)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        Y_test, predicted_labels, average='binary')

    # %-formatting produces the same text under Python 2 and Python 3.
    print("Accuracy %s" % (acc,))
    print("Precision %s" % (precision,))
    print("Recall %s" % (recall,))
    print("fscore %s" % (f_score,))

# Script entry point: evaluate the classifier on the cross-validation splits.
if __name__ == "__main__":
    adaboost(use_cross_val=1)