In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn
import sklearn.datasets
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm
import sklearn.ensemble

import os

In [None]:

# Directory that holds the input files. The default is the location the
# notebook was originally written against; set the ML_DATA_DIR environment
# variable to run it on another machine without editing this cell.
DATA_DIR = os.environ.get("ML_DATA_DIR", "C:\\Users\\Will\\Desktop\\ML")
os.chdir(DATA_DIR)
os.getcwd()

In [None]:
# File names are resolved relative to the current working directory.
# Annotation table: read by Main() for its header row and by loadgem().
annotation = 'conditions_nozero.txt'
# Matrix file passed to loadgem().
# NOTE(review): this name shadows the builtin input() for the rest of the
# notebook; it is kept because Main() reads it under this name.
input = 'grepgem.txt'

In [None]:
def loadgem(gem, annot, attribute):
    """Load a matrix file, attach one annotation column, and sort by it.

    Parameters
    ----------
    gem : path or file-like
        Tab-separated table; it is transposed after reading, so its columns
        become the rows of the result.
    annot : path or file-like
        Tab-separated annotation table containing a column named
        ``attribute``.
    attribute : str
        Annotation column to append to the transposed matrix.

    Returns
    -------
    pandas.DataFrame
        The transposed matrix with ``attribute`` added as a column, missing
        values replaced by -10, and rows sorted by ``attribute``.

    Notes
    -----
    The annotation values are attached by position: the i-th annotation row
    is given the i-th row label of the transposed matrix.
    NOTE(review): this assumes both files list samples in the same order and
    have the same number of samples -- confirm against the data.
    """
    expression = pd.read_table(gem, sep='\t').T
    annotations = pd.read_table(annot, sep='\t')

    # Re-label the annotation rows with the matrix's row labels so the
    # column assignment below lines up one-to-one.
    labels = annotations[attribute].to_frame().set_index(expression.index)
    expression[attribute] = labels

    # fillna applies to the whole frame, label column included.
    return expression.fillna(-10).sort_values(attribute)



In [None]:
def train(gem, attribute, test_size=0.3, random_state=None):
    """Fit a random-forest classifier predicting ``attribute`` from the other columns.

    Parameters
    ----------
    gem : pandas.DataFrame
        One row per sample. Must contain a column named ``attribute`` with
        the class labels; every other column is used as a feature.
    attribute : str
        Name of the label column.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.3, the value that
        was previously hard-coded.
    random_state : int or None, optional
        Seed passed to both the split and the forest. The default ``None``
        keeps the original unseeded behaviour; pass an int to make a run
        reproducible.

    Returns
    -------
    tuple
        ``(y_test, y_pred, X_train, model)`` -- held-out labels, predictions
        for the held-out samples, the training features, and the fitted
        classifier.
    """
    # Remove the label by name instead of assuming it is the last column, so
    # the label can never end up among the features.
    X = gem.drop(columns=[attribute])
    y = gem.loc[:, attribute]

    # split the dataset into train and test sets
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # print shapes of train set and test set
    print("X_train shape: (%d, %d)" % X_train.shape)
    print("y_train shape: (%d,)" % y_train.shape)
    print("X_test shape: (%d, %d)" % X_test.shape)
    print("y_test shape: (%d,)" % y_test.shape)

    model = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Fraction of correctly predicted held-out samples, as a percentage.
    accuracy = sklearn.metrics.accuracy_score(y_test, y_pred) * 100
    print("Accuracy: %0.2f%%" % accuracy)

    return y_test, y_pred, X_train, model
    

In [None]:

def confusion(gem, attribute, obj):
    """Plot a confusion-matrix heatmap for one annotation column.

    Parameters
    ----------
    gem : pandas.DataFrame
        Frame containing the label column ``attribute``; its distinct values
        define the classes shown on both axes.
    attribute : str
        Name of the label column; also used in the plot title.
    obj : sequence
        ``obj[0]`` holds the ground-truth labels and ``obj[1]`` the predicted
        labels (the first two items of the tuple returned by ``train``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Distinct labels in order of first appearance.
    classes = list(dict.fromkeys(gem[attribute]))

    # compute confusion matrix for the ground truth and predicted labels.
    # Passing labels= pins the row/column order and size to `classes`, so the
    # tick labels below always match the matrix -- even when a class is
    # missing from both the test split and the predictions.
    cnf_matrix = sklearn.metrics.confusion_matrix(obj[0], obj[1], labels=classes)

    # plot a heatmap of the confusion matrix (rows: truth, columns: prediction)
    title = "Confusion Matrix: " + str(attribute)
    sns.heatmap(
        cnf_matrix,
        annot=True,
        fmt="d",
        cbar=False,
        square=True,
        xticklabels=classes,
        yticklabels=classes,
    )
    plt.ylabel("Expected")
    plt.xlabel("Measured")
    plt.title(title)
    plt.show()


In [None]:
def extract(model):
    """Build a feature-importance table, most important feature first.

    Parameters
    ----------
    model : sequence
        Indexable result in the layout returned by ``train``: item 3 must
        expose ``feature_importances_`` and item 2 must expose ``columns``
        (used as the row labels).

    Returns
    -------
    pandas.DataFrame
        Single column ``'importance'``, indexed by feature name and sorted in
        descending order.
    """
    importances = model[3].feature_importances_
    feature_names = model[2].columns

    table = pd.DataFrame(importances, index=feature_names, columns=['importance'])
    return table.sort_values('importance', ascending=False)

In [None]:
def Main():
    """Run load -> train -> confusion plot -> feature importances per condition.

    The condition names are taken from the header row of the module-level
    ``annotation`` file (every tab-separated field after the first). For each
    one the module-level ``input`` matrix is loaded, a classifier is trained,
    its confusion matrix is plotted and its feature-importance table is
    printed. An empty annotation file yields no conditions and nothing is
    trained.
    """
    # Only the header row is needed; the context manager closes the file
    # as soon as it has been read.
    with open(annotation) as handle:
        header = handle.readline()

    # skip sample column
    conditions = header.strip().split('\t')[1:]
    print(conditions)

    for item in conditions:
        gem = loadgem(input, annotation, item)
        res = train(gem, item)
        confusion(gem, item, res)
        print(extract(res))

In [None]:
# Run the analysis: for every annotation column, train a classifier, plot its
# confusion matrix and print its feature-importance table.
Main()