In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as opt
from sklearn.metrics import roc_auc_score,roc_curve,plot_roc_curve,auc,accuracy_score,precision_score,recall_score,f1_score
from  sklearn import linear_model
import itertools
import math
import time
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the feature table that the modelling cells below read from.
infile = 'model.feature.csv'
indf = pd.read_csv(infile)
print(indf.shape)
# A bare `indf.head()` in the middle of a cell is evaluated and thrown away
# (only the cell's last expression is displayed), so print the preview explicitly.
print(indf.head())

# Load the gene table and extract its `gene_symbol` column as a plain list;
# these symbols are later used as feature column names.
degfile = 'model.deg.csv'
degdf = pd.read_csv(degfile)
print(degdf.shape)
print(degdf)
deg = degdf['gene_symbol'].tolist()
deg

In [None]:
# Exhaustive search: fit a logistic regression on every combination of
# 1..max_comb features and write the mean cross-validation metrics of each
# combination to `outfile` (one CSV row per combination).

# ---- Configuration ---------------------------------------------------------
# Prefixes used to build the immune feature column names.
rcpt = ['IGH','IGK','IGL','TRA','TRB']
richness = [i+'_richness' for i in rcpt]
diversity = [i+'_diversity' for i in rcpt]
clonality = [i+'_clonality' for i in rcpt]
feature_list = diversity + richness + clonality + deg
max_comb = 5
# Columns of `indf` whose values ('train' / 'test') define each CV fold.
cv_columns = ['cv1','cv2','cv3']
outfile = 'cv_model.immune+deg.max_comb'+str(max_comb)+'.results.csv'
print(outfile)

# ---- Progress bookkeeping --------------------------------------------------
# Count the combinations with exact integer arithmetic. Float division of
# factorials loses precision and overflows for long feature lists, and
# factorial() of a negative number raises when max_comb > len(feature_list),
# so the combination size is capped at the number of available features.
n_features = len(feature_list)
largest_comb = min(max_comb, n_features)
total_num = sum(
    math.factorial(n_features) // (math.factorial(x) * math.factorial(n_features - x))
    for x in range(1, largest_comb + 1)
)
print('total combs: {}'.format(total_num))
# Number of combinations that make up one progress step (0 when total_num < 100,
# in which case no progress lines are printed).
one_percent = total_num // 100

# ---- Fold splits -----------------------------------------------------------
# The train/test rows of each fold do not depend on the feature combination,
# so split once here instead of re-filtering `indf` for every combination.
cv_splits = []
for cv in cv_columns:
    traindf = indf[indf[cv]=='train']
    testdf = indf[indf[cv]=='test']
    cv_splits.append((traindf, traindf.loc[:,'type'], testdf, testdf.loc[:,'type']))

# ---- Search ----------------------------------------------------------------
running = 0
with open(outfile, 'w') as out:
    header = 'tag,feature_number,auc,accuracy,precision,recall,f1'
    out.write(header+'\n')
    for n in range(1,largest_comb+1):
        # Iterate lazily: materialising every combination as a list can
        # exhaust memory for larger n.
        for comb in itertools.combinations(feature_list,n):
            running += 1
            # Report at every multiple of one_percent, up to the 100th step.
            if one_percent and running % one_percent == 0 and running // one_percent <= 100:
                print('running: {0:2}%, No.{1:<8}, time:{2}'.format(running // one_percent,running,time.ctime()))
            model_feature = list(comb)
            # The tag identifies the combination: feature names joined by '+'.
            tag = '+'.join(model_feature)
            allauc=[]; accuracy=[]; precision=[]; recall=[]; f1=[]
            for traindf, trainy, testdf, testy in cv_splits:
                trainx = traindf.loc[:,model_feature]
                testx = testdf.loc[:,model_feature]
                lr = linear_model.LogisticRegression()
                lr.fit(trainx, trainy)
                test_prob = lr.predict_proba(testx)
                # Predict once and reuse the labels for all four label-based metrics.
                test_pred = lr.predict(testx)
                # Pass the fitted class order explicitly so the probability
                # columns are matched to labels the same way as in the
                # validation step of this notebook.
                allauc.append(roc_auc_score(testy, test_prob, labels=lr.classes_, multi_class='ovr'))
                accuracy.append(accuracy_score(testy, test_pred))
                precision.append(precision_score(testy, test_pred, average='macro'))
                recall.append(recall_score(testy, test_pred, average='macro'))
                f1.append(f1_score(testy, test_pred, average='macro'))
            feature_number = len(model_feature)
            # Average every metric over the folds; column order matches `header`.
            fold_means = [np.mean(m) for m in (allauc, accuracy, precision, recall, f1)]
            line = ','.join([tag, str(feature_number)] + [str(m) for m in fold_means])
            out.write(line+'\n')

In [None]:
# validation best_model
# Number of top-ranked feature combinations to validate, per feature family.
n_best = {'immune':20,'deg':7,'immune+deg':469}
feature_comb = 'immune+deg'
feature_comb_file = 'cv_model.'+feature_comb+'.max_comb5.results.csv'
combdf = pd.read_csv(feature_comb_file)
# The search step writes its rows in enumeration order, not by score, so taking
# the first N rows only yields the best models if the file was sorted by hand.
# Rank explicitly before selecting. The sort is stable, so a file that is
# already ordered by this metric keeps exactly the same order.
# NOTE(review): ranking by mean CV AUC is an assumption -- confirm this is the
# metric the n_best cut-offs were chosen against.
rank_metric = 'auc'
combdf_ranked = combdf.sort_values(rank_metric, ascending=False, kind='mergesort')
tag_list = combdf_ranked['tag'].head(n_best[feature_comb]).reset_index(drop=True)
print(tag_list)
infile = 'model.feature.csv'
indf = pd.read_csv(infile)
print(indf.shape)
# indf.head()

In [None]:
# Refit each selected feature combination on all non-validation rows and score
# it on the rows marked 'validation'; one CSV row is written per combination.
outfile = 'model.best_validation.'+feature_comb+'.csv'

# The split and the labels are the same for every combination, so build them
# once instead of re-filtering `indf` inside the loop.
cv = 'cv1'
traindf = indf[indf[cv] != 'validation']
testdf = indf[indf[cv] == 'validation']
trainy = traindf.loc[:,'type']
testy = testdf.loc[:,'type']

with open(outfile, 'w') as out:
    header = 'tag,feature_number,auc,accuracy,precision,recall,f1'
    out.write(header+'\n')
    for tag in tag_list:
        # A tag is the '+'-joined list of feature column names.
        model_feature = tag.split('+')
        trainx = traindf.loc[:,model_feature]
        testx = testdf.loc[:,model_feature]
        lr = linear_model.LogisticRegression()
        lr.fit(trainx, trainy)
        test_prob = lr.predict_proba(testx)
        # Predict once and reuse the labels for all four label-based metrics.
        test_pred = lr.predict(testx)
        allauc = roc_auc_score(testy, test_prob, labels=lr.classes_, multi_class='ovr')
        accuracy = accuracy_score(testy, test_pred)
        precision = precision_score(testy, test_pred, average='macro')
        recall = recall_score(testy, test_pred, average='macro')
        f1 = f1_score(testy, test_pred, average='macro')
        feature_number = len(model_feature)
        line = '{0},{1},{2},{3},{4},{5},{6}'.format(tag,feature_number,allauc,accuracy,precision,recall,f1)
        out.write(line+'\n')
