In [None]:
# importing required libraries 
import os
from itertools import cycle

import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns
from matplotlib import cm
from matplotlib.colors import ListedColormap 
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
try:
    from scipy import interp
except ImportError:  # the scipy.interp alias is gone from newer SciPy releases
    from numpy import interp
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc,confusion_matrix ,precision_recall_curve,f1_score,accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
try:
    from sklearn.externals import joblib
except ImportError:  # sklearn.externals.joblib was removed in scikit-learn 0.23
    import joblib

In [None]:
# Load the dataset from the CSV file 'data_new.csv'
# (the path is relative to the notebook's working directory).
dataset = pd.read_csv('data_new.csv') 

# Print the column labels so the names used in the later cells can be checked.
print(dataset.columns)


In [None]:
# Make the 'Gender' column available under the additional name 'Sex',
# which is the label the feature-selection and heat-map cells refer to.
dataset['Sex'] = dataset['Gender']

In [None]:
# Determine features and target.
# X holds the seven candidate predictors, y the class label column.
X = dataset.loc[:, ['Height', 'Weight', 'Age', 'BMI', 'Body Area', 'Sex', 'FeNO']]  # Features
y = dataset.loc[:, 'Species']  # Labels


In [None]:
# Pairwise correlation matrix of the feature columns in X
# (DataFrame.corr with its default settings).
# plot_examples() reads this variable when drawing the heat map.
cor_matrix = X.corr()



In [None]:
# Plot heat map
def plot_examples(colormaps, plotdata=None):
    """
    Draw a matrix as a heat map, one subplot per colormap, and save the figure.

    Parameters
    ----------
    colormaps : sequence of matplotlib colormaps
        One subplot is drawn for each entry.
    plotdata : pandas.DataFrame or 2-D array-like, optional
        Matrix to display. When omitted, the notebook-level ``cor_matrix``
        is used, so existing ``plot_examples([cmap])`` calls keep working.

    Side effects
    ------------
    Writes the figure to './heatmap.tif' at 600 dpi and shows it.
    """
    if plotdata is None:
        plotdata = cor_matrix

    n_rows, n_cols = np.shape(plotdata)
    # Take tick labels from the matrix itself when it carries them.
    columns = getattr(plotdata, 'columns', None)
    index = getattr(plotdata, 'index', None)
    x_labels = [str(c) for c in columns] if columns is not None else [str(j) for j in range(n_cols)]
    y_labels = [str(r) for r in index] if index is not None else [str(j) for j in range(n_rows)]

    # pcolormesh draws cell j over the interval [j, j + 1], so the label for
    # cell j belongs at j + 0.5 (the cell centre), not on an evenly spaced
    # grid that includes both outer edges.
    x_centers = np.arange(n_cols) + 0.5
    y_centers = np.arange(n_rows) + 0.5

    n = len(colormaps)
    fig, axs = plt.subplots(1, n, figsize=(n * 2 + 4, 4),
                            constrained_layout=True, squeeze=False)
    for ax, cmap in zip(axs.flat, colormaps):
        psm = ax.pcolormesh(plotdata, cmap=cmap, rasterized=True, vmin=-1, vmax=1)
        fig.colorbar(psm, ax=ax)
        # Label every subplot, not only the last one created.
        ax.set_xticks(x_centers)
        ax.set_yticks(y_centers)
        ax.set_xticklabels(x_labels, fontproperties='Arial', rotation=15)
        ax.set_yticklabels(y_labels, fontproperties='Arial')

    fig.savefig('./heatmap.tif', dpi=600)
    plt.show()
    

In [None]:
# Build a two-part colour table of 2 * N RGBA rows and plot the heat map with it.
#   vals  : fades from RGB (178, 2, 4)/256 up to white
#   vals1 : fades from white down to RGB (6, 38, 137)/256
# The alpha column is never written, so it keeps the initial value 1.
N = 256
vals = np.ones((N, 4))
vals1 = np.ones((N, 4))
for channel, (first_start, second_end) in enumerate(zip((178, 2, 4), (6, 38, 137))):
    vals[:, channel] = np.linspace(first_start / 256, 1, N)
    vals1[:, channel] = np.linspace(1, second_end / 256, N)

# Stack the two ramps row-wise and wrap them in a colormap.
newcolors = np.concatenate([vals, vals1], axis=0)
newcmp = ListedColormap(newcolors, name='OrangeBlue')
plot_examples([newcmp])


In [None]:

# Create a random forest classifier.
# class_weight assigns weight 1000 to class 0 and weight 1 to class 1;
# random_state fixes the seed used by the forest.
clf = RandomForestClassifier(n_estimators=1000, class_weight={0: 1000, 1: 1}, random_state=200)

# Train on the complete data set; this fit is used to obtain the feature importances.
clf.fit(X, y)

# Predict the same rows the model was fitted on, so the matrix below is a
# resubstitution result rather than a held-out estimate.
y_pred = clf.predict(X)

# Stored as `cm_full`: binding the result to `cm` would replace the
# `matplotlib.cm` module imported at the top of the notebook.
cm_full = confusion_matrix(y, y_pred)

# Print the confusion matrix.
print(cm_full)
    

In [None]:
# Feature importance: pair each importance score of the fitted forest with its
# column name, then order from most to least important.
feature_imp = pd.Series(clf.feature_importances_, index=X.columns[:7])
feature_imp = feature_imp.sort_values(ascending=False)
# Trailing expression so the notebook displays the ranking.
feature_imp

In [None]:

# Plot the feature importance as a horizontal bar chart.
# The bars use a 7-colour seaborn light palette derived from
# RGB (6, 38, 137)/256, requested in reversed order.
bar_palette = sns.light_palette((6/256.0, 38/256.0, 137/256.0), n_colors=7, reverse=True)
sns.barplot(x=feature_imp, y=feature_imp.index, palette=bar_palette)

# Axis labels
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')

# Tighten the layout, write the figure at 600 dpi, then display it.
plt.tight_layout(pad=0.05, w_pad=0.00, h_pad=0.0)
plt.savefig('./character.tif', dpi=600)
plt.show()

In [None]:
# Restrict the predictors to four columns for the model trained below;
# the target column is the same as before.
X = dataset.loc[:, ['Age', 'BMI', 'Body Area', 'FeNO']]  # Features
y = dataset.loc[:, 'Species']  # Labels


In [None]:
# Split dataset into training set and test set.
# test_size=0.25 holds out a quarter of the rows: 75% training / 25% test.
# Rows are shuffled before the split, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=40,
)

In [None]:
# 10-fold stratified cross-validation on the training split.
# For every fold the shared classifier `clf` is refitted, the fold's data and
# the fitted model are written to `save_dir`, and the fold's ROC curve is drawn.
# Each curve is also interpolated onto a common false-positive-rate grid so
# that a mean ROC curve can be plotted at the end.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)

# No random_state: without shuffle=True the seed has no effect on the split,
# and recent scikit-learn releases raise an error for that combination.
cv = StratifiedKFold(n_splits=10)
lw = 2  # line width; the scatter-plot cells further down reuse this value

# Output folder for per-fold data and models; created here so the first
# write cannot fail because the folder is missing.
save_dir = 'save6'
os.makedirs(save_dir, exist_ok=True)

##### k-fold cross-validation
for fold, (train, test) in enumerate(cv.split(X_train, y_train)):
    Xtrain, Xtest = X_train.iloc[train], X_train.iloc[test]
    ytrain, ytest = y_train.iloc[train], y_train.iloc[test]

    # Fit on this fold's training part and score its validation part.
    probas_ = clf.fit(Xtrain, ytrain).predict_proba(Xtest)
    # Class probabilities for every row of X (all rows, not only this fold's).
    ypredicted = clf.predict_proba(X)

    # Save the fold's data: '<fold>xtrain.csv', '<fold>xtest.csv', ...
    fold_tables = {
        'xtrain': Xtrain,
        'xtest': Xtest,
        'ytrain': ytrain,
        'ytest': ytest,
        'ypredicted': ypredicted,
    }
    for suffix, table in fold_tables.items():
        pd.DataFrame(table).to_csv('%s/%d%s.csv' % (save_dir, fold, suffix), header=True)

    # Save the model fitted on this fold as '<fold>.pkl'.
    joblib.dump(clf, '%s/%d.pkl' % (save_dir, fold))

    # Compute the ROC curve and the area under it.
    # `thresholds` are the decision thresholds at which each (fpr, tpr) point was computed.
    fpr, tpr, thresholds = roc_curve(ytest, probas_[:, 1])
    # Interpolate this fold's curve onto the common grid and accumulate it.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)

    # Hard predictions and confusion matrices for this fold (kept for inspection).
    ytrain_pred = clf.predict(Xtrain)
    ypred = clf.predict(Xtest)
    cm_train = confusion_matrix(ytrain, ytrain_pred)
    # Named `cm_test` so the imported `matplotlib.cm` module is not overwritten.
    cm_test = confusion_matrix(ytest, ypred)

    plt.plot(fpr, tpr, lw=lw,
             label='ROC fold %d (area = %0.2f)' % (fold, roc_auc))

# Chance diagonal
plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
         label='Luck')

# Average the accumulated curves and force the end point to (1, 1).
mean_tpr /= cv.get_n_splits(X_train, y_train)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Reload the classifier stored in 'save6/4.pkl', i.e. the model written by the
# cross-validation loop for fold index 4 (the fifth fold).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that you created yourself or otherwise trust.
clf3 = joblib.load('save6/4.pkl')

In [None]:
# Class probabilities predicted by the reloaded model for the training split.
y_train_pred = clf3.predict_proba(X_train[['Age', 'BMI', 'Body Area', 'FeNO']])

X_set = X_train[['FeNO']].values
y_set = y_train.values.tolist()

# Scatter FeNO against the predicted probability of class 1.
# One scatter call per class (instead of one per sample) so that every class
# gets a legend entry; the colour is looked up by label value:
# 0 -> 'Blue', 1 -> 'red', 2 -> 'blue'.
class_colors = ('Blue', 'red', 'blue')
label_values = np.ravel(y_set)
for cls in np.unique(label_values):
    mask = label_values == cls
    plt.scatter(y_train_pred[mask, 1], X_set[mask, 0],
                c=class_colors[int(cls)], label='Species = %s' % cls)

# Vertical guide at probability 0.5; `lw` is the line width set in the
# cross-validation cell.
plt.plot([0.5, 0.5], [0.0, 70.0], linestyle='--', lw=lw, color='k')

plt.xlim([-0.05, 1.05])
plt.ylim([0.0, 70.0])
plt.xlabel('Predicted value for COVID-19')
plt.ylabel('FeNO (ppb)')
plt.title('Training set')
# The legend is added before saving so that it appears in the written file.
plt.legend(loc="lower right")
plt.tight_layout(pad=0.0, w_pad=0.00, h_pad=0.0)
plt.savefig('./trainset.tif', dpi=600)
plt.show()

In [None]:
# Reload the fold-4 model and predict class probabilities for the held-out test split.
# NOTE: joblib.load unpickles the file; only load model files you trust.
clf3 = joblib.load('save6/4.pkl')
y_test_pred = clf3.predict_proba(X_test[['Age', 'BMI', 'Body Area', 'FeNO']])

X_set = X_test[['FeNO']].values
y_set = y_test.values.tolist()

# Save the predicted probabilities.
# NOTE(review): the last cell of the notebook writes to 'predict.csv' as well
# and therefore overwrites this file.
data1 = pd.DataFrame(y_test_pred)
data1.to_csv('predict.csv', header=True)

# Scatter FeNO against the predicted probability of class 1.
# One scatter call per class (instead of one per sample) so that every class
# gets a legend entry; the colour is looked up by label value:
# 0 -> 'Blue', 1 -> 'red', 2 -> 'blue'.
class_colors = ('Blue', 'red', 'blue')
label_values = np.ravel(y_set)
for cls in np.unique(label_values):
    mask = label_values == cls
    plt.scatter(y_test_pred[mask, 1], X_set[mask, 0],
                c=class_colors[int(cls)], label='Species = %s' % cls)

# Vertical guide at probability 0.5; `lw` is the line width set in the
# cross-validation cell.
plt.plot([0.5, 0.5], [0.0, 70.0], linestyle='--', lw=lw, color='k')

plt.xlim([-0.05, 1.05])
plt.ylim([0.0, 70.0])
plt.xlabel('Predicted value for COVID-19')
plt.ylabel('FeNO (ppb)')
# This cell plots the test split, so it gets its own title and output file
# instead of reusing the training-set title and overwriting './trainset.tif'.
plt.title('Test set')
# The legend is added before saving so that it appears in the written file.
plt.legend(loc="lower right")
plt.tight_layout(pad=0.0, w_pad=0.00, h_pad=0.0)
plt.savefig('./testset.tif', dpi=600)
plt.show()

In [None]:


# Evaluate the reloaded model on the test data:
# ROC curve and AUC from the class-1 probabilities, F1 from the hard predictions.
probas_ = clf3.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
roc_auc = auc(fpr, tpr)

# f1_score is called with its defaults (binary averaging, positive label 1).
y_test_pred = clf3.predict(X_test)
f1 = f1_score(y_test, y_test_pred)
print("f1 score= ",f1)

# ROC curve plus the chance diagonal.
curve_color = (6/256.0, 38/256.0, 137/256.0)
plt.plot(fpr, tpr, lw=2, color=curve_color,
         label='(AUC = %0.3f)' % ( roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--', lw=0.5, color='k')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.tight_layout(pad=0.0, w_pad=0.00, h_pad=0.0)

# Add the legend, then write the figure at 600 dpi and display it.
plt.legend(loc="lower right")
plt.savefig('./ROC_Curve.tif', dpi=600)
plt.show()

In [None]:
# Load the table stored in the Excel workbook 'test_total.xlsx'
# (path relative to the working directory); the next cell takes the
# test features and labels from it.
totaltestset= pd.read_excel('test_total.xlsx') 

In [None]:
# Replace the earlier hold-out split with the data read from the workbook:
# the same four predictor columns as features, and 'Species' kept as a
# one-column DataFrame.
X_test = totaltestset.loc[:, ['Age', 'BMI', 'Body Area', 'FeNO']]
y_test = totaltestset.loc[:, ['Species']]

In [None]:
# Class probabilities predicted by the reloaded model for the data loaded
# from 'test_total.xlsx'.
y_test_pred = clf3.predict_proba(X_test[['Age', 'BMI', 'Body Area', 'FeNO']])

X_set = X_test[['FeNO']].values
y_set = y_test.values.tolist()

# Save the predicted probabilities.
data1 = pd.DataFrame(y_test_pred)
data1.to_csv('predict.csv', header=True)

# Scatter FeNO against the predicted probability of class 1.
# One scatter call per class (instead of one per sample) so that every class
# gets a legend entry; the colour is looked up by label value:
# 0 -> 'Blue', 1 -> 'red', 2 -> 'blue'.
# np.ravel flattens y_set, which is a list of one-element lists when y_test
# is a one-column DataFrame.
class_colors = ('Blue', 'red', 'blue')
label_values = np.ravel(y_set)
for cls in np.unique(label_values):
    mask = label_values == cls
    plt.scatter(y_test_pred[mask, 1], X_set[mask, 0],
                c=class_colors[int(cls)], label='Species = %s' % cls)

# Vertical guide at probability 0.5; `lw` is the line width set in the
# cross-validation cell.
plt.plot([0.5, 0.5], [0.0, 70.0], linestyle='--', lw=lw, color='k')

plt.xlim([-0.05, 1.05])
plt.ylim([0.0, 70.0])
plt.xlabel('Predicted value for COVID-19')
plt.ylabel('FeNO (ppb)')
# This cell plots the data from 'test_total.xlsx', so it gets its own title
# and output file instead of reusing the training-set title and overwriting
# './trainset.tif'.
plt.title('Total test set')
# The legend is added before saving so that it appears in the written file.
plt.legend(loc="lower right")
plt.tight_layout(pad=0.0, w_pad=0.00, h_pad=0.0)
plt.savefig('./totaltestset.tif', dpi=600)
plt.show()