In [136]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.utils import resample
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_curve

In [137]:
# Read the training and calibration descriptor tables from disk.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('csv_result-Descriptors_Training.csv',
                     'csv_result-Descriptors_Calibration.csv')
)
# Report (rows, columns) of the training frame, then of the calibration frame.
for frame in (df1, df2):
    print(frame.shape)
# Per-column flag: True when that column of the training frame contains a null.
df1.isnull().any()

(19988, 30)
(4996, 30)


id                    False
IP_ES_25_N1           False
Z3_IB_4_N1            False
Z1_IB_10_N1           False
Z1_IB_5_N1            False
Z3_IB_8_N1            False
ECI_IB_4_N1           False
ECI_IB_5_N1           False
Gs(U)_IB_12_N1        False
Gs(U)_IB_68_N1        False
Gs(U)_IB_58_N1        False
Gs(U)_IB_60_N1        False
Z1_NO_sideL35_M       False
HP_NO_sideL35_CV      False
Z1_NO_sideR35_CV      False
Pb_NO_sideR35_S       False
IP_NO_sideL35_SI71    False
Z1_NO_PRT_CV          False
Z2_NO_AHR_CV          False
Gs(U)_NO_ALR_SI71     False
Z3_NO_UCR_S           False
Z3_NO_UCR_N1          False
ECI_NO_UCR_CV         False
Pa_NO_BSR_SI71        False
ISA_NO_NPR_S          False
Z3_NO_NPR_V           False
IP_NO_PLR_S           False
Pb_NO_PCR_V           False
ECI_NO_PCR_CV         False
class                 False
dtype: bool

In [138]:
# Per-column null check on the training frame (True would mean the column has at least one null).
df1.isnull().any()

id                    False
IP_ES_25_N1           False
Z3_IB_4_N1            False
Z1_IB_10_N1           False
Z1_IB_5_N1            False
Z3_IB_8_N1            False
ECI_IB_4_N1           False
ECI_IB_5_N1           False
Gs(U)_IB_12_N1        False
Gs(U)_IB_68_N1        False
Gs(U)_IB_58_N1        False
Gs(U)_IB_60_N1        False
Z1_NO_sideL35_M       False
HP_NO_sideL35_CV      False
Z1_NO_sideR35_CV      False
Pb_NO_sideR35_S       False
IP_NO_sideL35_SI71    False
Z1_NO_PRT_CV          False
Z2_NO_AHR_CV          False
Gs(U)_NO_ALR_SI71     False
Z3_NO_UCR_S           False
Z3_NO_UCR_N1          False
ECI_NO_UCR_CV         False
Pa_NO_BSR_SI71        False
ISA_NO_NPR_S          False
Z3_NO_NPR_V           False
IP_NO_PLR_S           False
Pb_NO_PCR_V           False
ECI_NO_PCR_CV         False
class                 False
dtype: bool

In [139]:
def data_cleaning(df):
    """Remove outlier rows from `df` using the z-score method.

    Every column except the first and the last (the id and class columns in
    this notebook) is standardised with its own mean and sample standard
    deviation. A row is discarded when the absolute z-score of at least one
    of those columns exceeds 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is an identifier, whose last column is the
        label, and whose remaining columns are numeric features.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` without the outlier rows; the surviving rows keep
        their original index labels. `df` itself is not modified.
    """
    feature_cols = list(df)[1:-1]
    features = df[feature_cols]
    # Column-wise z-scores; a constant column yields NaN, which never exceeds 3.
    zscores = (features - features.mean()) / features.std()
    # Select rows with a boolean mask instead of translating `id` values into
    # positions, so the result does not depend on ids running 1..n in row order.
    is_outlier = (zscores.abs() > 3).any(axis=1)
    return df.loc[~is_outlier].copy()
# Apply the z-score outlier filter to the training frame and show the shape that is left.
df_clean = data_cleaning(df1)
df_clean.shape

(17107, 30)

In [140]:
len(df1) - len(df_clean)  # number of rows discarded as bad data

2881

In [141]:
int((df_clean['class'] == 'P').sum())  # number of positive rows

905

In [142]:
int((df_clean['class'] == 'N').sum())  # number of negative rows

16202

In [143]:
# The cleaned data set is heavily imbalanced (far fewer positive than negative rows),
# so the positive class is upsampled with replacement until both classes have the same size.
def upsample(df=None, random_state=123):
    """Balance the classes by resampling the positive ('P') rows with replacement.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with a 'class' column. Defaults to the notebook-level `df_clean`.
    random_state : int, optional
        Seed passed to `resample` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        The resampled positive rows followed by all remaining rows; the number
        of positive rows equals the number of remaining rows.

    Raises
    ------
    ValueError
        If `df` contains no positive rows to sample from.
    """
    if df is None:
        df = df_clean
    # Sorting descending by label puts the 'P' rows ahead of the 'N' rows.
    df_sort = df.sort_values(by='class', ascending=False)
    # Split on the label itself rather than on hard-coded row counts, so the
    # function keeps working when the cleaning step keeps a different number of rows.
    is_positive = df_sort['class'] == 'P'
    X_minor = df_sort[is_positive]
    X_major = df_sort[~is_positive]
    if X_minor.empty:
        raise ValueError("no rows with class 'P' to upsample")
    df_minority_upsampled = resample(X_minor,
                                     replace=True,  # sample with replacement
                                     n_samples=len(X_major),  # match the other class
                                     random_state=random_state)  # reproducible results
    df_upsampled = pd.concat([df_minority_upsampled, X_major])
    return df_upsampled
# Build the class-balanced training frame that the following cells split into features and labels.
df_upsampled = upsample()

In [144]:
int((df_upsampled['class'] == 'P').sum())  # number of positive rows after upsampling

16202

In [145]:
# Column 0 is the id, columns 1..28 are the descriptors and column 29 is the class label.
feature_slice, label_position = slice(1, 29), 29
X_train, y_train = df_upsampled.iloc[:, feature_slice], df_upsampled.iloc[:, label_position]
X_test, y_test = df2.iloc[:, feature_slice], df2.iloc[:, label_position]
X_train.shape

(32404, 28)

In [148]:
# A naive Bayes classifier treats the features as independent of each other, so
# strongly correlated (redundant) features should be removed before training.
# Two features are considered highly correlated when their Pearson coefficient is at least 0.3.
def redundancy(features=None, threshold=0.3, use_abs=False):
    """Print the index pairs ``i j`` (i < j) of features whose correlation reaches `threshold`.

    Parameters
    ----------
    features : pandas.DataFrame or 2-D array-like, optional
        Feature table, one column per feature. Defaults to the notebook-level `X_train`.
    threshold : float, optional
        Smallest Pearson coefficient that is reported.
    use_abs : bool, optional
        When False (the default, matching the original behaviour) the signed
        coefficient is compared, so strongly *negative* correlations are not
        reported. Pass True to compare the absolute value instead.

    Returns
    -------
    None
        The pairs are only printed, one per line.
    """
    if features is None:
        features = X_train
    corr = np.asarray(pd.DataFrame(features).corr())
    # Take the feature count from the matrix itself so the function still works
    # after columns have been dropped from the feature table.
    n_features = corr.shape[0]
    for i in range(n_features):
        for j in range(i + 1, n_features):
            value = abs(corr[j, i]) if use_abs else corr[j, i]
            if value >= threshold:
                print(i, j)
# Print the index pairs of training features whose Pearson coefficient is at least 0.3.
redundancy()

5 6
8 23
9 10
9 23
10 23
19 20
19 21
20 21


In [149]:
# Drop enough features that none of the highly correlated pairs listed above remains.
# NOTE: X_train / X_test are reassigned from themselves, so running this cell a
# second time would drop five more columns.
del_col = [5, 9, 19, 21, 23]
X_train = X_train.drop(columns=X_train.columns[del_col])
X_test = X_test.drop(columns=X_test.columns[del_col])
print(X_train.shape, X_test.shape)

(32404, 23) (4996, 23)


In [150]:
# Rank the remaining features by (relevance - redundancy):
#   relevance  = |Pearson correlation| between a feature and the class label,
#   redundancy = mean |Pearson correlation| between a feature and every feature.
# This first part converts the data to NumPy arrays and encodes the labels
# ('P' -> 1, anything else -> 0).
X_train1 = np.asarray(X_train)
X_test1 = np.asarray(X_test)
# np.where builds fresh integer arrays. The previous element-by-element loop wrote
# into the arrays returned by np.asarray(Series), which may share memory with
# y_train / y_test (and are read-only under pandas Copy-on-Write); re-running the
# cell could then turn every label into 0.
y_train1 = np.where(np.asarray(y_train) == 'P', 1, 0)
y_test1 = np.where(np.asarray(y_test) == 'P', 1, 0)
def sort_list(features=None, labels=None):
    """Rank feature columns by (relevance - redundancy), best first.

    Relevance of a feature is the absolute Pearson correlation between the
    feature and the labels. Redundancy is the mean absolute Pearson
    correlation between the feature and every feature column (itself included).

    Parameters
    ----------
    features : pandas.DataFrame or 2-D array-like, optional
        Feature table, one column per feature. Defaults to the notebook-level `X_train`.
    labels : 1-D array-like of numbers, optional
        Numeric class labels, one per row. Defaults to the notebook-level `y_train1`.

    Returns
    -------
    numpy.ndarray
        Column positions ordered from the highest to the lowest score.
    """
    if features is None:
        features = X_train
    if labels is None:
        labels = y_train1
    frame = pd.DataFrame(features)
    values = np.asarray(frame)
    # Cast explicitly so that an object-dtype label vector holding 0/1 is accepted too.
    target = np.asarray(labels, dtype=float)
    # Derive the number of features from the data instead of hard-coding it.
    n_features = values.shape[1]

    relevance = np.zeros(n_features)
    for j in range(n_features):
        relevance[j], _ = stats.pearsonr(values[:, j], target)
    relevance = np.absolute(relevance)

    corr = np.asarray(frame.corr())
    redundancy_coef = np.zeros(n_features)
    for j in range(n_features):
        redundancy_coef[j] = np.mean(np.absolute(corr[:, j]))

    # argsort of the negated score gives descending order.
    return np.argsort(-(relevance - redundancy_coef))
# NOTE: this rebinds the name `sort_list` from the function to the index array it
# returns, so the function cannot be called again until its `def` is re-run.
sort_list = sort_list()
sort_list

array([20, 12, 21,  7, 17, 11, 14, 15,  9,  2, 18,  8, 10,  5,  0,  4,  3,
        6,  1, 19, 22, 13, 16], dtype=int64)

In [160]:
# Cast both encoded label vectors to an integer dtype, then show the test labels.
y_train1, y_test1 = (np.asarray(labels, dtype='int') for labels in (y_train1, y_test1))
y_test1

array([1, 1, 1, ..., 0, 0, 0])

In [177]:
# Train a Gaussian naive Bayes classifier on the k best-ranked features for every k
# and print the best score obtained on the test (calibration) data.
new_feature_train = X_train1[:, sort_list]  # columns reordered by rank, best first
new_feature_test = X_test1[:, sort_list]
clf = GaussianNB()
marks = []
# Take the feature count from the data rather than hard-coding 23: a slice such as
# [:, :i+1] is silently clipped at the last column, so a stale constant would
# neither fail nor test the intended feature sets.
for i in range(new_feature_train.shape[1]):
    X_train_new1 = new_feature_train[:, :i + 1]  # first i+1 ranked features
    X_test_new1 = new_feature_test[:, :i + 1]
    clf.fit(X_train_new1, y_train1)
    scores = clf.score(X_test_new1, y_test1)
    marks.append(scores)
print(max(marks))

0.5578462770216173


In [183]:
# The accuracy of the plain classifier was very low, so the classifier is boosted
# with AdaBoost (an ensemble learning method) and the sweep over feature counts is repeated.
# NOTE(review): the feature count is picked by the score on the same data that is
# reported, so the printed accuracy is an optimistic estimate.
mark_dict = {}
# Take the feature count from the data rather than hard-coding 23 (slices past the
# last column are silently clipped, which would hide a stale constant).
for i in range(new_feature_train.shape[1]):
    X_train_new1 = new_feature_train[:, :i + 1]  # first i+1 ranked features
    X_test_new1 = new_feature_test[:, :i + 1]
    bdt = AdaBoostClassifier(clf, n_estimators=50)
    bdt.fit(X_train_new1, y_train1)
    scores1 = bdt.score(X_test_new1, y_test1)
    mark_dict[i + 1] = scores1  # key = number of features used
# Sort (feature count, score) pairs by score, best first.
a = sorted(mark_dict.items(), key=lambda x: x[1], reverse=True)
print("When there are {0} features, the accuracy can reach to {1}".format(a[0][0], a[0][1]))

When there are 21 features, the accuracy can reach to 0.9423538831064852
