In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# Load the training cohort and encode the outcome as 1 (metastasis) / 0 (no metastasis).
TRAIN_CSV = 'D:/workspace/RStuidoSpace/OS/result/feature/feature_final_1.csv'
df = pd.read_csv(TRAIN_CSV)
df['Metastasis'] = df['Metastasis'].map({'Metastasis': 1, 'No Metastasis': 0})

# Series.map() silently turns any label outside the mapping into NaN; fail here
# with a clear message instead of deep inside a later estimator.fit() call.
if df['Metastasis'].isna().any():
    raise ValueError("Column 'Metastasis' contains missing or unexpected labels")

features = df.drop(columns=['ID', 'Metastasis'])
y = df['Metastasis']

# Split BEFORE scaling: the scaler must only see training rows, otherwise the
# mean/variance of the held-out rows leak into the model inputs. The split is
# driven by the row count and random_state only, so the partition is the same
# as when splitting the already-scaled matrix.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=49
)

scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Whole cohort on the training scale, kept under the name `x` for the cells
# that cross-validate on all samples.
x = scaler.transform(features)

# Quick look at the first scaled training rows.
x_train[:3]


In [None]:
# Randomised hyper-parameter search.
# Search spaces are declared for all four model families; only the logistic
# regression search is executed below. Swap `estimator` / `param_distributions`
# to tune another model.

# Logistic regression.
# Not every penalty works with every solver: 'l1' needs liblinear or saga, and
# 'elasticnet' needs saga plus an l1_ratio. Crossing all penalties with all
# solvers makes many sampled candidates fail to fit (scored as NaN), so the
# space is split into sub-spaces that contain valid combinations only.
lr_C_values = np.linspace(0.1, 50, 50)
param_random_lr = [
    {
        'C': lr_C_values,
        'penalty': ['l2'],
        'solver': ['liblinear', 'lbfgs', 'saga', 'newton-cg', 'sag'],
    },
    {
        'C': lr_C_values,
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
    },
    {
        'C': lr_C_values,
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],
    },
]

# Decision tree
param_random_dt = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 50, 1),
}

# SVM (RBF kernel)
param_random_svm = {
    'kernel': ['rbf'],
    'C': np.linspace(0.01, 50, 20),
    'gamma': np.linspace(0.001, 10, 20),
}

# Random forest.
# 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted by
# recent scikit-learn releases, so it is not listed.
param_random_rf = {
    'n_estimators': np.arange(1, 100, 10),
    'max_features': ['log2', 'sqrt'],
    'max_depth': np.arange(1, 30, 1),
}

log_reg = LogisticRegression()
dtree = DecisionTreeClassifier()
svm = SVC()
rfclass = RandomForestClassifier()

clf = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_random_lr,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=49,  # fixed seed so the sampled candidates are reproducible
)
# Tune on the training split only: the held-out rows must not influence the
# choice of hyper-parameters that are later evaluated on them.
clf.fit(x_train, y_train)

print('best classifier:\n', clf.best_estimator_)
print('best score:\n', clf.best_score_)
print('best parameters:\n', clf.best_params_)

In [None]:
# Exhaustive grid search.
# Grids are declared for all four model families; only the logistic regression
# grid is executed below. Swap `estimator` / `param_grid` to tune another model.

# Logistic regression
param_grid_lr = {
    'C': np.linspace(0.01, 2, 10),
    'penalty': ['l2'],
    'solver': ['liblinear'],
}

# Decision tree
param_grid_dt = {
    'criterion': ['gini'],
    'splitter': ['random'],
    'max_depth': np.arange(1, 11, 1),
}

# SVM (RBF kernel)
param_grid_svm = {
    'kernel': ['rbf'],
    'C': np.linspace(0.01, 50, 10),
    'gamma': np.linspace(0.001, 10, 10),
}

# Random forest.
# 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted by
# recent scikit-learn releases, so it is not listed.
param_grid_rf = {
    'n_estimators': np.arange(1, 100, 10),
    'max_features': ['log2', 'sqrt'],
    'max_depth': np.arange(1, 30, 1),
}

log_reg = LogisticRegression()
dtree = DecisionTreeClassifier()
svm = SVC()
rfclass = RandomForestClassifier()

clf = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_lr,
    n_jobs=-1,
    cv=5,
    scoring='accuracy',
)

# Tune on the training split only: the held-out rows must not influence the
# choice of hyper-parameters that are later evaluated on them.
clf.fit(x_train, y_train)

print('best classifier:\n', clf.best_estimator_)
print('best score:\n', clf.best_score_)
print('best parameters:\n', clf.best_params_)

In [None]:
# 5-fold cross-validated ROC AUC for the four tuned models.

# Hyper-parameters are hard-coded here. The tree-based models draw random
# numbers (random splitter / bootstrap and feature sampling), so they are
# seeded to make the reported scores repeatable across runs.
log_reg = LogisticRegression(C=0.45222222222222225, solver='liblinear')
dtree = DecisionTreeClassifier(max_depth=10, splitter='random', random_state=49)
svm = SVC(C=5.564444444444445, gamma=1.1119999999999999)
rfclass = RandomForestClassifier(
    n_estimators=1,
    max_features=19,
    criterion='entropy',
    max_depth=29,
    random_state=49,
)

cv_models = {'lr': log_reg, 'dtree': dtree, 'svm': svm, 'rf': rfclass}
cv_scores = {
    label: cross_val_score(model, x, y, cv=5, scoring="roc_auc")
    for label, model in cv_models.items()
}

# Keep the original per-model names available for any later cell.
scores1, scores2, scores3, scores4 = (cv_scores[k] for k in ('lr', 'dtree', 'svm', 'rf'))

# Report mean AUC with a +/- 2 standard deviation band over the folds.
for label, fold_scores in cv_scores.items():
    print("%s: %0.2f (+/- %0.2f)" % (label, fold_scores.mean(), fold_scores.std() * 2))

In [None]:
# Hold-out evaluation of the logistic regression model.

log_reg.fit(x_train, y_train)

y_pred = log_reg.predict(x_test)
y_probas = log_reg.predict_proba(x_test)

print('The accuracy of the Logistic Regression is:', metrics.accuracy_score(y_test, y_pred))
print('The precision of the Logistic Regression is:', metrics.precision_score(y_test, y_pred))
print('The recall of the Logistic Regression is:', metrics.recall_score(y_test, y_pred))
print('The F1-score of the Logistic Regression is:', metrics.f1_score(y_test, y_pred))
# AUC needs a continuous score. Passing the thresholded 0/1 predictions would
# collapse the ROC curve to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (label 1).
print('The AUC of the Logistic Regression is:', metrics.roc_auc_score(y_test, y_probas[:, 1]))

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are true
# labels and columns are predicted labels, matching the axis labels below.
confusion_matrix_result = metrics.confusion_matrix(y_test, y_pred)
print('The confusion matrix result:\n', confusion_matrix_result)

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()

# Ranking of importance.
# Take the feature names from the same column selection used to build the
# model inputs, so names and coefficients stay aligned whatever the number or
# position of the feature columns.
feature_names = list(df.drop(columns=['ID', 'Metastasis']).columns)
coef_LR = pd.DataFrame({
    'var': feature_names,
    'coef': log_reg.coef_.flatten(),
})

# Order by absolute coefficient so the most influential feature ends up on top
# of the horizontal bar chart.
index_sort = coef_LR['coef'].abs().sort_values().index
coef_LR_sort = coef_LR.loc[index_sort, :]

fig, ax = plt.subplots()
rects = ax.barh(coef_LR_sort['var'], coef_LR_sort['coef'], color='dodgerblue')
ax.grid(linestyle="-.", axis='y', alpha=0.4)
fig.tight_layout()

# Annotate each bar with its value; put the text outside the bar end on
# whichever side the bar points to.
for rect in rects:
    w = rect.get_width()
    ax.text(
        w,
        rect.get_y() + rect.get_height() / 2,
        '%.2f' % w,
        ha='left' if w >= 0 else 'right',
        va='center',
    )

# Imported here, next to its only use, so that a missing or incompatible
# scikit-plot install does not prevent the metrics above from being reported.
import scikitplot as skplt
skplt.metrics.plot_roc(y_test, y_probas)
plt.show()

In [None]:
# External validation on the Zhongshan cohort.

df1 = pd.read_csv('D:/workspace/RStuidoSpace/OS/result/zhongshan/feature/feature_no_filter_remove.csv')
df1['Metastasis'] = df1['Metastasis'].map({'Metastasis': 1, 'No Metastasis': 0})

x1 = df1.drop(columns=['ID', 'Metastasis'])
y1 = df1['Metastasis']

# Reuse the scaler fitted earlier in the notebook; it is deliberately NOT
# refitted on the external cohort.
x1 = scaler.transform(x1)

# All four models are fitted on the training split so that any of them can be
# selected for prediction below.
log_reg.fit(x_train, y_train)
dtree.fit(x_train, y_train)
svm.fit(x_train, y_train)
rfclass.fit(x_train, y_train)

# Pick the model to evaluate by switching which line is active.
y_prediction = log_reg.predict(x1)
# y_prediction = dtree.predict(x1)
# y_prediction = svm.predict(x1)
# y_prediction = rfclass.predict(x1)

print(y_prediction)

print('The accuracy of the LR is:', metrics.accuracy_score(y1, y_prediction))
print('The precision of the LR is:', metrics.precision_score(y1, y_prediction))
print('The recall of the LR is:', metrics.recall_score(y1, y_prediction))
print('The F1-score of the LR is:', metrics.f1_score(y1, y_prediction))

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are true
# labels and columns are predicted labels, matching the axis labels below.
confusion_matrix_result = metrics.confusion_matrix(y1, y_prediction)
print('The confusion matrix result:\n', confusion_matrix_result)

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()