In [1]:
import matplotlib.pyplot as plt
import numpy as np
import csv
from sklearn import svm, metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the whole CSV as one integer matrix.
# Column layout according to the original author's note: label, ID, race, mirnas...
# NOTE(review): with dtype=int, any field genfromtxt cannot parse as an integer
# (text cells, or the empty field after a trailing comma) should come back as
# NumPy's default integer fill value (-1) rather than nan -- confirm against the file.
myfile = np.genfromtxt(
    '/home/hju/advanceBio/largedata/data_mirna_rna_onlylabel.csv',
    delimiter=",",
    dtype=int,
)

# Row 0 is discarded (presumably the header line -- TODO confirm).
# labels: column 0 of the remaining rows.
# data:   every other column, i.e. an n_samples * n_features array.
# NOTE(review): if the column note above is accurate, `data` still carries the ID
# and race columns plus the trailing empty column -- verify they are meant to be
# used as features.
labels, data = myfile[1:, 0], myfile[1:, 1:]

In [5]:
# (report title, SVC keyword arguments) for each kernel that is evaluated, in print order.
SVC_VARIANTS = (
    ('SVC rbf', {'kernel': 'rbf'}),
    ('SVC linear', {'kernel': 'linear'}),
    ('SVC poly 3', {'kernel': 'poly', 'degree': 3}),
)


def fit_and_report(title, model, split):
    """Fit ``model`` on the training part of ``split`` and print its test score.

    Parameters
    ----------
    title : str
        Text shown between the ``===`` markers of the report header.
    model : estimator
        Unfitted scikit-learn classifier; it is fitted in place.
    split : sequence
        ``(data_train, data_test, label_train, label_test)``, in the order
        returned by ``train_test_split``.

    Returns
    -------
    float
        ``model.score(data_test, label_test)``, i.e. accuracy on the test part.
    """
    data_train, data_test, label_train, label_test = split
    model.fit(data_train, label_train)
    score = model.score(data_test, label_test)

    print(f'=== {title} ===')
    print(f'score: {score}')
    print()
    return score


# Repeat the whole comparison on 5 different random 75/25 splits; the loop index
# doubles as the random seed so every repetition can be reproduced on its own.
for i in range(5):
    print("********************** %d **********************" % i)
    split = train_test_split(data, labels, test_size=0.25, random_state=i, shuffle=True)
    data_train, data_test, label_train, label_test = split

    # ---- split summary: sizes and share of "dead" samples (label == 1) ----
    n_train = label_train.shape[0]
    n_test = label_test.shape[0]
    n_total = n_train + n_test
    # Vectorised count instead of a Python loop over every label.
    count_dead_train = int(np.count_nonzero(label_train == 1))
    count_dead_test = int(np.count_nonzero(label_test == 1))

    print("total sample: %d" % n_total)
    print("training dataset: %.2f, testing dataset: %.2f" % (n_train / n_total, n_test / n_total))
    print("dead in total dataset: %.2f" % ((count_dead_train + count_dead_test) / n_total))
    print("dead in training dataset: %.2f, dead in testing dataset: %.2f"
          % (count_dead_train / n_train, count_dead_test / n_test))
    print()

    # NOTE(review): the scores below are plain accuracy. When the "dead" share printed
    # above is small, always predicting the majority class already scores close to
    # (1 - dead share); consider a class-balanced metric before comparing models.

    # ---- Logistic Regression ----
    logr = LogisticRegression(solver='lbfgs', n_jobs=2)
    logr_score = fit_and_report('Logistic Regression', logr, split)

    # ---- SVM, one model per kernel ----
    for svc_title, svc_kwargs in SVC_VARIANTS:
        svcc = svm.SVC(**svc_kwargs)
        svcc_score = fit_and_report(svc_title, svcc, split)

    # ---- Random Forest ----
    # Seeded with the same value as the split: an unseeded forest gives a different
    # score on every run even though the split itself is fixed.
    rf = RandomForestClassifier(n_estimators=10, random_state=i)
    rf_score = fit_and_report('Random Forest', rf, split)



********************** 0 **********************
total sample: 942
training dataset: 0.75, testing dataset: 0.25
dead in total dataset: 0.14
dead in training dataset: 0.15, dead in testing dataset: 0.12

=== Logistic Regression ===
score: 0.8177966101694916





=== SVC rbf ===
score: 0.8771186440677966

=== SVC linear ===
score: 0.788135593220339





=== SVC poly 3 ===
score: 0.7923728813559322

=== Random Forest ===
score: 0.8771186440677966

********************** 1 **********************
total sample: 942
training dataset: 0.75, testing dataset: 0.25
dead in total dataset: 0.14
dead in training dataset: 0.15, dead in testing dataset: 0.14

=== Logistic Regression ===
score: 0.809322033898305





=== SVC rbf ===
score: 0.8601694915254238

=== SVC linear ===
score: 0.788135593220339





=== SVC poly 3 ===
score: 0.7923728813559322

=== Random Forest ===
score: 0.8686440677966102

********************** 2 **********************
total sample: 942
training dataset: 0.75, testing dataset: 0.25
dead in total dataset: 0.14
dead in training dataset: 0.14, dead in testing dataset: 0.17

=== Logistic Regression ===
score: 0.7838983050847458





=== SVC rbf ===
score: 0.8347457627118644

=== SVC linear ===
score: 0.7838983050847458





=== SVC poly 3 ===
score: 0.809322033898305

=== Random Forest ===
score: 0.8305084745762712

********************** 3 **********************
total sample: 942
training dataset: 0.75, testing dataset: 0.25
dead in total dataset: 0.14
dead in training dataset: 0.14, dead in testing dataset: 0.15

=== Logistic Regression ===
score: 0.7796610169491526





=== SVC rbf ===
score: 0.8516949152542372

=== SVC linear ===
score: 0.7796610169491526





=== SVC poly 3 ===
score: 0.7627118644067796

=== Random Forest ===
score: 0.8347457627118644

********************** 4 **********************
total sample: 942
training dataset: 0.75, testing dataset: 0.25
dead in total dataset: 0.14
dead in training dataset: 0.15, dead in testing dataset: 0.14

=== Logistic Regression ===
score: 0.7966101694915254





=== SVC rbf ===
score: 0.864406779661017

=== SVC linear ===
score: 0.8008474576271186





=== SVC poly 3 ===
score: 0.8220338983050848

=== Random Forest ===
score: 0.8686440677966102

