# Positive

In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd 
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
import pickle
import time

from sklearn.decomposition import NMF
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA, KernelPCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [2]:
# Read the three '_pos' pickles (HCD35, HCD45, HCD65) into df_3, df_4 and df_6.
# NOTE(review): pickle.load can run arbitrary code while deserializing;
# only point these paths at files from a trusted source.
pos_tables = []
for path in [
    '../data/HCD35_pos.pickle',
    '../data/HCD45_pos.pickle',
    '../data/HCD65_pos.pickle',
]:
    with open(path, mode='rb') as fp:
        pos_tables.append(pickle.load(fp))

df_3, df_4, df_6 = pos_tables

In [3]:
# Print the (rows, columns) shape of each loaded table, one per line.
for frame in (df_3, df_4, df_6):
    print(frame.shape)

(1676, 39604)
(1676, 45702)
(1676, 45286)


In [4]:
# Collect the three tables, each without its 'Subclass' column, in a list
# (order: df_3, df_4, df_6).
features = [frame.drop('Subclass', axis=1) for frame in (df_3, df_4, df_6)]

In [5]:
# Reduce every table in `features` with nine decomposition methods and
# append all reduced columns, side by side, to `t`.
# `n_comp` stays a module-level name because the column-naming cell reads it.
n_comp = 5

# (label, factory) pairs, listed in the order their outputs are appended to `t`.
# Factories are used so that a fresh estimator is built inside the timed
# section for every table.
reducers = [
    ('tSVD', lambda: TruncatedSVD(n_components=n_comp, random_state=420)),
    ('PCA', lambda: PCA(n_components=n_comp, random_state=420)),
    ('ICA', lambda: FastICA(n_components=n_comp, random_state=420)),
    ('GRP', lambda: GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)),
    ('SRP', lambda: SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)),
    ('KPCA', lambda: KernelPCA(n_components=n_comp, random_state=420)),
    # t-SNE is fixed at 3 components: with the barnes_hut algorithm,
    # n_components must be below 4 (otherwise a ValueError is raised).
    ('TSNE', lambda: TSNE(n_components=3, random_state=420)),
    ('NMF', lambda: NMF(n_components=n_comp, random_state=420)),
    ('FAG', lambda: FeatureAgglomeration(n_clusters=n_comp)),
]

t = pd.DataFrame()
for table in features:
    # Start from what has been accumulated so far, then add one frame per method.
    pieces = [t]
    for label, make_reducer in reducers:
        start = time.time()
        reduced = make_reducer().fit_transform(table)
        elapsed_time = time.time() - start
        print(label + ': ' + str(elapsed_time))
        pieces.append(pd.DataFrame(reduced))

    # merge each data
    t = pd.concat(pieces, axis=1)

tSVD: 3.3301501274108887
PCA: 5.494146823883057
ICA: 10.612626552581787
GRP: 0.2862422466278076
SRP: 0.07525801658630371
KPCA: 2.7598488330841064
TSNE: 251.5755798816681
NMF: 24.193987607955933
FAG: 1030.0934963226318
tSVD: 4.220425128936768
PCA: 4.974258184432983
ICA: 10.118225574493408
GRP: 0.35569000244140625
SRP: 0.15075087547302246
KPCA: 1.7942743301391602
TSNE: 296.9628539085388
NMF: 11.03151822090149
FAG: 1264.1765534877777
tSVD: 4.292203187942505
PCA: 5.890632629394531
ICA: 12.176451921463013
GRP: 0.3742859363555908
SRP: 0.09471917152404785
KPCA: 2.730546236038208
TSNE: 268.9573588371277
NMF: 21.480488777160645
FAG: 1333.0106706619263


In [6]:
# Give the columns of `t` readable names of the form <method>_<component>_<table>.
#
# The names must follow the exact order in which the decomposition loop
# concatenated its outputs: for each table (3, 4, 6) in turn the blocks are
# tSVD, PCA, ICA, GRP, SRP, KPCA, TSNE, NMF, FAG.
# Listing all tSVD..KPCA names for every table first, then all TSNE names,
# then all NMF/FAG names has the right length but attaches most labels to
# the wrong columns, so the names are generated table by table instead.

# (method label, number of columns that method contributed), in concat order.
# TSNE was run with 3 components; every other method produced n_comp columns.
method_widths = [
    ('tSVD', n_comp),
    ('PCA', n_comp),
    ('ICA', n_comp),
    ('GRP', n_comp),
    ('SRP', n_comp),
    ('KPCA', n_comp),
    ('TSNE', 3),
    ('NMF', n_comp),
    ('FAG', n_comp),
]

column_names = [
    method + '_' + str(component) + '_' + str(table_tag)
    for table_tag in [3, 4, 6]
    for method, width in method_widths
    for component in range(width)
]

# Fail with a clear message if `t` does not have the expected layout.
assert len(column_names) == t.shape[1], (
    'expected %d decomposition columns, found %d' % (len(column_names), t.shape[1])
)
t.columns = column_names

# to csv
# NOTE(review): concat aligns on the index; `t` has a default 0..n-1 index,
# so this assumes df_3 does as well -- confirm.
f = pd.concat([df_3.Subclass, t], axis=1)
f.to_csv('../data/decomp_pos.csv')

In [7]:
# f=f.drop(['UMAP_0_3', 'UMAP_0_4', 'UMAP_0_6'], axis=1)

In [8]:
# Turn the 'Subclass' labels into integer codes (`objective`) and keep the
# remaining columns of `f` as the feature matrix (`features`).
le = preprocessing.LabelEncoder()
objective = le.fit_transform(f['Subclass'])

features = f.drop(columns='Subclass')

In [9]:
# Hold out 20% of the rows for testing, with a fixed seed.
# np.random.seed() returns None, so assigning its result to `random_state`
# stored None; keep the seed value itself and hand it to the splitter so the
# split does not depend on the state of NumPy's global generator.
random_state = 42
# The global generator is still seeded for any later code that relies on it.
np.random.seed(random_state)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    objective,
    test_size=0.2,
    random_state=random_state,
)

In [10]:
# Fit a random forest (`rf` is the RandomForestClassifier alias) on the
# training split and display its accuracy on the held-out split.
# The forest is seeded explicitly so that re-running this cell on its own
# gives the same score instead of depending on the global RNG state.
clf = rf(random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6101190476190477

In [11]:
# Accuracy obtained for each n_comp setting that was tried.
# NOTE(review): presumably copied by hand from separate runs of the cells
# above with different n_comp values -- confirm.
accuracy_by_n_comp = [
    (1, 0.7023809523809523),
    (3, 0.7083333333333334),
    (5, 0.7232142857142857),
    (10, 0.7113095238095238),
    (20, 0.7113095238095238),
    (50, 0.6904761904761905),
    (100, 0.6934523809523809),
    (500, 0.6488095238095238),
    (1000, 0.6101190476190477),
]

# Column-oriented layout so the dict can be turned into a DataFrame directly.
g = {
    'n_comp': [setting for setting, _ in accuracy_by_n_comp],
    'Accuracy': [score for _, score in accuracy_by_n_comp],
}

In [12]:
# Display the n_comp / Accuracy pairs from `g` as a table.
pd.DataFrame(g)

Unnamed: 0,n_comp,Accuracy
0,1,0.702381
1,3,0.708333
2,5,0.723214
3,10,0.71131
4,20,0.71131
5,50,0.690476
6,100,0.693452
7,500,0.64881
8,1000,0.610119


# Negative

In [13]:
# Read the three '_neg' pickles (HCD35, HCD45, HCD65) into df_3, df_4 and df_6,
# replacing the tables loaded earlier under the same names.
# NOTE(review): pickle.load can run arbitrary code while deserializing;
# only point these paths at files from a trusted source.
neg_tables = []
for path in [
    '../data/HCD35_neg.pickle',
    '../data/HCD45_neg.pickle',
    '../data/HCD65_neg.pickle',
]:
    with open(path, mode='rb') as fp:
        neg_tables.append(pickle.load(fp))

df_3, df_4, df_6 = neg_tables

In [14]:
# Print the (rows, columns) shape of each loaded table, one per line.
for frame in (df_3, df_4, df_6):
    print(frame.shape)

(467, 13781)
(467, 18367)
(467, 24953)


In [15]:
# Collect the three tables, each without its 'Subclass' column, in a list
# (order: df_3, df_4, df_6).
features = [frame.drop('Subclass', axis=1) for frame in (df_3, df_4, df_6)]

In [16]:
# Reduce every table in `features` with nine decomposition methods and
# append all reduced columns, side by side, to `t`.
# `n_comp` stays a module-level name because the column-naming cell reads it.
n_comp = 5

# (label, factory) pairs, listed in the order their outputs are appended to `t`.
# Factories are used so that a fresh estimator is built inside the timed
# section for every table.
reducers = [
    ('tSVD', lambda: TruncatedSVD(n_components=n_comp, random_state=420)),
    ('PCA', lambda: PCA(n_components=n_comp, random_state=420)),
    ('ICA', lambda: FastICA(n_components=n_comp, random_state=420)),
    ('GRP', lambda: GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)),
    ('SRP', lambda: SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)),
    ('KPCA', lambda: KernelPCA(n_components=n_comp, random_state=420)),
    # t-SNE is fixed at 3 components: with the barnes_hut algorithm,
    # n_components must be below 4 (otherwise a ValueError is raised).
    ('TSNE', lambda: TSNE(n_components=3, random_state=420)),
    ('NMF', lambda: NMF(n_components=n_comp, random_state=420)),
    ('FAG', lambda: FeatureAgglomeration(n_clusters=n_comp)),
]

t = pd.DataFrame()
for table in features:
    # Start from what has been accumulated so far, then add one frame per method.
    pieces = [t]
    for label, make_reducer in reducers:
        start = time.time()
        reduced = make_reducer().fit_transform(table)
        elapsed_time = time.time() - start
        print(label + ': ' + str(elapsed_time))
        pieces.append(pd.DataFrame(reduced))

    # merge each data
    t = pd.concat(pieces, axis=1)

tSVD: 1.3017432689666748
PCA: 0.552377462387085
ICA: 0.8628473281860352
GRP: 0.04474139213562012
SRP: 0.09578347206115723
KPCA: 0.2928133010864258
TSNE: 28.278196811676025
NMF: 7.668501138687134
FAG: 34.757214307785034
tSVD: 0.4559614658355713
PCA: 0.6200871467590332
ICA: 0.5578718185424805
GRP: 0.0397038459777832
SRP: 0.013477563858032227
KPCA: 0.1979372501373291
TSNE: 27.236212015151978
NMF: 1.180950403213501
FAG: 65.04293203353882
tSVD: 0.7128145694732666
PCA: 0.858482837677002
ICA: 0.9825751781463623
GRP: 0.05462765693664551
SRP: 0.0179288387298584
KPCA: 0.435483455657959
TSNE: 28.535764932632446
NMF: 3.4790165424346924
FAG: 122.2194652557373


In [17]:
# Give the columns of `t` readable names of the form <method>_<component>_<table>.
#
# The names must follow the exact order in which the decomposition loop
# concatenated its outputs: for each table (3, 4, 6) in turn the blocks are
# tSVD, PCA, ICA, GRP, SRP, KPCA, TSNE, NMF, FAG.
# Listing all tSVD..KPCA names for every table first, then all TSNE names,
# then all NMF/FAG names has the right length but attaches most labels to
# the wrong columns, so the names are generated table by table instead.

# (method label, number of columns that method contributed), in concat order.
# TSNE was run with 3 components; every other method produced n_comp columns.
method_widths = [
    ('tSVD', n_comp),
    ('PCA', n_comp),
    ('ICA', n_comp),
    ('GRP', n_comp),
    ('SRP', n_comp),
    ('KPCA', n_comp),
    ('TSNE', 3),
    ('NMF', n_comp),
    ('FAG', n_comp),
]

column_names = [
    method + '_' + str(component) + '_' + str(table_tag)
    for table_tag in [3, 4, 6]
    for method, width in method_widths
    for component in range(width)
]

# Fail with a clear message if `t` does not have the expected layout.
assert len(column_names) == t.shape[1], (
    'expected %d decomposition columns, found %d' % (len(column_names), t.shape[1])
)
t.columns = column_names

# to csv
# NOTE(review): concat aligns on the index; `t` has a default 0..n-1 index,
# so this assumes df_3 does as well -- confirm.
f = pd.concat([df_3.Subclass, t], axis=1)
f.to_csv('../data/decomp_neg.csv')