In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd 
import matplotlib.pyplot as plt
from umap import UMAP
import xgboost as xgb
import lightgbm as lgb
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA, KernelPCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [2]:
# Load the three positive-mode tables, one pickle per file below.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against pickles from a trusted source.
pos_pickle_paths = (
    '../data/HCD35_pos.pickle',
    '../data/HCD45_pos.pickle',
    '../data/HCD65_pos.pickle',
)

pos_tables = []
for pickle_path in pos_pickle_paths:
    with open(pickle_path, mode='rb') as handle:
        pos_tables.append(pickle.load(handle))

# Same names the rest of the notebook expects.
df_3, df_4, df_6 = pos_tables

In [3]:
# Report (rows, columns) of each loaded table, in the order 3, 4, 6.
for loaded_table in (df_3, df_4, df_6):
    print(loaded_table.shape)

(1676, 39604)
(1676, 45702)
(1676, 45286)


In [4]:
# Feature matrices only: every table with its 'Subclass' column removed.
# List order (df_3, df_4, df_6) is kept because later cells rely on it.
features = [
    labelled_table.drop('Subclass', axis=1)
    for labelled_table in (df_3, df_4, df_6)
]

In [5]:
# Reduce every frame in `features` with seven decomposition / embedding
# methods and lay all results side by side in `t`.
#
# Column layout of `t`: for each frame in `features` (in list order), the
# blocks tSVD, PCA, ICA, GRP, SRP, KPCA, UMAP, each `n_comp` columns wide.
# The column-naming cell that follows depends on exactly this order.

# Defined before the loop (not inside it) because the naming cell reads
# `n_comp` too; this way it exists even when `features` is empty.
n_comp = 50
seed = 420  # single random_state shared by every estimator below

reduced_blocks = []
for feature_frame in features:
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=seed)
    tsvd_results = tsvd.fit_transform(feature_frame)

    # PCA
    pca = PCA(n_components=n_comp, random_state=seed)
    pca_results = pca.fit_transform(feature_frame)

    # ICA
    ica = FastICA(n_components=n_comp, random_state=seed)
    ica_results = ica.fit_transform(feature_frame)

    # GRP
    # NOTE(review): per the scikit-learn docs `eps` is only consulted when
    # n_components='auto', so it looks inert here -- confirm before removing.
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=seed)
    grp_results = grp.fit_transform(feature_frame)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=seed)
    srp_results = srp.fit_transform(feature_frame)

    # KPCA
    kpca = KernelPCA(n_components=n_comp, random_state=seed)
    kpca_results = kpca.fit_transform(feature_frame)

    # umap
    umap = UMAP(n_components=n_comp, random_state=seed)
    umap_results = umap.fit_transform(feature_frame)

    # Keep the per-method blocks in the fixed order documented above.
    reduced_blocks.extend(
        pd.DataFrame(method_output)
        for method_output in (
            tsvd_results,
            pca_results,
            ica_results,
            grp_results,
            srp_results,
            kpca_results,
            umap_results,
        )
    )

# One concat at the end instead of re-concatenating a growing frame (seeded
# with an empty DataFrame) on every iteration. pd.concat([]) raises, so fall
# back to the empty frame the original produced for an empty `features`.
t = pd.concat(reduced_blocks, axis=1) if reduced_blocks else pd.DataFrame()

In [6]:
# Give the columns of `t` readable names and export them together with the
# 'Subclass' label to ../data/decomp_pos.csv.
f = [
    'tSVD', 'PCA', 'ICA', 'GRP',
    'SRP', 'KPCA', 'UMAP'
]

# Names are <method>_<component>_<energy tag>. The nesting order (energy tag,
# then method, then component) mirrors how the columns of `t` were assembled.
v = [
    '{}_{}_{}'.format(method, component, energy_tag)
    for energy_tag in [3, 4, 6]
    for method in f
    for component in range(n_comp)
]

t.columns = v

# `t` was built from bare arrays, so its index is positional (0..n-1), whereas
# pd.concat(axis=1) aligns rows on index *labels*. If df_3 carries any other
# index, the labels and features would be silently misaligned and padded with
# NaN. Reusing df_3's index pairs rows by position and raises on a row-count
# mismatch.
# NOTE(review): assumes df_3, df_4 and df_6 hold the same samples in the same
# row order -- confirm.
t.index = df_3.index
pd.concat([df_3['Subclass'], t], axis=1).to_csv('../data/decomp_pos.csv')

In [8]:
# Load the three negative-mode tables, one pickle per file below. This
# rebinds df_3 / df_4 / df_6, replacing the positive-mode tables.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against pickles from a trusted source.
neg_pickle_paths = (
    '../data/HCD35_neg.pickle',
    '../data/HCD45_neg.pickle',
    '../data/HCD65_neg.pickle',
)

neg_tables = []
for pickle_path in neg_pickle_paths:
    with open(pickle_path, mode='rb') as handle:
        neg_tables.append(pickle.load(handle))

# Same names the rest of the notebook expects.
df_3, df_4, df_6 = neg_tables

In [9]:
# Report (rows, columns) of each loaded table, in the order 3, 4, 6.
for loaded_table in (df_3, df_4, df_6):
    print(loaded_table.shape)

(467, 13781)
(467, 18367)
(467, 24953)


In [10]:
# Feature matrices only: every table with its 'Subclass' column removed.
# List order (df_3, df_4, df_6) is kept because later cells rely on it.
features = [
    labelled_table.drop('Subclass', axis=1)
    for labelled_table in (df_3, df_4, df_6)
]

In [11]:
# Reduce every frame in `features` with seven decomposition / embedding
# methods and lay all results side by side in `t`.
#
# Column layout of `t`: for each frame in `features` (in list order), the
# blocks tSVD, PCA, ICA, GRP, SRP, KPCA, UMAP, each `n_comp` columns wide.
# The column-naming cell that follows depends on exactly this order.

# Defined before the loop (not inside it) because the naming cell reads
# `n_comp` too; this way it exists even when `features` is empty.
n_comp = 50
seed = 420  # single random_state shared by every estimator below

reduced_blocks = []
for feature_frame in features:
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=seed)
    tsvd_results = tsvd.fit_transform(feature_frame)

    # PCA
    pca = PCA(n_components=n_comp, random_state=seed)
    pca_results = pca.fit_transform(feature_frame)

    # ICA
    ica = FastICA(n_components=n_comp, random_state=seed)
    ica_results = ica.fit_transform(feature_frame)

    # GRP
    # NOTE(review): per the scikit-learn docs `eps` is only consulted when
    # n_components='auto', so it looks inert here -- confirm before removing.
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=seed)
    grp_results = grp.fit_transform(feature_frame)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=seed)
    srp_results = srp.fit_transform(feature_frame)

    # KPCA
    kpca = KernelPCA(n_components=n_comp, random_state=seed)
    kpca_results = kpca.fit_transform(feature_frame)

    # umap
    umap = UMAP(n_components=n_comp, random_state=seed)
    umap_results = umap.fit_transform(feature_frame)

    # Keep the per-method blocks in the fixed order documented above.
    reduced_blocks.extend(
        pd.DataFrame(method_output)
        for method_output in (
            tsvd_results,
            pca_results,
            ica_results,
            grp_results,
            srp_results,
            kpca_results,
            umap_results,
        )
    )

# One concat at the end instead of re-concatenating a growing frame (seeded
# with an empty DataFrame) on every iteration. pd.concat([]) raises, so fall
# back to the empty frame the original produced for an empty `features`.
t = pd.concat(reduced_blocks, axis=1) if reduced_blocks else pd.DataFrame()

In [12]:
# Give the columns of `t` readable names and export them together with the
# 'Subclass' label to ../data/decomp_neg.csv.
f = [
    'tSVD', 'PCA', 'ICA', 'GRP',
    'SRP', 'KPCA', 'UMAP'
]

# Names are <method>_<component>_<energy tag>. The nesting order (energy tag,
# then method, then component) mirrors how the columns of `t` were assembled.
v = [
    '{}_{}_{}'.format(method, component, energy_tag)
    for energy_tag in [3, 4, 6]
    for method in f
    for component in range(n_comp)
]

t.columns = v

# `t` was built from bare arrays, so its index is positional (0..n-1), whereas
# pd.concat(axis=1) aligns rows on index *labels*. If df_3 carries any other
# index, the labels and features would be silently misaligned and padded with
# NaN. Reusing df_3's index pairs rows by position and raises on a row-count
# mismatch.
# NOTE(review): assumes df_3, df_4 and df_6 hold the same samples in the same
# row order -- confirm.
t.index = df_3.index
pd.concat([df_3['Subclass'], t], axis=1).to_csv('../data/decomp_neg.csv')