In [None]:
import numpy as np
import sys
import os
import pandas as pd

from pathlib import Path

import tensorflow as tf
from tensorflow.keras import layers, optimizers, utils, regularizers, backend as K

In [None]:
# Every file this notebook touches lives under a single working directory.
working_dir = Path('/d/data/neis/neis_nn')

# data_path / meta_path are the two tables handed to importData();
# output_path is not used by any of the cells visible here.
data_path = working_dir.joinpath('neis_nn.tsv')
meta_path = working_dir.joinpath('meta.tsv')
output_path = working_dir.joinpath('meta_fixed.tsv')


In [None]:
# Load both tab-separated tables, using the first column as the row index.
# (read_table is read_csv with a tab as its default separator.)
meta_df = pd.read_table(meta_path, index_col=0)
data_df = pd.read_table(data_path, index_col=0)

In [None]:
# Reindex the metadata onto the rows of the data table (left join on the
# row index), so both frames end up with the data table's index.
aligned_data, aligned_meta = data_df.align(
    meta_df,
    axis=0,
    join='left',
)

In [None]:
def importData(paintings_path, meta_path,
               columns_to_use=('Year', 'Site', 'Province'),
               label_column='DGI.flag'):
    '''
    Import the chromosome painting table and its sample metadata.

    Both files are read as tab-separated tables whose first row is the
    header and whose first column is the row index. The two tables are
    then restricted to the rows they have in common (inner join on the
    index), so all three returned frames share one index.

    Parameters
    ----------
    paintings_path : path-like
        Tab-separated chromosome painting table.
    meta_path : path-like
        Tab-separated metadata table.
    columns_to_use : sequence of str, optional
        Metadata columns returned in ``meta``.
    label_column : str, optional
        Metadata column returned in ``labels``.

    Returns
    -------
    df : pandas.DataFrame
        Painting rows that also appear in the metadata.
    labels : pandas.DataFrame
        Single-column frame holding ``label_column``.
    meta : pandas.DataFrame
        Frame holding ``columns_to_use``, in that order.

    Raises
    ------
    KeyError
        If the metadata table lacks ``label_column`` or any of
        ``columns_to_use``.
    '''
    columns_to_use = list(columns_to_use)

    # for the paintings
    df = pd.read_csv(paintings_path, sep='\t', header=0, index_col=0)

    # for the meta; keep_default_na=False keeps literal strings such as
    # "NA" as text instead of converting them to NaN
    meta_df = pd.read_csv(meta_path, sep='\t', header=0, index_col=0, keep_default_na=False)

    # Fail early, naming every absent column at once.
    required = [label_column] + columns_to_use
    missing = [name for name in required if name not in meta_df.columns]
    if missing:
        raise KeyError('metadata table is missing column(s): {0}'.format(missing))

    df, meta_df = df.align(meta_df, axis=0, join='inner')

    # .copy() hands back independent frames, as the original constructors did.
    labels = meta_df[[label_column]].copy()
    meta = meta_df[columns_to_use].copy()

    return df, labels, meta

# Load the index-aligned paintings, labels and selected metadata columns.
# Note: this rebinds `meta_df`, replacing the frame read in the earlier cell.
df, label_df, meta_df = importData(
    paintings_path=data_path,
    meta_path=meta_path,
)

In [None]:
def balanceClasses(data, meta, seed=None):
    '''
    Rebalance a binary dataset by oversampling the minority class.

    The minority class is repeated until it has exactly as many rows as
    the majority class: every minority row is used
    ``len(majority) // len(minority)`` times, and the remaining gap is
    filled with minority rows drawn at random without replacement. The
    combined rows are then shuffled.

    If one of the two classes has no rows there is nothing to oversample
    from, so the input rows are returned shuffled but otherwise unchanged.
    Rows whose label is neither 0 nor 1 are dropped, as before.

    Parameters
    ----------
    data : numpy.ndarray
        Samples, indexed along the first axis.
    meta : numpy.ndarray
        Labels for ``data``; assumed to contain only 0 and 1.
    seed : optional
        Passed to ``numpy.random.default_rng``. Give an integer for a
        reproducible result; the default ``None`` is nondeterministic.

    Returns
    -------
    new_data, new_meta
        ``data`` and ``meta`` indexed by the same shuffled row selection.
    '''
    neg = np.asarray(meta == 0).nonzero()[0]
    pos = np.asarray(meta == 1).nonzero()[0]

    if neg.shape[0] < pos.shape[0]:
        shorter, longer = neg, pos
    else:
        shorter, longer = pos, neg

    rng = np.random.default_rng(seed)

    if shorter.shape[0] == 0:
        # One class is absent: avoid dividing by zero and keep what exists.
        total = longer.copy()
    else:
        # fold whole copies of the minority plus a partial copy reach
        # exactly len(longer) rows; already-balanced input is left as is.
        fold, remainder = divmod(longer.shape[0], shorter.shape[0])
        parts = [longer, np.tile(shorter, fold)]
        if remainder:
            parts.append(rng.choice(shorter, size=remainder, replace=False))
        total = np.concatenate(parts)

    rng.shuffle(total)

    new_data = data[total]
    new_meta = meta[total]

    return new_data, new_meta

In [262]:
from sklearn.preprocessing import MinMaxScaler, normalize, OneHotEncoder as ohe, OrdinalEncoder as oe, LabelEncoder as le
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from functools import reduce


# Raw numpy values of the label and metadata frames.
labels = label_df.values
metadata = meta_df.values

# Encode the string labels as integers. The position of a class in the
# printed classes_ array is the integer code it was given.
label_encoder = le()
labels = label_encoder.fit_transform(labels.reshape((-1,)))
print('Label classes: ' + str(label_encoder.classes_))

# Disabled experiment: ordinal-encode the metadata and append it as features.
# meta_encoder = oe()
# meta_enc = meta_encoder.fit_transform(metadata)

# for i, x in enumerate(meta_df.columns):
#     df[x] = meta_enc[:,i]

# One-hot encode every painting column into a dense float matrix.
data = df.values.astype(np.float64)
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
    data_encoder = ohe(sparse_output=False, categories='auto')
except TypeError:
    # Older scikit-learn only knows the original `sparse` keyword.
    data_encoder = ohe(sparse=False, categories='auto')
data = data_encoder.fit_transform(data)

# data, labels = balanceClasses(data, labels)

# model = LinearSVC(C=5e-1, penalty="l1", loss = 'squared_hinge', dual=False, tol=1e-5, max_iter=1e6).fit(data, labels)
# coefs = np.abs(model.coef_).reshape((-1,))
def prefilter(data, labels, min_val=2e-3, n_estimators=100, random_state=None):
    """
    Select the features an extra-trees classifier considers important.

    Fits an ExtraTreesClassifier on (data, labels) and keeps every feature
    whose importance is strictly greater than ``min_val``.

    Parameters
    ----------
    data : array-like
        Feature matrix passed to ``fit``.
    labels : array-like
        Target labels passed to ``fit``.
    min_val : float, optional
        Importance threshold; features at or below it are discarded.
    n_estimators : int, optional
        Number of trees in the ensemble.
    random_state : optional
        Forwarded to ExtraTreesClassifier. The default ``None`` leaves each
        call independently random, which the repeated-prefilter cell relies on.

    Returns
    -------
    numpy.ndarray
        Indices of the selected feature columns.
    """
    model = ExtraTreesClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(data, labels)
    importances = np.abs(model.feature_importances_).reshape(-1)
    return np.flatnonzero(importances > min_val)

# Repeat the randomised prefilter 50 times and combine the selections.
idx_list = [prefilter(data, labels) for _run in range(50)]

# greedy: features picked in at least one run (sorted, unique);
# conservative: features picked in every run.
greedy = np.unique(np.concatenate(idx_list))
conservative = reduce(np.intersect1d, idx_list)

print('{0} features selected through greedy prefiltering'.format(greedy.size))


Ladel classes: ['DGI' 'Other']
236 features selected through greedy prefiltering


In [None]:
from sklearn.model_selection import KFold
n_folds = 10
kf = KFold(shuffle=True, n_splits=n_folds)
# out_path = in_path / 'results'
# out_path.mkdir(parents=True, exist_ok=True)

# Row indices of each class; code 0 is treated as the positive class here.
positive_cases = np.flatnonzero(labels == 0)
negative_cases = np.flatnonzero(labels == 1)
print('detected {0} positive cases and {1} negative cases'.format(len(positive_cases), len(negative_cases)))


c = 0
lasso_errors = []
model_errors = []
# Split each class separately and merge fold by fold, so every fold
# receives a share of both classes.
for pos, neg in zip(kf.split(positive_cases), kf.split(negative_cases)):
    # pos / neg are (train, test) positions into the per-class index arrays.
    train_index = np.concatenate((positive_cases[pos[0]], negative_cases[neg[0]]))
    test_index = np.concatenate((positive_cases[pos[1]], negative_cases[neg[1]]))

    f_data_train = data[train_index]
    f_data_test = data[test_index]
    meta_train = labels[train_index]
    meta_test = labels[test_index]

    # Only the training part is oversampled; the test part is left as is.
    f_data_train, meta_train = balanceClasses(f_data_train, meta_train)
    print(meta_train.sum())
    print(f_data_train.shape)
    print('')

In [None]:
# NOTE(review): the list below looks like a scratch note of row indices from
# an earlier run; what they refer to is not established by the visible cells.
#[ 641, 3208, 3594, 4498, 5827, 5828, 5835]
# Show the training labels left over from the final iteration of the
# cross-validation loop (after balanceClasses was applied).
meta_train

In [267]:
# Rebuild the labels as a flat boolean mask: True where the flag is 'DGI'.
# Note: this rebinds `labels`, replacing whatever earlier cells stored there.
labels = label_df.values.reshape(-1) == 'DGI'
print(labels)

[ True  True  True  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa