In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import mean_squared_error, accuracy_score
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [2]:
!ls ./data/

concat	subtr


In [3]:
# Sub-directory of ./data/ to read from (the listing above shows 'concat' and 'subtr').
FEATURE_AGGREGATION = 'subtr'
# Prefix of the train/test CSV file names inside that sub-directory.
TRAIN_TEST_PARTITION = 'shuffled'

In [4]:
# Load the train and test tables selected by the two config constants;
# the first CSV column is used as the DataFrame index.
data_tr = pd.read_csv(f"./data/{FEATURE_AGGREGATION}/{TRAIN_TEST_PARTITION}_train.csv", index_col=0)
data_ts = pd.read_csv(f"./data/{FEATURE_AGGREGATION}/{TRAIN_TEST_PARTITION}_test.csv", index_col=0)

In [5]:
# Label vectors: the 'target' column of each table as a NumPy array.
ytr = data_tr['target'].values
yts = data_ts['target'].values

# Feature matrices: every remaining column, as NumPy arrays.
Xtr = data_tr.drop(columns='target').values
Xts = data_ts.drop(columns='target').values

In [6]:
# Row positions of the test samples whose label equals 1.
idx = np.argwhere(yts == 1)[:,0]

### Linear and logistic regression

In [7]:
%load_ext autoreload
%autoreload 1

In [8]:
%aimport linear_model
from sklearn.pipeline import Pipeline

In [28]:
# Switch read by the pipeline-construction cell: when true, an RBF_transformer
# step is placed in front of each regression model.
use_RBFs = False

In [189]:
# Build the two single-model pipelines, `linear` and `logistic`, in one of two variants.
if not use_RBFs:
    # Plain variant: the models consume the raw feature columns directly
    # and are constructed with lambda_=100.
    linear = Pipeline(steps=[
        ('lin_r', linear_model.LinRegression(n_features=Xtr.shape[1], metric='accuracy', lambda_=100)),
    ])
    logistic = Pipeline(steps=[
        ('log_r', linear_model.LogRegression(n_features=Xtr.shape[1], metric='accuracy', lambda_=100)),
    ])
else:
    # RBF variant: each pipeline owns a separate RBF_transformer(15, cache=True)
    # followed by a model declared with 16 input features.
    # NOTE(review): presumably 15 RBF outputs plus one bias term - confirm in linear_model.
    linear = Pipeline(steps=[
        ('rbf', linear_model.RBF_transformer(15, cache=True)),
        ('lin_r', linear_model.LinRegression(n_features=16, metric='accuracy')),
    ])
    logistic = Pipeline(steps=[
        ('rbf', linear_model.RBF_transformer(15, cache=True)),
        ('log_r', linear_model.LogRegression(n_features=16, metric='accuracy')),
    ])

In [190]:
# Sampler used to draw a validation batch from the test split (see the next cells).
# NOTE(review): the first positional argument (1) is presumably a batch count - confirm in linear_model.
sampler = linear_model.Subsampler(1, y_oh=False, normalize=True, neg_weight=1)

In [191]:
# Fit the sampler on the test features/labels; the trailing ';' hides the cell output.
sampler.fit(Xts, yts);

In [192]:
# Take the first batch the sampler yields; it is passed as the validation set to both fits below.
Xts_ss, yts_ss = next(iter(sampler))

In [193]:
# Rebind `sampler` to a fresh, unfitted Subsampler that is handed to the fits below as batch generator.
# NOTE(review): the first positional argument (100) is presumably a batch count - confirm in linear_model.
sampler = linear_model.Subsampler(100, y_oh=False, normalize=True, neg_weight=1)

In [194]:
# Train the linear pipeline; the `lin_r__` prefix routes each keyword to the 'lin_r' step.
linear.fit(
    Xtr,
    ytr,
    lin_r__batch_generator=sampler,
    lin_r__n_epochs=2,
    lin_r__valid_set=(Xts_ss, yts_ss),
);

In [195]:
# Predictions of the fitted linear pipeline on the training features.
pred_tr = linear.predict(Xtr)

In [196]:
# Train the logistic pipeline; the `log_r__` prefix routes each keyword to the 'log_r' step.
logistic.fit(
    Xtr,
    ytr,
    log_r__batch_generator=sampler,
    log_r__n_epochs=200,
    log_r__valid_set=(Xts_ss, yts_ss),
);

---
### Keras

In [188]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.optimizers import SGD

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
# Small fully connected classifier: two hidden ReLU layers (16 and 4 units)
# followed by a 2-way softmax output.
# The input width is read from the training matrix instead of being hard-coded
# to 11, so the model keeps matching the data when a different feature set is
# loaded (the regression pipelines above size themselves the same way).
inp = Input((Xtr.shape[1], ))
x = Dense(16, activation='relu')(inp)
x = Dense(4, activation='relu')(x)
x = Dense(2, activation='softmax')(x)

model = Model(inputs=[inp], outputs=[x])

In [None]:
# Categorical cross-entropy matches the 2-unit softmax output; it expects one-hot
# labels, which the batch helpers below produce when y_oh is true.
model.compile(optimizer='SGD', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
def subsample(X,y, y_oh = True, normalize=True, neg_weight=1, n_batches=100):
    """Yield class-rebalanced batches by undersampling the negative class.

    Every batch contains all rows with ``y == 1`` plus
    ``int(n_pos * neg_weight)`` rows with ``y == 0`` drawn at random without
    replacement, in shuffled order.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of 0/1 labels aligned with the rows of ``X``.
    y_oh : if true, labels are yielded one-hot as two columns ``[1 - y, y]``;
        otherwise the original label values are yielded.
    normalize : if true, each column of ``X`` is divided by its maximum.
    neg_weight : number of negatives per positive in each batch.
    n_batches : how many batches to yield (previously fixed at 100).

    Yields
    ------
    (X_batch, y_batch) tuples.

    Raises
    ------
    ValueError (from ``np.random.choice``) when more negatives are requested
    than are available.
    """
    pos_idx = np.argwhere(y == 1)[:, 0]
    neg_idx = np.argwhere(y == 0)[:, 0]
    # The per-batch number of negatives is constant, so compute it once.
    n_neg = int(len(pos_idx) * neg_weight)
    X_ = X / X.max(axis=0) if normalize else X
    y_ = np.c_[1-y, y] if y_oh else y

    for _ in range(n_batches):
        neg_subsample = np.random.choice(neg_idx, n_neg, replace=False)
        idx = np.r_[pos_idx, neg_subsample]
        # Shuffle so positives and negatives are interleaved within the batch.
        np.random.shuffle(idx)
        yield X_[idx], y_[idx]

In [None]:
class usDataset():
    """Endless source of class-rebalanced batches (negative-class undersampling).

    Each call to ``next()`` returns all rows with ``y == 1`` together with
    ``int(n_pos * neg_weight)`` randomly chosen rows with ``y == 0``
    (drawn without replacement), in shuffled order.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of 0/1 labels aligned with the rows of ``X``.
    neg_weight : number of negatives per positive in each batch.
    y_oh : if true, labels are stored one-hot as two columns ``[1 - y, y]``.
    normalize : if true, each column of ``X`` is divided by its maximum.

    The (optionally normalized) features and labels are kept in ``self.X``
    and ``self.y``.
    """
    def __init__(self, X, y, neg_weight=1, y_oh=True, normalize=True):
        # These three options used to be read from undefined names or ignored;
        # they are now real keyword arguments with the same defaults as `subsample`.
        self.neg_weight = neg_weight
        self.pos_idx = np.argwhere(y == 1)[:, 0]
        self.neg_idx = np.argwhere(y == 0)[:, 0]
        self.n_pos = len(self.pos_idx)

        self.X = X / X.max(axis=0) if normalize else X
        self.y = np.c_[1-y, y] if y_oh else y

    def __iter__(self):
        """Return ``self`` so the object can be used wherever an iterator is expected."""
        return self

    def __next__(self):
        """Draw one batch; never raises StopIteration.

        Raises ValueError (from ``np.random.choice``) when more negatives are
        requested than are available.
        """
        # Cast to int: a fractional neg_weight would otherwise give a non-integer sample size.
        n_neg = int(self.n_pos * self.neg_weight)
        neg_subsample = np.random.choice(self.neg_idx, n_neg, replace=False)
        idx = np.r_[self.pos_idx, neg_subsample]
        np.random.shuffle(idx)
        return self.X[idx], self.y[idx]

In [None]:
# Batch source for training, built from the training split.
# neg_weight is passed by keyword, so usDataset.__init__ must accept it.
gen_train = usDataset(Xtr, ytr, neg_weight=3)

In [None]:
# One fixed batch drawn from the test split, used as validation data during training.
# neg_weight is passed by keyword, so usDataset.__init__ must accept it.
Xv,yv = next(usDataset(Xts, yts, neg_weight=1))

In [None]:
# Train on batches pulled from gen_train, validating on the fixed (Xv, yv) batch.
# NOTE(review): workers=3 presumably has several workers pull from the same
# gen_train object - confirm that is safe for usDataset.
model.fit_generator(gen_train, steps_per_epoch=5000, epochs=200, validation_data=(Xv,yv), workers=3)

In [None]:
# Predicted class index (argmax over the two softmax outputs) for every training row,
# using the feature matrix stored on gen_train.
pred = model.predict(gen_train.X).argmax(axis=1)

In [None]:
# Fraction of training rows whose predicted class equals the label (training accuracy).
(pred == ytr).mean()

In [None]:
# Predicted class index for every test row.
# NOTE(review): the test features are scaled by the test split's own column maxima,
# not the training maxima - confirm this is intended.
pred_test = model.predict(Xts / Xts.max(axis=0)).argmax(axis=1)

In [None]:
# Fraction of test rows whose predicted class equals the label (test accuracy).
(pred_test == yts).mean()

In [None]:
# Mean predicted class over the test rows labelled 1, i.e. the share of
# positives that the model assigns to class 1.
pred_test[yts == 1].mean()

In [None]:
from tqdm._tqdm_notebook import tqdm_notebook

In [None]:
import sys, os

In [None]:
# Display the path of the platform's null device.
os.devnull

In [None]:
?tqdm_notebook

In [None]:
class EmptyFile:
    """Minimal write-only sink: accepts text and discards it."""

    def write(self, string):
        """Ignore ``string``; nothing is stored and ``None`` is returned."""
        return None

    def flush(self):
        """Do nothing; there is no buffer to flush."""
        return None

In [None]:
import time