In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import mean_squared_error, accuracy_score
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    # A named `def` (instead of a lambda bound to a name) gives the helper a
    # proper __name__ in tracebacks and a place for this docstring.
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [2]:
# List the CSV files available under the 'concat' feature-aggregation folder.
!ls ./data/concat/

seen_test.csv	shuffled_test.csv   unseen_test.csv
seen_train.csv	shuffled_train.csv  unseen_train.csv


In [3]:
# Dataset selection. Both values are interpolated into the CSV paths that are
# read when the data is loaded: ./data/<aggregation>/<partition>_{train,test}.csv
# The ./data/concat/ listing shows 'seen', 'shuffled' and 'unseen' partitions.
FEATURE_AGGREGATION = "concat"
TRAIN_TEST_PARTITION = "shuffled"

In [4]:
# Read the train and test splits chosen by FEATURE_AGGREGATION and
# TRAIN_TEST_PARTITION; the first CSV column becomes the DataFrame index.
data_tr = pd.read_csv(
    f"./data/{FEATURE_AGGREGATION}/{TRAIN_TEST_PARTITION}_train.csv",
    index_col=0,
)
data_ts = pd.read_csv(
    f"./data/{FEATURE_AGGREGATION}/{TRAIN_TEST_PARTITION}_test.csv",
    index_col=0,
)

In [5]:
# Split each frame into a feature matrix (every column except 'target') and
# the 'target' vector, both as plain NumPy arrays.
Xtr, ytr = data_tr.drop(columns='target').values, data_tr['target'].values
Xts, yts = data_ts.drop(columns='target').values, data_ts['target'].values

### RBFs

In [6]:
# NOTE: disabled exploratory cell, kept for reference only. When enabled it
# projects Xtr onto 5 principal components, scatter-plots components 0 and 3
# coloured by a 5-cluster KMeans labelling, and overlays the ytr == 1 samples
# as black crosses.
# from sklearn.decomposition import PCA

# vis = PCA(n_components=5).fit_transform(Xtr)

# from sklearn.cluster import KMeans

# label = KMeans(n_clusters=5).fit_predict(Xtr)

# plt.figure(figsize=(10,10))
# plt.scatter(vis[:,0],vis[:,3], alpha=0.1, c=label, cmap='tab10')
# plt.scatter(vis[ytr==1,0], vis[ytr==1,3], c='k', marker='x');

### Linear and logistic regression

In [7]:
from linear_model import LinRegression, LogRegression, RBF_transformer
from sklearn.pipeline import Pipeline

In [21]:
# Two pipelines with the same front end: an RBF_transformer built with
# (25, cache=True), followed by a project-local estimator from `linear_model`.
# NOTE(review): 25 is presumably the number of RBF centres — confirm in
# linear_model.RBF_transformer.
linear = Pipeline(steps=[
    ('rbf', RBF_transformer(25, cache=True)),
    ('lin_r', LinRegression(lr=0.01, metric='accuracy')),
])

# The logistic model additionally receives class_weights=[1, 100].
logistic = Pipeline(steps=[
    ('rbf', RBF_transformer(25, cache=True)),
    ('log_r', LogRegression(lr=0.01, metric='accuracy', class_weights=[1, 100])),
])

In [22]:
# Fit parameters prefixed with `lin_r__` are routed by the Pipeline to the
# step named 'lin_r'. The trailing `;` suppresses the echoed return value.
linear.fit(
    Xtr,
    ytr,
    lin_r__verbose=True,
    lin_r__n_epochs=500,
);

picked cached RBFs


In [23]:
# Fit parameters prefixed with `log_r__` are routed by the Pipeline to the
# step named 'log_r'. The trailing `;` suppresses the echoed return value.
logistic.fit(
    Xtr,
    ytr,
    log_r__verbose=True,
    log_r__n_epochs=500,
);

picked cached RBFs


In [24]:
# Count how many entries of predict_proba(Xts) exceed the 0.5 threshold.
# Assumes predict_proba returns one positive-class probability per sample —
# TODO confirm against linear_model.LogRegression.
(logistic.predict_proba(Xts) > 0.5).sum()

0

In [11]:
# Both pipelines are constructed with metric='accuracy', so the linear line is
# labelled "Accuracy" as well (it was previously mislabelled "RMSE").
# NOTE(review): presumably `.score()` returns the configured metric — confirm
# in linear_model. With targets that appear heavily imbalanced, accuracy can
# look high even when no positive is ever predicted; consider also reporting
# a recall/precision-style metric.
print(f"""
linear Accuracy:    {linear.score(Xts,yts):.4f}
logistic Accuracy:  {logistic.score(Xts,yts):.4f}
""")


linear RMSE:   0.9970
logistic Accuracy:  0.9970



### Dense neural network with PyTorch

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [13]:
# Per-sample sampling weights: 1 where ytr == 0 and 100 where ytr == 1
# (assumes ytr holds 0/1 labels — TODO confirm). Draws 10 indices without
# replacement.
sampler = WeightedRandomSampler(
    weights=1 + ytr * 99,
    num_samples=10,
    replacement=False,
)

In [34]:
# NOTE(review): `ptDataset` is not defined or imported in any cell visible in
# this notebook (the only Dataset subclass shown is `npDataset`, defined in a
# later cell). Confirm where it comes from; as written this cell raises
# NameError on a fresh kernel.
data = ptDataset(Xtr,ytr)

In [14]:
# NOTE(review): `Net` is not defined or imported in any cell visible in this
# notebook — confirm its source before relying on Restart & Run All.
# Presumably 20 is the number of input features and 2 the number of classes
# (the Keras model in this notebook uses the same sizes) — TODO confirm.
net = Net(20, 2, hidden_size=32, lr=0.001)

In [84]:
class npDataset(Dataset):
    """Map-style dataset over a pair of in-memory arrays.

    Parameters
    ----------
    X : array-like
        Features, indexed along the first axis; must support ``.max(axis=0)``
        when ``normalize`` is True.
    y : sequence
        Targets; ``len(y)`` defines the dataset length.
    normalize : bool, default False
        When True, every column of ``X`` is divided by that column's maximum
        (computed from ``X`` itself). Columns whose maximum is 0 are left
        unscaled rather than divided by zero.
    """

    def __init__(self, X, y, normalize=False):
        if normalize:
            col_max = X.max(axis=0)
            # A zero maximum would turn the whole column into NaN/inf;
            # substitute a divisor of 1 so such columns pass through unchanged.
            Xn = X / np.where(col_max == 0, 1, col_max)
        else:
            Xn = X

        self.X = Xn
        self.y = y

    def __len__(self):
        """Return the number of samples, taken from the length of ``y``."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with keys ``'X'`` and ``'y'``."""
        return {
            'X': self.X[idx],
            'y': self.y[idx]
        }

In [15]:
# Train the PyTorch network on the training split.
# class_weight=[1., 100.] presumably weights class 0 and class 1 respectively
# — TODO confirm against Net.fit.
net.fit(
    Xtr,
    ytr,
    n_epochs=50,
    batch_size=1000,
    verbose=True,
    class_weight=[1., 100.],
)

KeyboardInterrupt: 

In [None]:
# Mean of the arg-max class index over the training set; with classes
# labelled 0/1 this is the fraction of samples predicted as class 1.
# Assumes `net(...)` returns one score per class along dim 1 — TODO confirm.
(
    net(torch.tensor(Xtr).float())
    .argmax(dim=1)
    .float()
    .mean()
)

### Dense neural network with Keras

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.optimizers import SGD

In [None]:
# Small fully connected classifier built with the Keras functional API:
# 20 inputs -> Dense(16, relu) -> Dense(4, relu) -> Dense(2, softmax).
inp = Input(shape=(20,))
x = Dense(units=16, activation='relu')(inp)
x = Dense(units=4, activation='relu')(x)
x = Dense(units=2, activation='softmax')(x)

model = Model(inputs=[inp], outputs=[x])

In [None]:
# The optimizer is selected by its string name, so no custom learning rate or
# momentum is configured here. The loss expects one-hot encoded targets.
model.compile(
    optimizer='SGD',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print a summary of the model's layers.
model.summary()

In [None]:
# Predict on the test features, each column scaled by its own maximum over
# the test split.
# NOTE(review): in file order this cell comes before `model.fit`, so on a
# top-to-bottom run these predictions come from untrained weights.
pred = model.predict(Xts / Xts.max(axis=0))

In [None]:
# Test accuracy: fraction of samples whose arg-max predicted class equals the
# true label.
(pred.argmax(axis=1) == yts).mean()

In [None]:
# Train on the *training* split. Fitting on Xts/yts (as this cell did before)
# leaks the test data into the model and makes any test-set score meaningless.
# Features are scaled by their column maximum, matching the scaling applied to
# the inputs of the `model.predict` calls in this notebook. Dividing an
# already-scaled Xtr again is harmless because its column maxima are then 1.
# Targets are one-hot encoded as [P(class 0), P(class 1)] for the 2-unit
# softmax output; class_weight is a dict mapping class index -> weight, which
# is the form Keras documents for this argument.
model.fit(
    Xtr / Xtr.max(axis=0),
    np.c_[1 - ytr, ytr],
    batch_size=100,
    epochs=10000,
    class_weight={0: 1, 1: 10},
)

In [None]:
# Scale every training feature column by its column maximum.
# NOTE(review): this rebinds `Xtr`, so the raw training features are no longer
# reachable under this name; any earlier cell re-run after this one will see
# the scaled values instead of the raw ones.
Xtr = Xtr / Xtr.max(axis=0)

In [None]:
# Show the model's outputs for the first 10 training rows.
model.predict(Xtr[:10])

In [None]:
from keras.backend import get_session

In [None]:
# Obtain the session object from the Keras backend (`keras.backend.get_session`).
sess = get_session()

In [None]:
# Evaluate and display the current value of the first entry in model.weights
# (presumably the kernel of the first Dense layer — TODO confirm).
sess.run(model.weights[0])