In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from utils import regression_model, l1_loss, make_final_prediction, select_best_model_and_predict
from sklearn.preprocessing import StandardScaler
import pickle

# Print every float in NumPy arrays with exactly three decimals.
_FLOAT_FORMAT = "{0:0.3f}"
np.set_printoptions(formatter={'float': _FLOAT_FORMAT.format})

Compile the validation MAE (`mae_val`) scores of all NN models from step 2.

In [2]:
# Embedding size; it is interpolated into the scaler, score and embedding file names.
emb_size = 32

# Restore the scaler object saved for this embedding size.
# NOTE(review): pickle.load can execute arbitrary code from the file --
# only load artifacts produced by this project.
scaler_path = f'./submission/scaler_{emb_size}.pkl'
with open(scaler_path, 'rb') as scaler_file:
    X_scaler = pickle.load(scaler_file)

In [3]:
# Load the score table of each of the 8 tries and stack them into one frame.
# The frames are kept in a plain list instead of being injected into the
# namespace through globals(), so the cell creates no hidden dynamic variables.
n_tries = 8
score_frames = [
    pd.read_csv(f"./submission/predictions/NN_{emb_size}_score_try{n_try}.csv", index_col=0)
    for n_try in range(n_tries)
]

# reset_index() (not in-place) turns the per-file index into a regular column
# and gives the stacked frame a fresh 0..N-1 index.
score = pd.concat(score_frames).reset_index()

Load the precomputed gene embeddings and the matching gene names.

In [4]:
# Embedding matrix for the configured embedding size.
g_embed = np.load(f'submission/embedding/gene_embedding_{emb_size}.npy')

# Gene names that go with the embedding rows (rows of g_embed are later
# selected by a name's position in this array).
# NOTE(review): allow_pickle=True unpickles objects from the file -- only
# load trusted artifacts.
g_name = np.load(
    f'submission/embedding/gene_names_{emb_size}.npy',
    allow_pickle=True,
)

Transform the gene embeddings of the held-out genes with the loaded `StandardScaler`.

In [5]:
# Transform the test set: pick the embedding rows of the held-out genes and
# scale them with the loaded scaler.
g_name_test = ['Aqr', 'Bach2', 'Bhlhe40', 'Ets1', 'Fosb', 'Mafk', 'Stat3']

# Build the name -> row lookup once instead of converting g_name to a list and
# scanning it for every test gene. setdefault keeps the FIRST occurrence of a
# name, which matches what list.index() returned.
g_name_to_row = {}
for row, name in enumerate(g_name):
    g_name_to_row.setdefault(name, row)

# Fail with one clear message listing every gene absent from the embedding
# (list.index() raised the same exception type, but only for the first one).
missing_genes = [g for g in g_name_test if g not in g_name_to_row]
if missing_genes:
    raise ValueError(f"genes missing from the embedding: {missing_genes}")

X_test = g_embed[[g_name_to_row[g] for g in g_name_test]]
X_test_transformed = X_scaler.transform(X_test)

Compute the predictions, for all 15077 genes, of the 2 best models of each k-fold dataset.

In [10]:
# Parameters forwarded to the project helpers below.
best_n = 2           # -> select_best_model_and_predict(best_n=...)
min_distance = 4e-3  # -> make_final_prediction(min_distance=...)
min_mae_val = 0.1    # -> select_best_model_and_predict(min_mae_val=...)

predictions = select_best_model_and_predict(
    score,
    emb_size,
    X_test_transformed,
    best_n=best_n,
    min_mae_val=min_mae_val,
)

# Start from an empty list inside this cell so that re-running the cell never
# appends duplicate rows.
Y_heldout_pred = []

# enumerate() supplies the gene's position directly; the previous
# list(g_name_test).index(g) re-scanned the list on every iteration.
for gene_idx, g in enumerate(g_name_test):
    print(f'prediction gene {g}')
    # Axis 1 of `predictions` is indexed by the gene's position in g_name_test.
    p = predictions[:, gene_idx, :]
    y_pred = make_final_prediction(
        p,
        min_distance=min_distance,
        distance_step=5e-5,
        f=mean_squared_error,
        verbose=1,
    )
    Y_heldout_pred.append(y_pred)
    print(y_pred)

prediction gene Aqr
--max number of similar vector for min_distance 0.004: 8/19
- [[0.179 0.216 0.241 0.321 0.043]]
-mean:  [0.179 0.216 0.241 0.321 0.043] 

[0.179 0.216 0.241 0.321 0.043]
prediction gene Bach2
--max number of similar vector for min_distance 0.004: 13/19
- [[0.114 0.183 0.267 0.402 0.034]
 [0.074 0.195 0.316 0.387 0.028]
 [0.084 0.199 0.322 0.372 0.023]]
-mean:  [0.091 0.193 0.302 0.387 0.028] 

[0.091 0.193 0.302 0.387 0.028]
prediction gene Bhlhe40
--max number of similar vector for min_distance 0.004: 9/19
- [[0.294 0.175 0.224 0.262 0.045]]
-mean:  [0.294 0.175 0.224 0.262 0.045] 

[0.294 0.175 0.224 0.262 0.045]
prediction gene Ets1
--max number of similar vector for min_distance 0.004: 8/19
- [[0.263 0.272 0.171 0.258 0.036]]
-mean:  [0.263 0.272 0.171 0.258 0.036] 

[0.263 0.272 0.171 0.258 0.036]
prediction gene Fosb
--max number of similar vector for min_distance 0.004: 10/19
- [[0.130 0.188 0.297 0.357 0.028]
 [0.086 0.201 0.335 0.336 0.042]]
-mean:  [0.108 

# Save the output

In [7]:
# Guard the gene order: the cells that write the CSV files slice
# Y_heldout_pred by row position, so the rows must follow exactly this order.
expected_gene_order = ['Aqr', 'Bach2', 'Bhlhe40', 'Ets1', 'Fosb', 'Mafk', 'Stat3']
assert g_name_test == expected_gene_order

Y_heldout_pred = np.array(Y_heldout_pred)

# Replace the fifth column with the remainder so that each row sums to 1.
first_four_total = Y_heldout_pred[:, :4].sum(axis=-1)
Y_heldout_pred[:, 4] = 1 - first_four_total

In [8]:
# Write the first three rows of the predictions as the validation file,
# with the gene name as a named index column.
validation_genes = pd.Index(['Aqr', 'Bach2', 'Bhlhe40'], name='gene')
df = pd.DataFrame(
    Y_heldout_pred[:3],
    columns=['a_i', 'b_i', 'c_i', 'd_i', 'e_i'],
)
df.index = validation_genes
df.to_csv('../solution/validation_output.csv')

In [9]:
# Write the remaining rows (from the fourth onwards) of the predictions as the
# test file, with the gene name as a named index column.
test_genes = pd.Index(['Ets1', 'Fosb', 'Mafk', 'Stat3'], name='gene')
df = pd.DataFrame(
    Y_heldout_pred[3:],
    columns=['a_i', 'b_i', 'c_i', 'd_i', 'e_i'],
)
df.index = test_genes
df.to_csv('../solution/test_output.csv')