In [3]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Concatenate
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import f1_score

# ======== REPRO ========
# Pin the global TensorFlow and NumPy seeds so repeated runs draw the same random numbers.
tf.random.set_seed(42)
np.random.seed(42)

# ======== PARAMS ========
WINDOW = 3             # number of rows per company in each sequence
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 1e-4
BALANCE = True         # when True, the training classes are undersampled to equal size
FEATURES = ["X%d" % k for k in range(1, 19)]   # "X1" .. "X18"

# ======== LOAD ========
train_path = "dataset/train.csv"
test_path  = "dataset/df1_matches.csv"

df_tr, df_te = (pd.read_csv(p) for p in (train_path, test_path))

# ======== CLEAN + ENCODE TARGET ========
label_map = {"alive": 0, "failed": 1}
for df in (df_tr, df_te):
    # Drop the columns that are not model features, whichever of them exist in this file.
    df.drop(columns=["Unnamed: 0", "Division", "MajorGroup"], errors="ignore", inplace=True)
    # Keep each company's rows together and ordered by fyear.
    df.sort_values(by=["company_name", "fyear"], inplace=True)
    # Any status_label value missing from label_map becomes NaN here.
    df["status"] = df["status_label"].map(label_map)

# ======== SEQUENCES ========
def make_sequences(df, window, features, group_col="company_name", time_col="fyear", target_col="status"):
    """Build fixed-length sliding-window samples for every group in ``df``.

    Rows of each group are sorted by ``time_col``; every run of ``window``
    adjacent rows becomes one sample whose label is the ``target_col`` value of
    the window's last row. Groups with fewer than ``window`` rows are skipped.

    Note: "adjacent" means neighbouring rows after sorting. Gaps in ``time_col``
    (for example a missing year) are not detected, so a window may span a gap.

    Args:
        df: DataFrame containing ``group_col``, ``time_col``, ``target_col`` and
            every column listed in ``features``.
        window: Number of rows per sample; must be >= 1.
        features: List of feature column names, in the order they should appear
            on the last axis of the returned array.
        group_col: Column identifying the entity each sequence belongs to.
        time_col: Column used to order rows inside a group.
        target_col: Column holding the integer class label.

    Returns:
        Tuple ``(X, y)`` where ``X`` is float32 with shape
        ``(n_samples, window, len(features))`` and ``y`` is int64 with shape
        ``(n_samples,)``. When no group is long enough, both arrays are empty
        but keep these ranks so downstream shape-based code still works.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    Xs, ys = [], []
    for _, g in df.groupby(group_col):
        if len(g) < window:
            continue
        g = g.sort_values(time_col)
        # Convert once per group; slicing NumPy arrays is far cheaper than
        # slicing the DataFrame again for every window.
        feats = g[features].to_numpy()
        targets = g[target_col].to_numpy()
        for start in range(len(g) - window + 1):
            stop = start + window
            Xs.append(feats[start:stop])      # (W, F)
            ys.append(targets[stop - 1])      # label of the window's last row

    if not Xs:
        # np.array([]) would collapse to shape (0,); keep the full rank instead.
        return (
            np.empty((0, window, len(features)), dtype=np.float32),
            np.empty((0,), dtype=np.int64),
        )
    return np.array(Xs, dtype=np.float32), np.array(ys, dtype=np.int64)

X_tr, y_tr = make_sequences(df_tr, WINDOW, FEATURES)
X_te, y_te = make_sequences(df_te, WINDOW, FEATURES)

print("Train sequences:", X_tr.shape, "labels:", np.bincount(y_tr))
print("Test  sequences:", X_te.shape, "labels:", np.bincount(y_te))

# ======== BALANCE (undersample) ========
# Only applies when both classes are present in the training labels.
if BALANCE and np.unique(y_tr).size == 2:
    idx0 = np.flatnonzero(y_tr == 0)
    idx1 = np.flatnonzero(y_tr == 1)
    n = min(idx0.size, idx1.size)
    # Draw class 0 first, then class 1, then shuffle: this order determines
    # how the seeded global NumPy RNG is consumed.
    keep0 = np.random.choice(idx0, size=n, replace=False)
    keep1 = np.random.choice(idx1, size=n, replace=False)
    sel = np.concatenate((keep0, keep1))
    np.random.shuffle(sel)
    X_tr = X_tr[sel]
    y_tr = y_tr[sel]
    print("Balanced train:", X_tr.shape, "labels:", np.bincount(y_tr))

# ======== NORMALIZE using TRAIN stats ========
# Per-feature statistics are pooled over samples and timesteps of the
# (possibly balanced) training set, then applied to both splits.
mean = X_tr.mean(axis=(0, 1))
std = X_tr.std(axis=(0, 1))
std[std == 0] = 1.0          # constant features: avoid dividing by zero
X_tr, X_te = (X_tr - mean) / std, (X_te - mean) / std

# ======== MODEL (multi-head: one small LSTM per feature) ========
# Every feature gets its own univariate input of shape (WINDOW, 1) and its own
# LSTM; the per-feature encodings are concatenated and fed to a dense classifier.
inputs, heads = [], []
for _ in FEATURES:
    inp = tf.keras.Input(shape=(WINDOW, 1))
    h = LSTM(WINDOW)(inp)      # units set to WINDOW, i.e. a very small encoder per feature
    inputs.append(inp)
    heads.append(h)

merged = Concatenate()(heads)
dense1 = Dense(20, activation='relu')(merged)
out = Dense(2, activation='softmax')(dense1)   # class probabilities: [alive, failed]

model = Model(inputs=inputs, outputs=out)
model.compile(
    optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
    # The output is a 2-way softmax trained against one-hot targets, so the
    # matching loss is categorical cross-entropy. 'binary_crossentropy' is meant
    # for a single sigmoid unit with 0/1 labels.
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Prepare split inputs: one (N, WINDOW, 1) array per feature, in feature order,
# matching the model's list of per-feature inputs.
Xtr_split = [X_tr[..., i, np.newaxis] for i in range(X_tr.shape[2])]
Xte_split = [X_te[..., i, np.newaxis] for i in range(X_te.shape[2])]

# ======== TRAIN on FULL TRAINING DATA (no val split) ========
y_tr_onehot = to_categorical(y_tr, num_classes=2)
model.fit(
    x=Xtr_split,
    y=y_tr_onehot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
    verbose=1,
)

# ======== EVAL on TEST ========
# Column 1 of the softmax output is the probability of class 1 ("failed").
test_probs = model.predict(Xte_split, verbose=0)
y_prob = test_probs[:, 1]
y_pred = (y_prob >= 0.5).astype(int)
macro_f1 = f1_score(y_te, y_pred, average='macro')
print(f"\nMacro F1 on test set: {macro_f1:.4f}")


Train sequences: (49043, 3, 18) labels: [45806  3237]
Test  sequences: (12452, 3, 18) labels: [11677   775]
Balanced train: (6474, 3, 18) labels: [3237 3237]
Epoch 1/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 10ms/step - accuracy: 0.5080 - loss: 0.6930
Epoch 2/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5318 - loss: 0.6913
Epoch 3/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5531 - loss: 0.6898
Epoch 4/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5650 - loss: 0.6883
Epoch 5/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5817 - loss: 0.6869
Epoch 6/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5901 - loss: 0.6855
Epoch 7/50
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5941 - loss: 0

In [4]:
import pandas as pd

# Quick look at the raw training CSV: dimensions, column names, first rows.
df = pd.read_csv("dataset/train.csv")
print(df.shape, df.columns, df.head(), sep="\n")


(62789, 24)
Index(['Unnamed: 0', 'company_name', 'fyear', 'status_label', 'X1', 'X2', 'X3',
       'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14',
       'X15', 'X16', 'X17', 'X18', 'Division', 'MajorGroup'],
      dtype='object')
   Unnamed: 0 company_name   fyear status_label        X1          X2  \
0           0          C_1  1999.0        alive  511267.0  740998.000   
1           1          C_1  2000.0        alive  485856.0     701.854   
2           2          C_1  2001.0        alive  436656.0  710199.000   
3           3          C_1  2002.0        alive  396412.0     686.621   
4           4          C_1  2003.0        alive  432204.0     709.292   

         X3        X4        X5        X6  ...        X11         X12  \
0  833107.0  180447.0  18373.00  70658.00  ...     35.163  201026.000   
1  713811.0  179987.0  18577.00     45.79  ...  18531.000  204065.000   
2  526477.0  217699.0  22496.00   4711.00  ...    -58.939     139.603   
3  496747.0  16

In [5]:
import pandas as pd

# Quick look at the raw test CSV: dimensions, column names, first rows.
df = pd.read_csv("dataset/df1_matches.csv")
print(df.shape, df.columns, df.head(), sep="\n")


(15893, 23)
Index(['company_name', 'fyear', 'status_label', 'X1', 'X2', 'X3', 'X4', 'X5',
       'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16',
       'X17', 'X18', 'Division', 'MajorGroup'],
      dtype='object')
  company_name   fyear status_label        X1        X2       X3      X4  \
0          C_3  1999.0        alive  9757.000  13986.00  19796.0  5974.0   
1          C_3  2000.0        alive     7.884  11608.00  16506.0  4875.0   
2          C_3  2001.0        alive  6494.000   8635.00     15.7  3873.0   
3          C_3  2002.0        alive  5938.000      7.85  12919.0  2546.0   
4          C_3  2004.0        alive  5807.000   6245.00  12018.0   222.0   

        X5        X6      X7  ...       X11     X12     X13       X14  \
0  667.000  -932.000  -265.0  ... -2207.000 -6375.0  3924.0     29.37   
1    0.700    -0.028   672.0  ...    -0.808 -7184.0  3244.0  25367.00   
2    0.761    -0.380   381.0  ...    -1.738 -8922.0  2677.0  24051.00   
3  355.000 