In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Locations of the engineered feature tables and the sample submission,
# expressed relative to the notebook's working directory.
FEATURES_DIR = '../data/features'
INPUT_DIR = '../data/input'

TRAIN_DF = f'{FEATURES_DIR}/train_features.csv'
TRAIN_WIDE_DF = f'{FEATURES_DIR}/train_wide.csv'
TEST_DF = f'{FEATURES_DIR}/test_features.csv'
TEST_WIDE_DF = f'{FEATURES_DIR}/test_wide.csv'
SUBMISSION_FILE = f'{INPUT_DIR}/sample_submission.csv'


In [7]:
def _read_with_id_index(csv_path):
    """Read a CSV and index its rows by ``id`` while keeping ``id`` as a column."""
    frame = pd.read_csv(csv_path).set_index('id', drop=False)
    # The index name is cleared; presumably so that later `sort_values('id')`
    # calls are not ambiguous between the index level and the retained column.
    frame.index.name = None
    return frame


train_df = _read_with_id_index(TRAIN_DF)
train_wide_df = _read_with_id_index(TRAIN_WIDE_DF)
test_df = _read_with_id_index(TEST_DF)
test_wide_df = _read_with_id_index(TEST_WIDE_DF)
submission_df = _read_with_id_index(SUBMISSION_FILE)

# Sanity check: the `id` column must still be present on every loaded frame.
assert all(
    'id' in loaded.columns
    for loaded in (train_df, train_wide_df, test_df, test_wide_df, submission_df)
)


In [8]:
# Collect the distinct ids of every frame, then display how many each one has.
train_df_ids, train_wide_df_ids, test_df_ids, test_wide_df_ids, submission_ids = (
    set(frame['id'])
    for frame in (train_df, train_wide_df, test_df, test_wide_df, submission_df)
)
tuple(
    len(ids)
    for ids in (train_df_ids, train_wide_df_ids, test_df_ids, test_wide_df_ids, submission_ids)
)


(2736, 996, 20, 2, 20)

In [9]:
# (number of ids only in train_wide_df, number of ids only in train_df)
len(train_wide_df_ids.difference(train_df_ids)), len(train_df_ids.difference(train_wide_df_ids))


(0, 1740)

In [10]:
# (number of ids only in test_wide_df, number of ids only in test_df)
len(test_wide_df_ids.difference(test_df_ids)), len(test_df_ids.difference(test_wide_df_ids))

(0, 18)

In [11]:
# Every id of a wide frame must also appear in the matching narrow frame.
assert train_wide_df_ids.issubset(train_df_ids)
assert test_wide_df_ids.issubset(test_df_ids)

In [12]:
# The `sii` target must agree between train_df and train_wide_df for every wide id.
# Both sides are ordered by `id` so the comparison is row-for-row on the same labels.
narrow_sii_for_wide_ids = train_df.loc[train_df.index.isin(train_wide_df_ids)].sort_values('id')['sii']
wide_sii = train_wide_df.sort_values('id')['sii']
assert (narrow_sii_for_wide_ids == wide_sii).all()

In [13]:
from sklearn.model_selection import StratifiedKFold

In [28]:
# Stratified 5-fold split over the narrow training frame (stratified on `sii`).
# For each fold, build the narrow and wide train/validation frames and check that
# the wide validation rows line up with the matching narrow validation rows.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X, y = train_df.drop('sii', axis=1), train_df['sii']
for train_index, test_index in kf.split(X, y):
    train_ids = set(X.iloc[train_index]['id'])
    val_ids = set(X.iloc[test_index]['id'])
    # ids of this fold's validation rows that also have a wide record
    val_wide_ids = [i for i in train_wide_df_ids if i in val_ids]

    print(f"number of ids from train wide: {len(train_wide_df_ids & train_ids)}")
    print(f"number of ids from val wide: {len(val_wide_ids)}")
    # both halves of the fold must consist solely of known training ids
    assert train_ids <= train_df_ids and val_ids <= train_df_ids

    ## TRAINING DATA
    # narrow data to train
    train_df_ = X.iloc[train_index].drop(columns='id').reset_index(drop=True)
    train_y_ = y.iloc[train_index].reset_index(drop=True)

    # wide data to train
    # NOTE(review): only 'id' is dropped from the wide frames, so train_wide_df_
    # and val_wide_df_ still contain the 'sii' target column (the narrow frames
    # built from X do not). Drop it before fitting a model -- confirm intent.
    train_wide_rows = train_wide_df[train_wide_df.index.isin(train_ids)]
    train_wide_df_ = train_wide_rows.drop(columns='id').reset_index(drop=True)
    train_wide_y_ = train_wide_rows['sii'].reset_index(drop=True)

    ## VALIDATION DATA
    # narrow data to val
    val_df_ = X.iloc[test_index].drop(columns='id').reset_index(drop=True)
    val_y_ = y.iloc[test_index].reset_index(drop=True)

    # wide data to val to be combined with weights; sorted by id so that it is
    # row-aligned with the narrow subset built below
    val_wide_rows = train_wide_df[train_wide_df.index.isin(val_ids)].sort_values('id')
    val_wide_df_ = val_wide_rows.drop(columns='id').reset_index(drop=True)
    val_wide_y_ = val_wide_df_[['sii']]

    # narrow rows restricted to the ids that also have wide data, in the same id order
    val_wide_df_from_narrow_ = (
        X[X.index.isin(val_wide_ids)]
        .sort_values('id')
        .drop(columns='id')
        .reset_index(drop=True)
    )
    # y is indexed by id (unnamed index): expose the index as an 'id' column to
    # sort on it, then drop it again, leaving a one-column ('sii') DataFrame
    val_wide_y_from_narrow_ = (
        y[y.index.isin(val_wide_ids)]
        .rename_axis('id')
        .reset_index()
        .sort_values('id')
        .drop(columns='id')
        .reset_index(drop=True)
    )

    # assert that val_wide have same length as val_df
    assert len(val_wide_df_from_narrow_) == len(val_wide_df_), f"{len(val_wide_df_from_narrow_)} != {len(val_wide_df_)}"
    # assert that both val wide target are same.
    # Both operands are DataFrames, so the builtin all() would iterate over the
    # column labels (always truthy) and never fail; reduce over the values instead.
    targets_match = (val_wide_y_from_narrow_ == val_wide_y_).all().all()
    assert targets_match, f"{val_wide_y_from_narrow_} != {val_wide_y_}"



number of ids from train wide: 801
number of ids from val wide: 195
number of ids from train wide: 769
number of ids from val wide: 227
number of ids from train wide: 816
number of ids from val wide: 180
number of ids from train wide: 796
number of ids from val wide: 200
number of ids from train wide: 802
number of ids from val wide: 194


0       2
1       0
2       0
3       1
4       1
       ..
2731    0
2732    1
2733    1
2734    1
2735    0
Name: sii, Length: 2736, dtype: int64

(2736, 2736)