* create folds (groupby essay_id), 5folds, seed=100  

# Import Libraries and Data

In [20]:
import numpy as np
import pandas as pd
pd.get_option("display.max_columns")
pd.set_option('display.max_columns', 300)
pd.get_option("display.max_rows")
pd.set_option('display.max_rows', 300)

import matplotlib.pyplot as plt
%matplotlib inline

import os
from os.path import join as opj
import gc

INPUT_PATH = '../../input/feedback-prize-effectiveness/'

LABEL = 'discourse_effectiveness'

In [2]:
train_df = pd.read_csv(opj(INPUT_PATH, 'train.csv'))
test_df = pd.read_csv(opj(INPUT_PATH, 'test.csv'))
sub_df = pd.read_csv(opj(INPUT_PATH, 'sample_submission.csv'))

print('train_df.shape = ', train_df.shape)
print('test_df.shape = ', test_df.shape)
print('sub_df.shape = ', sub_df.shape)

train_df.shape =  (36765, 5)
test_df.shape =  (10, 4)
sub_df.shape =  (10, 4)


# Create Folds

In [3]:
train_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [4]:
train_df['essay_id'].value_counts()

91B1F82B2CF1    23
900A879708F0    23
4CA37D113612    23
A7EC6F462F8B    22
DECAE402BB38    22
                ..
1D4FBE746B88     1
5E295D2E7B76     1
EA58637960FB     1
E4559E81E304     1
A32414F5B216     1
Name: essay_id, Length: 4191, dtype: int64

In [9]:
unique_ids = np.array(sorted(train_df['essay_id'].unique()))
unique_ids

array(['00066EA9880D', '000E6DE9E817', '0016926B079C', ...,
       'FFD97A99CEBA', 'FFF868E06176', 'FFFF80B8CC2F'], dtype='<U12')

In [14]:
import numpy as np
from sklearn.model_selection import KFold

def get_fold_ids_list(n_folds, ids, seed):
    kfold = KFold(n_splits=n_folds,
                  shuffle=True,
                  random_state=seed)
    trn_ids_list = []
    val_ids_list = []
    for trn_idx, val_idx in kfold.split(ids,ids):
        trn_ids_list.append(ids[np.array(trn_idx)])
        val_ids_list.append(ids[np.array(val_idx)])
    return trn_ids_list, val_ids_list

In [15]:
N_FOLDS = 5
SEED = 100

trn_ids_list, val_ids_list = get_fold_ids_list(n_folds=N_FOLDS,
                                               ids=unique_ids,
                                               seed=SEED)

In [16]:
import joblib
import os

os.makedirs('./result', exist_ok=True)
joblib.dump(trn_ids_list, './result/trn_ids_list.joblib')
joblib.dump(val_ids_list, './result/val_ids_list.joblib')

['./result/val_ids_list.joblib']

In [19]:
for fold in range(N_FOLDS):
    print('fold = ', fold)
    trn_df = train_df[train_df['essay_id'].isin(trn_ids_list[fold])].reset_index()
    val_df = train_df[train_df['essay_id'].isin(val_ids_list[fold])].reset_index()
    print('trn_df.shape = {}, val_df.shape = {}'.format(trn_df.shape, val_df.shape))
    print('')

fold =  0
trn_df.shape = (29421, 6), val_df.shape = (7344, 6)

fold =  1
trn_df.shape = (29480, 6), val_df.shape = (7285, 6)

fold =  2
trn_df.shape = (29266, 6), val_df.shape = (7499, 6)

fold =  3
trn_df.shape = (29412, 6), val_df.shape = (7353, 6)

fold =  4
trn_df.shape = (29481, 6), val_df.shape = (7284, 6)



In [23]:
for fold in range(N_FOLDS):
    print('fold = ', fold)
    val_df = train_df[train_df['essay_id'].isin(val_ids_list[fold])].reset_index()
    display(val_df[LABEL].value_counts())
    print('')

fold =  0


Adequate       4149
Effective      1886
Ineffective    1309
Name: discourse_effectiveness, dtype: int64


fold =  1


Adequate       4280
Effective      1666
Ineffective    1339
Name: discourse_effectiveness, dtype: int64


fold =  2


Adequate       4288
Effective      1999
Ineffective    1212
Name: discourse_effectiveness, dtype: int64


fold =  3


Adequate       4110
Effective      1990
Ineffective    1253
Name: discourse_effectiveness, dtype: int64


fold =  4


Adequate       4150
Effective      1785
Ineffective    1349
Name: discourse_effectiveness, dtype: int64


