# Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
pd.get_option("display.max_columns")
pd.set_option('display.max_columns', 300)
pd.get_option("display.max_rows")
pd.set_option('display.max_rows', 300)

import matplotlib.pyplot as plt
%matplotlib inline

import os
from os.path import join as opj
import gc

#INPUT_PATH = '../../input/feedback-prize-effectiveness/'
INPUT_PATH = '../../00_EDA/00_v2_04/result/unlabeled.csv'

#LABEL = 'discourse_effectiveness'

In [2]:
# train_df = pd.read_csv(opj(INPUT_PATH, 'train.csv'))
# test_df = pd.read_csv(opj(INPUT_PATH, 'test.csv'))
# sub_df = pd.read_csv(opj(INPUT_PATH, 'sample_submission.csv'))

# print('train_df.shape = ', train_df.shape)
# print('test_df.shape = ', test_df.shape)
# print('sub_df.shape = ', sub_df.shape)

train_df = pd.read_csv(INPUT_PATH)
print('train_df.shape = ', train_df.shape)

train_df.shape =  (107528, 9)


# Create Folds

In [3]:
train_df.head()

Unnamed: 0,essay_id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,neighbor_text
0,A8445CABFECE,1622576000000.0,18.0,85.0,Drivers should not be able to use phones while...,Position,Position 1,3 4 5 6 7 8 9 10 11 12 13 14,[POSITION]Drivers should not be able to use ph...
1,A8445CABFECE,1622576000000.0,86.0,202.0,Drivers who used their phone while operating a...,Claim,Claim 1,15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 3...,[POSITION]Drivers should not be able to use ph...
2,A8445CABFECE,1622576000000.0,203.0,1030.0,According to an article by the Edgar Snyder Fi...,Evidence,Evidence 1,36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 5...,[POSITION]Drivers should not be able to use ph...
3,A8445CABFECE,1622576000000.0,1031.0,1243.0,"In conclusion, drivers should not able to work...",Concluding Statement,Concluding Statement 1,177 178 179 180 181 182 183 184 185 186 187 18...,[POSITION]Drivers should not be able to use ph...
4,6B4F7A0165B9,1622644000000.0,36.0,512.0,The ability to stay connected to people we kno...,Lead,Lead 1,5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 ...,[LEAD]The ability to stay connected to people ...


In [4]:
unique_ids = np.array(sorted(train_df['essay_id'].unique()))
unique_ids

array(['0000D23A521A', '001552828BD0', '0019E4D09427', ...,
       'FFF1442D6698', 'FFF1ED4F8544', 'FFFD0AF13501'], dtype='<U12')

In [5]:
import numpy as np
from sklearn.model_selection import KFold

def get_fold_ids_list(n_folds, ids, seed):
    kfold = KFold(n_splits=n_folds,
                  shuffle=True,
                  random_state=seed)
    trn_ids_list = []
    val_ids_list = []
    for trn_idx, val_idx in kfold.split(ids,ids):
        trn_ids_list.append(ids[np.array(trn_idx)])
        val_ids_list.append(ids[np.array(val_idx)])
    return trn_ids_list, val_ids_list

In [6]:
N_FOLDS = 5
SEED = 2022

trn_ids_list, val_ids_list = get_fold_ids_list(n_folds=N_FOLDS,
                                               ids=unique_ids,
                                               seed=SEED)

In [7]:
import joblib
import os

os.makedirs('./result', exist_ok=True)
joblib.dump(trn_ids_list, './result/trn_ids_list.joblib')
joblib.dump(val_ids_list, './result/val_ids_list.joblib')

['./result/val_ids_list.joblib']

In [8]:
for fold in range(N_FOLDS):
    print('fold = ', fold)
    trn_df = train_df[train_df['essay_id'].isin(trn_ids_list[fold])].reset_index()
    val_df = train_df[train_df['essay_id'].isin(val_ids_list[fold])].reset_index()
    print('trn_df.shape = {}, val_df.shape = {}'.format(trn_df.shape, val_df.shape))
    print('')

fold =  0
trn_df.shape = (85917, 10), val_df.shape = (21611, 10)

fold =  1
trn_df.shape = (85928, 10), val_df.shape = (21600, 10)

fold =  2
trn_df.shape = (86175, 10), val_df.shape = (21353, 10)

fold =  3
trn_df.shape = (85889, 10), val_df.shape = (21639, 10)

fold =  4
trn_df.shape = (86203, 10), val_df.shape = (21325, 10)



In [9]:
for fold in range(N_FOLDS):
    print('fold = ', fold)
    val_df = train_df[train_df['essay_id'].isin(val_ids_list[fold])].reset_index()
    display(val_df['discourse_type'].value_counts())
    print('')

fold =  0


Claim                   7693
Evidence                6759
Position                2279
Concluding Statement    2037
Lead                    1420
Counterclaim             803
Rebuttal                 620
Name: discourse_type, dtype: int64


fold =  1


Claim                   7698
Evidence                6739
Position                2276
Concluding Statement    2037
Lead                    1424
Counterclaim             813
Rebuttal                 613
Name: discourse_type, dtype: int64


fold =  2


Claim                   7517
Evidence                6671
Position                2277
Concluding Statement    2038
Lead                    1418
Counterclaim             800
Rebuttal                 632
Name: discourse_type, dtype: int64


fold =  3


Claim                   7688
Evidence                6757
Position                2279
Concluding Statement    2035
Lead                    1391
Counterclaim             845
Rebuttal                 644
Name: discourse_type, dtype: int64


fold =  4


Claim                   7635
Evidence                6671
Position                2284
Concluding Statement    2007
Lead                    1361
Counterclaim             783
Rebuttal                 584
Name: discourse_type, dtype: int64


