#### This notebook re-labels "exemplification" examples as "generalization", and breaks down the main dataset CSV into three stratified folds (CSVs).

In [1]:
import pandas as pd

# Read the fully annotated sentence dataset and report how many distinct IDs
# and rows/columns it contains.
annotated_csv = '/home/ndg/users/sbagga1/generalization/data/Gen_Sentences_Annotated_All_Final.csv'
df = pd.read_csv(annotated_csv)

print("IDs: ", df['ID'].nunique())
print("Shape: ", df.shape)
df.head(10)

IDs:  3456
Shape:  (3456, 10)


Unnamed: 0,section,sent.no,filename,sentences,neutral,generalization,exemplification,attribution,conditional,ID
0,e,156,nlh.47.1.626118_nonotes.txt,"To this end, one of the main merits of Merleau...",1,0,0,0,0,1
1,b,207,ahr.2016.121.2.437_nonotes.txt,In their response they chastised her for her u...,1,0,0,0,0,2
2,b,180,ahr.2016.121.1.17_nonotes.txt,VOC officials who encountered these arguments ...,1,0,0,0,0,3
3,e,288,asr.2016.81.5.1039_nonotes.txt,In YEAR—for the first time since YEAR—white wo...,0,1,0,0,0,4
4,e,171,sr.55.2.05_nonotes.txt,"With its large type, ostentatious margins, an...",0,1,0,0,0,5
5,b,216,modernismmodernity.2016.23.4.771_nonotes.txt,"A plastic, adaptable subject turned into an ar...",0,0,1,0,0,6
6,e,291,modernismmodernity.2016.23.4.875_nonotes.txt,Hence the argument that we must not go down a ...,1,0,0,0,0,7
7,e,207,ci.42.2.684358_nonotes.txt,"And thus, although the fact that Niobe does no...",1,0,0,0,0,8
8,b,237,pmla.2016.131.1.84_nonotes.txt,"On the other hand there was lunarism, which he...",1,0,0,0,0,9
9,b,207,pmla.2016.131.5.1222_nonotes.txt,"Of this group of islands, Grande Comore, Anjou...",1,0,0,0,0,10


In [2]:
# Shape of the subset labelled as either 'neutral' or 'generalization'.
is_neutral_or_gen = df['neutral'].eq(1) | df['generalization'].eq(1)
df[is_neutral_or_gen].shape

(3253, 10)

In [3]:
# Rows that carry neither the 'neutral' nor the 'generalization' label,
# followed by a tally of their 'exemplification' values.
has_neither_label = df['neutral'].eq(0) & df['generalization'].eq(0)
none = df[has_neither_label]
print(none['exemplification'].value_counts())
none.head()

1    203
Name: exemplification, dtype: int64


Unnamed: 0,section,sent.no,filename,sentences,neutral,generalization,exemplification,attribution,conditional,ID
5,b,216,modernismmodernity.2016.23.4.771_nonotes.txt,"A plastic, adaptable subject turned into an ar...",0,0,1,0,0,6
10,b,177,sr.55.4.03_nonotes.txt,"Developing a model of what I will call ""trauma...",0,0,1,0,0,11
56,e,180,modernismmodernity.2016.23.3.631_nonotes.txt,At stake in this constellation of texts is the...,0,0,1,0,0,57
90,b,198,sn.48.2.03_nonotes.txt,"Interestingly, rather than modeling the type o...",0,0,1,0,0,91
112,b,219,sr.55.2.01_nonotes.txt,"In short, I claim that through the figure of t...",0,0,1,0,0,113


In [4]:
# IDs of the rows in `none`; these are the rows that get re-labelled below.
none_ids = none.loc[:, 'ID'].tolist()

# Sanity check: the number of collected IDs should match the tally printed here.
print(none['exemplification'].value_counts())
len(none_ids)

1    203
Name: exemplification, dtype: int64


203

- There are 203 rows where both 'neutral' and 'generalization' are 0 and 'exemplification' is 1.
### Set 'generalization' to 1 for those rows:

In [5]:
# Mark every row whose ID is in `none_ids` as a 'generalization' example.
relabel_mask = df['ID'].isin(none_ids)
df.loc[relabel_mask, 'generalization'] = 1
df.head(10) # see ID 6 - 'generalization' is changed to 1

Unnamed: 0,section,sent.no,filename,sentences,neutral,generalization,exemplification,attribution,conditional,ID
0,e,156,nlh.47.1.626118_nonotes.txt,"To this end, one of the main merits of Merleau...",1,0,0,0,0,1
1,b,207,ahr.2016.121.2.437_nonotes.txt,In their response they chastised her for her u...,1,0,0,0,0,2
2,b,180,ahr.2016.121.1.17_nonotes.txt,VOC officials who encountered these arguments ...,1,0,0,0,0,3
3,e,288,asr.2016.81.5.1039_nonotes.txt,In YEAR—for the first time since YEAR—white wo...,0,1,0,0,0,4
4,e,171,sr.55.2.05_nonotes.txt,"With its large type, ostentatious margins, an...",0,1,0,0,0,5
5,b,216,modernismmodernity.2016.23.4.771_nonotes.txt,"A plastic, adaptable subject turned into an ar...",0,1,1,0,0,6
6,e,291,modernismmodernity.2016.23.4.875_nonotes.txt,Hence the argument that we must not go down a ...,1,0,0,0,0,7
7,e,207,ci.42.2.684358_nonotes.txt,"And thus, although the fact that Niobe does no...",1,0,0,0,0,8
8,b,237,pmla.2016.131.1.84_nonotes.txt,"On the other hand there was lunarism, which he...",1,0,0,0,0,9
9,b,207,pmla.2016.131.5.1222_nonotes.txt,"Of this group of islands, Grande Comore, Anjou...",1,0,0,0,0,10


In [6]:
# Final check: shape of the subset still having both 'neutral' and
# 'generalization' at 0 (zero rows means every row now has a class).
neither_label = df['neutral'].eq(0) & df['generalization'].eq(0)
df[neither_label].shape

(0, 10)

In [7]:
# Write as new CSV:
# df.to_csv('/home/ndg/users/sbagga1/generalization/data/Gen_Sentences_Annotated_All_Final_Processed.csv', index=None)

## 1. Split the main DataFrame csv into multiple stratified fold csvs:

- Splits the data into 3 stratified folds. Each fold has a train set, a validation set (10% of that fold's training portion, used for early stopping), and a test set (for evaluation), similar to this diagram: https://i.stack.imgur.com/aDaSc.png

In [7]:
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np


def split_csv(train_ids, val_ids, test_ids, fold_number, out_path, data=None):
    """
    Writes the train, validation, and test CSVs for one fold.

    Rows are selected from the source DataFrame by matching its 'ID' column
    against the given ID collections. The shape and generalization ratio of
    each subset are printed before any file is written.

    Args:
        train_ids: IDs of the rows that form the training set.
        val_ids: IDs of the rows that form the validation set.
        test_ids: IDs of the rows that form the test set.
        fold_number: Fold index embedded in the output file names.
        out_path: Prefix prepended verbatim to each file name, so a directory
            path must end with a path separator.
        data: DataFrame to split. Defaults to the notebook-level `DF`, which
            keeps existing five-argument calls working unchanged.

    Output files:
        <out_path>train_data_fold_<fold_number>.csv
        <out_path>val_data_fold_<fold_number>.csv
        <out_path>test_data_fold_<fold_number>.csv
    """
    source = DF if data is None else data

    subsets = [
        ("Train", "train", train_ids),
        ("Validation", "val", val_ids),
        ("Test", "test", test_ids),
    ]

    # First pass: select and report every subset (same order as the file names above).
    frames = []
    for label, prefix, ids in subsets:
        frame = source.loc[source['ID'].isin(ids)]
        print("{} DF: Shape = {} | Generalization Ratio = {}".format(label, frame.shape, check_ratio(frame)))
        frames.append((prefix, frame))

    # Second pass: write the files only after all three subsets were reported.
    for prefix, frame in frames:
        target = out_path + prefix + '_data_fold_' + str(fold_number) + '.csv'
        try:
            # pandas >= 1.5 spells the keyword `lineterminator`.
            frame.to_csv(target, lineterminator='\n', index=False)
        except TypeError:
            # Older pandas only knows `line_terminator` and rejects the new
            # spelling as an unexpected keyword before touching the file.
            frame.to_csv(target, line_terminator='\n', index=False)
    
    
def check_ratio(df):
    """
    Returns the fraction (between 0.0 and 1.0) of rows in the input DataFrame
    'df' whose 'generalization' label equals 1.

    Returns 0.0 when 'df' has no rows or contains no generalization rows,
    instead of failing on the missing label or dividing by zero.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        return 0.0
    # int() keeps the result a plain Python float rather than a NumPy scalar.
    n_generalization = int((df['generalization'] == 1).sum())
    return n_generalization / n_rows

In [8]:
N_SPLITS = 3 # number of stratified folds to generate
# Location of the annotated source CSV. It now includes the 'data/' directory so it
# matches the path that is actually read at the top of this notebook.
DATA_PATH = '/home/ndg/users/sbagga1/generalization/data/Gen_Sentences_Annotated_All_Final.csv'

SEED = 42 # random seed for splits (for reproducibility)

In [9]:
# `DF` is the notebook-level frame that split_csv reads; it is the same object as `df`.
DF = df

# Class balance of the two labels used for stratification.
for label_column in ('generalization', 'neutral'):
    print(DF[label_column].value_counts())
DF.head()

1    1759
0    1697
Name: generalization, dtype: int64
0    1759
1    1697
Name: neutral, dtype: int64


Unnamed: 0,section,sent.no,filename,sentences,neutral,generalization,exemplification,attribution,conditional,ID
0,e,156,nlh.47.1.626118_nonotes.txt,"To this end, one of the main merits of Merleau...",1,0,0,0,0,1
1,b,207,ahr.2016.121.2.437_nonotes.txt,In their response they chastised her for her u...,1,0,0,0,0,2
2,b,180,ahr.2016.121.1.17_nonotes.txt,VOC officials who encountered these arguments ...,1,0,0,0,0,3
3,e,288,asr.2016.81.5.1039_nonotes.txt,In YEAR—for the first time since YEAR—white wo...,0,1,0,0,0,4
4,e,171,sr.55.2.05_nonotes.txt,"With its large type, ostentatious margins, an...",0,1,0,0,0,5


In [10]:
# Build the inputs for the stratified split: X holds the row IDs, y the binary class.
# 'generalization' = 1 | 'neutral' = 0 ('generalization' wins when both flags are set).
is_generalization = DF['generalization'] == 1
is_neutral = DF['neutral'] == 1

# A row with neither label cannot be assigned a class. Skipping it would leave y
# shorter than X and shift every later label onto the wrong ID, so stop here instead.
uncoded = DF.loc[~(is_generalization | is_neutral)]
if len(uncoded) > 0:
    raise ValueError(
        "CodingError: rows with neither 'generalization' nor 'neutral' set, IDs = {}".format(
            uncoded['ID'].tolist()))

X = np.array(DF['ID'].tolist())
y = is_generalization.astype(int).values
print(len(X), len(y))

3456 3456


In [11]:
out_path = '/home/ndg/users/sbagga1/generalization/data/'

# shuffle stays at its default (False), so the folds depend on row order alone.
# random_state is deliberately not passed: it has no effect without shuffling, and
# newer scikit-learn releases raise a ValueError when it is combined with shuffle=False.
skf = StratifiedKFold(n_splits=N_SPLITS)

for fold_number, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index] # Don't need y_test because test-set is fixed
    print("\nBIG X_train: ", len(X_train))

    # Carve a stratified 10% validation set out of this fold's training portion.
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train,
                                                     random_state=SEED)
    print("X_train: ", len(X_train), "X_val: ", len(X_val), "X_test: ", len(X_test))

    split_csv(X_train, X_val, X_test, fold_number, out_path)


BIG X_train:  2303
X_train:  2072 X_val:  231 X_test:  1153
Train DF: Shape = (2072, 10) | Generalization Ratio = 0.5086872586872587
Validation DF: Shape = (231, 10) | Generalization Ratio = 0.5108225108225108
Test DF: Shape = (1153, 10) | Generalization Ratio = 0.5091066782307025

BIG X_train:  2304
X_train:  2073 X_val:  231 X_test:  1152
Train DF: Shape = (2073, 10) | Generalization Ratio = 0.5089242643511819
Validation DF: Shape = (231, 10) | Generalization Ratio = 0.5108225108225108
Test DF: Shape = (1152, 10) | Generalization Ratio = 0.5086805555555556

BIG X_train:  2305
X_train:  2074 X_val:  231 X_test:  1151
Train DF: Shape = (2074, 10) | Generalization Ratio = 0.508678881388621
Validation DF: Shape = (231, 10) | Generalization Ratio = 0.5108225108225108
Test DF: Shape = (1151, 10) | Generalization Ratio = 0.5091225021720244


In [12]:
# Overall share of 'generalization' rows in the full dataset, for comparison with the
# per-fold ratios printed above. Computed from DF rather than from hard-coded counts.
float((DF['generalization'] == 1).mean())

0.5089699074074074

# fin.