# Setup

In [1]:
# Attach Google Drive to the Colab filesystem.
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Make the lc-classification folder on the mounted Drive the working directory,
# so the relative ./data and ./work paths used below resolve against it.
# (Bare `cd` is an IPython magic, not Python syntax.)
cd "/content/gdrive/My Drive/Github/lc-classification"

/content/gdrive/My Drive/Github/lc-classification


In [3]:
import numpy as np
import pandas as pd

# Create datasets

In [4]:
# Load the interim dataset.
dataset = pd.read_json('./data/interim/dataset.json')

# Load the subclass metadata and turn it into a {class: [subclass, ...]} lookup.
# Each line is interpreted as: first character = class key, and everything from
# the third character on = comma-separated subclass codes (spaces and the
# trailing newline are stripped before splitting).
with open("./work/subclasses.txt", encoding='utf-8') as f:
    lines = f.readlines()
lc_subclasses = {
    line[0]: line[2:].replace(" ", "").replace("\n", "").split(",")
    for line in lines
}

In [5]:
# extract subclass and class from books metadata
# One output row is produced for every (book, subject) pair whose subject is a
# known subclass, so a book with several matching subjects appears several times.
window = 4096  # number of whitespace-separated tokens kept from each text
skip = 500     # number of leading tokens skipped before the window starts
idx, X, y_class, y_subclass = [], [], [], []
for cls, subclasses in lc_subclasses.items():
    for i in range(len(dataset)):
        for sub in dataset.subjects_new[i]:
            if sub in subclasses:
                idx.append(dataset.id[i])
                # Slice with `window` instead of a second hard-coded 4096 so the
                # configured window size is the one actually applied.
                tokens = ' '.join(dataset.text[i].split()[skip:skip + window])
                X.append(tokens)
                y_class.append(cls)
                y_subclass.append(sub)
fullset = pd.DataFrame({'id': idx, 'X': X, 'y_class': y_class, 'y_subclass': y_subclass})

In [6]:
# Preview the first five rows of the assembled dataset.
fullset.head(n=5)

Unnamed: 0,id,X,y_class,y_subclass
0,200,by any symbol. when the alphabet was adopted b...,A,AE
1,247,"days. it is used for an, for the sake of eupho...",A,AG
2,248,"an attribute. atïtrib¶uïtive, n, (gram.) a wor...",A,AG
3,636,without scruple. ladies of gentle birth and ma...,A,AZ
4,713,"acquaintance with the pilgrims of the eighth, ...",A,AZ


In [7]:
# Rows per top-level class -- note that the classes are highly imbalanced.
fullset.groupby('y_class')['id'].count()

y_class
A     2600
B     3641
C      315
D     4500
G     1030
H     1423
J      323
K      198
L      284
M      399
N      902
P    28158
Q     1936
R      486
S      659
T     1242
U      214
V       81
Z      459
Name: id, dtype: int64

# Train / validation / test split

In [8]:
# Random row-wise split: roughly 80% of the rows go to train+validation and the
# rest to test; the train+validation part is then split roughly 80/20 again.
# NOTE(review): rows are assigned independently of each other, so rows that share
# the same `id` can end up in different splits -- confirm this is acceptable.
np.random.seed(0)
msk1 = np.random.rand(len(fullset)) < 0.8
train_set, test_set = fullset[msk1].copy(), fullset[~msk1].copy()
np.random.seed(1)
msk2 = np.random.rand(len(train_set)) < 0.8
# The right-hand side is evaluated before rebinding, so both slices are taken
# from the train+validation frame.
train_set, val_set = train_set[msk2].copy(), train_set[~msk2].copy()

# Sampling

In [9]:
# undersample majority and oversample minority classes
max_per_class = 1000  # larger classes are cut down to this many rows (no replacement)
min_per_class = 500   # smaller classes are filled up to this many rows (with replacement)
train_sample = []
# Iterate the classes in sorted order: iterating a set of strings has no stable
# order between interpreter sessions, which made the row order of the resulting
# training set non-reproducible. Taking the classes from train_set itself also
# avoids sampling with replacement from an empty frame.
for cls in sorted(train_set['y_class'].dropna().unique()):
    class_rows = train_set[train_set['y_class'] == cls]
    length = len(class_rows)
    if length % 2 != 0:  # keep the per-class sample size even
        length -= 1
    # length >= 1000        -> 1000 rows, without replacement
    # 500 <= length < 1000  -> `length` rows, without replacement
    # length < 500          -> 500 rows, with replacement
    sample = class_rows.sample(
        n=min(max(length, min_per_class), max_per_class),
        replace=(length < min_per_class),
        random_state=0,
    )
    train_sample.append(sample)

In [10]:
def sampling(data_set, max_sample_size):
    """
    Sample from dataset by class. Undersample if class n > max_sample_size.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame with a ``y_class`` column; rows are grouped by that column.
    max_sample_size : int
        Upper bound on the number of rows drawn per class.

    Returns
    -------
    list of pandas.DataFrame
        One sample per class found in ``data_set``, ordered by class label.
        Each sample is drawn without replacement with ``random_state=0`` and
        holds ``min(k, max_sample_size)`` rows, where ``k`` is the class size
        rounded down to an even number.
    """

    sample_list = []
    # Classes come from the frame being sampled (not from a notebook global), and
    # are sorted so the order of the result does not depend on set/hash ordering.
    for cls in sorted(data_set['y_class'].dropna().unique()):
        class_rows = data_set[data_set['y_class'] == cls]
        length = len(class_rows)
        if length % 2 != 0:  # make sure that the batch size can be even
            length -= 1
        sample = class_rows.sample(n=min(length, max_sample_size), replace=False, random_state=0)
        sample_list.append(sample)

    return sample_list

In [11]:
# Cap every class of the validation and test splits at 100 rows.
val_sample, test_sample = (sampling(split, 100) for split in (val_set, test_set))

In [12]:
# Stitch each list of per-class samples back into a single DataFrame.
# NOTE: this rebinds train_set / val_set / test_set to their sampled versions.
train_set, val_set, test_set = (
    pd.concat(parts) for parts in (train_sample, val_sample, test_sample)
)
tuple(frame.shape for frame in (fullset, train_set, val_set, test_set))

((48850, 4), (12960, 4), (1472, 4), (1528, 4))

# Save dataframes

In [None]:
# Renumber the training rows 0..n-1 before writing.
# NOTE(review): presumably needed because sampling with replacement leaves
# repeated index labels in train_set -- confirm.
train_set = train_set.reset_index(drop=True)

# Write every frame to its JSON file under ./data/final.
outputs = [
    ('./data/final/full_set.json', fullset),
    ('./data/final/train_set.json', train_set),
    ('./data/final/val_set.json', val_set),
    ('./data/final/test_set.json', test_set),
]
for path, frame in outputs:
    frame.to_json(path)