# Setup

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive
# (Colab-only: google.colab is not available in a plain Jupyter kernel)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Switch the working directory to the project folder on the mounted drive so the
# relative ./data and ./work paths used by later cells resolve.
cd "/content/gdrive/My Drive/Github/lc-classification"

/content/gdrive/My Drive/Github/lc-classification


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 56.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 30.9 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 5.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

# Create datasets

In [None]:
# import fulldata
dataset = pd.read_json('./data/interim/dataset.json')

# import subclasses metadata
# Each non-blank line is parsed as: first character -> class key, everything from
# the third character on -> comma-separated subclass codes (spaces removed).
lc_subclasses = {}
with open("./work/subclasses.txt", encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank (or whitespace-only) line would otherwise register a bogus
            # class keyed by a whitespace character with [''] as its subclasses.
            continue
        lc_subclasses[line[0]] = line[2:].replace(" ", "").replace("\n", "").split(",")

In [None]:
# push classified books to json
window = 4096
skip = 300
idx, X, y_class, y_subclass = [], [], [], []
for cls in lc_subclasses:
    for i in range(len(dataset)):
        for sub in dataset.subjects_new[i]:
            if sub in lc_subclasses[cls]:
                idx.append(dataset.id[i])
                tokens = ' '.join(dataset.text[i].split()[skip:4096+skip])
                X.append(tokens)
                y_class.append(cls) 
                y_subclass.append(sub)
fullset = pd.DataFrame({'id':idx, 'X':X, 'y_class':y_class, 'y_subclass':y_subclass})

In [None]:
# Preview the first rows of the assembled dataset
fullset.head()

Unnamed: 0,id,X,y_class,y_subclass
0,200,the original publisher have been changed or de...,A,AE
1,247,"tone in the model major scale (that in c), or ...",A,AG
2,248,"appropriate (to); to refer, as an effect to a ...",A,AG
3,636,in its pursuit; that millions of people become...,A,AZ
4,713,or both combined. every one of these causes in...,A,AZ


In [None]:
# Number of rows per top-level class in the full dataset
fullset.groupby('y_class')['id'].count()

y_class
A     2600
B     3641
C      315
D     4500
G     1030
H     1423
J      323
K      198
L      284
M      399
N      902
P    28158
Q     1936
R      486
S      659
T     1242
U      214
V       81
Z      459
Name: id, dtype: int64

# Train / validation / test split

In [None]:
# Random row-wise split: ~80% of fullset goes to train+val and ~20% to test,
# then ~80% of the train+val rows stay in train and ~20% become validation.
# The global NumPy seed is set before each mask so both draws are repeatable.
# NOTE(review): rows are assigned independently; a book that appears in several
# rows of fullset can end up in more than one split -- confirm this is acceptable.
np.random.seed(0)
msk1 = np.random.rand(len(fullset)) < 0.8
train_set = fullset.loc[msk1].copy()
test_set = fullset.loc[~msk1].copy()

np.random.seed(1)
msk2 = np.random.rand(len(train_set)) < 0.8
val_set = train_set.loc[~msk2].copy()
train_set = train_set.loc[msk2].copy()

# Sampling

In [None]:
# Draw a per-class training sample:
#   * classes with >= 1000 rows are down-sampled to 1000 (no replacement),
#   * classes with 500..999 rows keep their (even-rounded) size,
#   * classes with < 500 rows are up-sampled to 500 with replacement.
train_sample = []
# sorted(): iterating a bare set of strings has no stable order between
# interpreter runs, which made the row order of the concatenated frame vary.
for cls in sorted(set(fullset.y_class)):
    cls_rows = train_set[train_set['y_class'] == cls]
    length = cls_rows.shape[0]
    if length == 0:
        # Nothing to draw from; sampling 500 rows from an empty frame would raise.
        continue
    # NOTE(review): the class size is rounded down to an even number; the reason
    # is not visible in this notebook -- kept as-is.
    if length % 2 != 0:
        length -= 1
    if length >= 1000:
        n, with_replacement = 1000, False
    else:
        n, with_replacement = max(length, 500), length < 500
    train_sample.append(cls_rows.sample(n=n, replace=with_replacement, random_state=0))

In [None]:
# Draw at most 100 validation rows per class, without replacement.
val_sample = []
# sorted(): iterating a bare set of strings has no stable order between
# interpreter runs, which made the row order of the concatenated frame vary.
for cls in sorted(set(fullset.y_class)):
    cls_rows = val_set[val_set['y_class'] == cls]
    length = cls_rows.shape[0]
    # NOTE(review): the class size is rounded down to an even number; the reason
    # is not visible in this notebook -- kept as-is.
    if length % 2 != 0:
        length -= 1
    val_sample.append(cls_rows.sample(n=min(length, 100), replace=False, random_state=0))

In [None]:
# Draw at most 100 test rows per class, without replacement.
test_sample = []
# sorted(): iterating a bare set of strings has no stable order between
# interpreter runs, which made the row order of the concatenated frame vary.
for cls in sorted(set(fullset.y_class)):
    cls_rows = test_set[test_set['y_class'] == cls]
    length = cls_rows.shape[0]
    # NOTE(review): the class size is rounded down to an even number; the reason
    # is not visible in this notebook -- kept as-is.
    if length % 2 != 0:
        length -= 1
    test_sample.append(cls_rows.sample(n=min(length, 100), replace=False, random_state=0))

In [None]:
# Stack the per-class samples back into single frames.
# NOTE: this rebinds train_set / val_set / test_set, which the sampling cells
# read from; re-running those cells after this one would sample from the
# already-sampled frames. Re-run from the split cell instead.
train_set, val_set, test_set = (
    pd.concat(parts) for parts in (train_sample, val_sample, test_sample)
)

# Rows per class in the sampled training set
train_set.groupby('y_class')['id'].count()

y_class
A    1000
B    1000
C     500
D    1000
G     654
H     918
J     500
K     500
L     500
M     500
N     568
P    1000
Q    1000
R     500
S     500
T     820
U     500
V     500
Z     500
Name: id, dtype: int64

In [None]:
# (rows, columns) of the full dataset and of the train / val / test frames
fullset.shape, train_set.shape, val_set.shape, test_set.shape

((48850, 4), (12960, 4), (1472, 4), (1528, 4))

# Save dataframes

In [None]:
# Persist the full dataset and the three sampled splits as JSON.
# Create the output folder first so the cell also works on a fresh checkout
# where ./data/final does not exist yet.
Path('./data/final').mkdir(parents=True, exist_ok=True)

fullset.to_json('./data/final/full_set.json')

# The training frame is a concat of per-class samples, some drawn with
# replacement, so its index can contain repeated labels; renumber it before
# writing. Rebinding instead of inplace=True keeps the step free of hidden mutation.
# NOTE(review): val_set / test_set keep their original index labels, as before.
train_set = train_set.reset_index(drop=True)
train_set.to_json('./data/final/train_set.json')
val_set.to_json('./data/final/val_set.json')
test_set.to_json('./data/final/test_set.json')