# Setup

In [1]:
# mount drive 
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd "/content/gdrive/My Drive/Github/SubjectIndexing"

/content/gdrive/My Drive/Github/SubjectIndexing


In [3]:
# general
import time
import itertools
import collections
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# custom library for transformers
from src.utils.embeddings import Book2Vec

# Import Data

In [4]:
# import dataset
df = pd.read_json('./data/dataset_B.json')
metadata = pd.read_json('./data/metadata.json')
X_embeddings = Book2Vec.load_embeddings('./work/embeddings_B_last4layers.pkl')
df['X_embeddings'] = list(X_embeddings)
dataset = df.merge(metadata, left_on='id', right_on='id')[['id', 'X', 'y', 'X_embeddings', 'subjects_new']]
dataset = dataset[dataset['y'].isin(['BR', 'B', 'BJ'])]

In [5]:
# remove subjects that only appeared once 
freq = collections.Counter(itertools.chain(*[x for x in dataset['subjects_new']]))
subjects_filtered = []
for row in dataset['subjects_new']:
    row_temp = []
    for c in row:
        if freq[c] != 1:
            row_temp.append(c)
    subjects_filtered.append(row_temp)
dataset['subjects_filtered'] = subjects_filtered

In [6]:
dataset[['id', 'X_embeddings', 'y', 'subjects_new', 'subjects_filtered']].head()

Unnamed: 0,id,X_embeddings,y,subjects_new,subjects_filtered
3,59,"[0.12214074, 0.06482455, 0.08797064, -0.114956...",B,"[Science, B, Methodology]","[Science, B, Methodology]"
5,130,"[0.0726224, 0.028099691, 0.11365156, -0.159741...",BR,"[Chesterton, G. K. (Gilbert Keith), 1874-1936,...","[Chesterton, G. K. (Gilbert Keith), 1874-1936,..."
8,272,"[0.061839424, 0.03096132, 0.09124701, -0.10134...",BR,"[Reformation, BR, Bible]","[Reformation, BR, Bible]"
10,274,"[0.0448586, 0.046716735, 0.09408784, -0.146873...",BR,"[BR, Reformation, Indulgences]","[BR, Reformation]"
13,368,"[0.032058243, 0.011893238, 0.055835266, -0.096...",BJ,"[Success, Temple University, BJ, Conwell, Russ...","[Success, BJ]"


In [7]:
# train test split
msk = np.random.rand(len(dataset)) < 0.8
dataset_train = dataset.copy()[msk]
dataset_test = dataset.copy()[~msk]

In [8]:
labels = sorted(list(set(dataset.y)))
class2label = {}
for i in range(len(labels)):
    class2label[labels[i]] = i
label2class = {v:k for k,v in class2label.items()}
y_train_labels = [class2label[l] for l in dataset_train.y]
y_test_labels = [class2label[l] for l in dataset_test.y]

# Classification

In [9]:
# classification models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# sklearn utilities
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

In [10]:
X_train, X_test = list(dataset_train.X_embeddings), list(dataset_test.X_embeddings)

## Random Forest

In [11]:
# train model
start = time.time()
rf = RandomForestClassifier(n_estimators=512, max_depth=13, random_state=0)
rf_scores = cross_val_score(rf, X_train, y_train_labels, cv=5)
print(np.mean(rf_scores))
end = time.time()
print("Runtime:", round((end-start)//60), "min",  round((end-start)%60), "sec")

0.8179487179487179
Runtime: 0 min 7 sec


In [12]:
# test accuracy
rf.fit(X_train, y_train_labels)
y_pred = rf.predict(X_test)
accuracy_score(y_test_labels, y_pred)

0.7583892617449665

In [14]:
len(X_train)

132