# Setup

In [1]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
# The project directory used by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd "/content/gdrive/My Drive/Github/SubjectIndexing"

/content/gdrive/My Drive/Github/SubjectIndexing


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import pickle

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from lib.embeddings import Book2Vec

# Create Embeddings

In [4]:
# Read dataset B (columns `X` and `y`) and hold out 20% of the rows.
# The fixed random_state keeps the split reproducible across runs.
df = pd.read_json('./data/dataset_B.json')
X_train, X_test, y_train, y_test = train_test_split(
    df['X'],
    df['y'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Initialize the embeddings module (project class from lib.embeddings).
# NOTE(review): the recorded output shows a LongformerModel being loaded from
# allenai/longformer-base-4096, so construction presumably loads model weights — confirm in lib.embeddings.
book2vec = Book2Vec()

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Embed every document in df.X with the Book2Vec module.
# NOTE(review): this is the slow step (the recorded run logged "500 - time: 2 min 12 sec").
# The return type is not visible here; later cells pass it to train_test_split and PCA,
# so it is presumably an array-like with one vector per row of df — confirm.
X_embeddings = book2vec.get_embeddings(df.X)

500 - time: 2 min 12 sec


In [None]:
# Sanity check: number of embeddings produced.
# Presumably equal to len(df), since both are later split together — verify.
len(X_embeddings)

In [None]:
# Persist the embeddings to ./data/embeddings_B.pkl (called on the class, not the instance).
# NOTE(review): the .pkl extension suggests pickle serialization — confirm in lib.embeddings.
Book2Vec.save_embeddings(X_embeddings, './data/embeddings_B.pkl')

# Classification

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Split again, this time on the embedding vectors instead of the raw df.X values.
# Same test_size and random_state as the earlier split on the raw column.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings,
    df['y'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Gradient boosting over depth-1 trees, trained on the embedding features.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf.fit(X_train, y_train)
# Cell output: score on the held-out split.
clf.score(X_test, y_test)

In [None]:
# Standardise the embedding features, then fit an SVC with gamma='auto'.
svm = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)
# Cell output: accuracy of the pipeline's predictions on the held-out split.
accuracy_score(y_true=y_test, y_pred=svm.predict(X_test))

In [None]:
# Predicted class labels for the held-out embeddings.
# `predict` takes only the feature matrix; labels are not an argument, so compare
# the result against y_test with a metric (e.g. accuracy_score) instead.
svm.predict(X_test)

In [None]:
# Shape of the held-out labels (the test_size=0.2 portion of the split).
y_test.shape

# Plot 

In [None]:
# Reduce the embeddings to two principal components so they can be drawn on a plane.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(X_embeddings)
# Share of the total variance captured by the two retained components.
print(np.sum(pca.explained_variance_ratio_))

In [None]:
# Store the two PCA coordinates on the DataFrame so rows can be filtered by class when plotting.
for component, column in enumerate(('2d_x', '2d_y')):
    df[column] = embeddings_2d[:, component]

In [None]:
# Scatter the 2-D PCA projection of the embeddings, one series per subject class in df['y'].
# Each entry maps a class code to (legend label, colour, marker, marker size).
# Every class gets its own label and colour so the legend is unambiguous.
# NOTE(review): the 'B' label follows the Library of Congress Classification outline,
# which the other labels match — confirm against the dataset.
class_styles = {
    'B': ('B - Philosophy (General)', 'tab:blue', 'o', 48),
    'BC': ('BC - Logic', 'tab:orange', 'o', 48),
    'BH': ('BH - Aesthetics', 'tab:green', 's', 48),
    'BQ': ('BQ - Buddhism', 'tab:red', '^', 52),
    'BR': ('BR - Christianity', 'tab:cyan', 'D', 32),
}

fig, ax = plt.subplots(figsize=(12, 8))
for code, (label, colour, marker, size) in class_styles.items():
    # Select the rows of this class once and reuse them for both coordinates.
    members = df[df['y'] == code]
    ax.scatter(members['2d_x'], members['2d_y'], c=colour, marker=marker, s=size, label=label)
ax.set_title("Books in Section B (Embedding Space)")
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
ax.legend()
plt.show()