# Practice Notebook

The purpose of this notebook is to

1. Provide a practice space for me to test out machine-learning techniques I'm familiar with
2. List the tested models in an explanatory way for future reference

The KPIs, so to speak, for each model are

1. The working, annotated code for the model
2. A rudimentary performance evaluation


# Setup


## Install


In [None]:
%%bash
# Clone the sciterra repository into ./sciterra unless that path already exists.
if [ ! -e "./sciterra" ] ; then
    git clone https://github.com/zhafen/sciterra.git
fi

In [None]:
try:
    from sciterra import Atlas
except ImportError:
    !cd sciterra; pip install -e .

## Imports


In [None]:
import os
import importlib

In [None]:
import numpy as np
import pandas as pd
import tqdm

In [None]:
import torch
from torch import nn
import torch.optim as optim

In [None]:
# Choose the training backend in order of preference: CUDA, then MPS, and the
# CPU when neither accelerator is available.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} device")

In [None]:
from sklearn.utils import check_random_state
from sklearn.model_selection import cross_val_score
import sklearn.metrics
from sklearn.base import BaseEstimator

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import sciterra
from sciterra import Atlas
from sciterra import Cartographer
from sciterra.librarians import SemanticScholarLibrarian, ADSLibrarian
from sciterra.vectorization import SciBERTVectorizer  # among others
from sciterra.mapping.tracing import iterate_expand
import sciterra.mapping.publication as publication

## Settings


In [None]:
# Settings
config = dict(
    # Directory that `atlas_dirpath` is joined onto. Set the
    # PRACTICE_NOTEBOOK_ROOT_DIR environment variable to point it elsewhere
    # without editing the notebook (an empty string keeps paths relative to the
    # working directory); the default is the original local data directory.
    root_dir=os.environ.get("PRACTICE_NOTEBOOK_ROOT_DIR", "/Users/zhafensaavedra/data"),
    # Where the atlas is saved to / loaded from, relative to `root_dir`
    atlas_dirpath="kaggle/working/atlas",
    # Not referenced in the visible part of the notebook
    starting_bibtex="/kaggle/input/basic-bibs/hafenLowredshiftLymanLimit2017.bib",
    # Target atlas size for each expansion when constructing the dataset
    target_size_per_field=50,
    # True: rebuild the atlas from the bibtex files; False: load it from disk
    construct_dataset=False,
    # One-hot column used as the binary classification target
    classification_field="Linguistics",
    # Publications whose primary field is not listed here are dropped
    included_fields=["Linguistics", "Physics"],
    # Seed for the notebook's random state
    seed=15235,
    # Number of cross-validation folds
    cv=5,
    # Scoring metric passed to cross_val_score
    scoring="balanced_accuracy",
)

In [None]:
# Resolve the atlas directory against the configured root, store the result
# back in the config, and display it.
atlas_dirpath_resolved = os.path.join(config["root_dir"], config["atlas_dirpath"])
config["atlas_dirpath"] = atlas_dirpath_resolved
atlas_dirpath_resolved

In [None]:
random_state = check_random_state(config["seed"])

In [None]:
# Build an Atlas consisting of publications from several datasets
if config["construct_dataset"]:

    # Create a cartographer with a Semantic Scholar librarian and a SciBERT vectorizer
    crt = Cartographer(
        librarian=SemanticScholarLibrarian(),
        vectorizer=SciBERTVectorizer(),
    )

    # Directory whose files are each converted into a starting atlas
    bibs_dir = "/kaggle/input/basic-bibs/"

    atl = Atlas([])
    bad_bibs = []
    for filename in os.listdir(bibs_dir):
        atl_i = crt.bibtex_to_atlas(os.path.join(bibs_dir, filename))

        # A file that yields no publications cannot seed an expansion
        if len(atl_i.publications) == 0:
            bad_bibs.append(filename)
            continue

        # Assuming the initial atlas contains just one publication
        # (the single-element unpacking raises otherwise)
        (atl_i.center,) = atl_i.publications.keys()
        atl_i = iterate_expand(
            atl=atl_i,
            crt=crt,
            atlas_dir="atlas",
            target_size=config["target_size_per_field"],
            center=atl_i.center,
        )
        # Merge into the combined atlas; on duplicate identifiers the newest
        # expansion's entry wins
        atl.publications = {**atl.publications, **atl_i.publications}

    # Report skipped files instead of dropping them silently
    if bad_bibs:
        print(
            f"Skipped {len(bad_bibs)} bibtex file(s) that produced no publications: {bad_bibs}"
        )

    atl = crt.project(atl)
    atl.save(config["atlas_dirpath"])
else:
    atl = Atlas.load(config["atlas_dirpath"])

In [None]:
# Get the embeddings
embeddings = atl.projection.embeddings

In [None]:
# Build dataframe from atlas: one row per projected publication, in the order
# given by the projection, with one column per publication field plus the
# first listed field of study.
pubs = [atl.publications[identifier] for identifier in atl.projection.index_to_identifier]
df_data = {
    field: [getattr(pub, field) for pub in pubs] for field in publication.FIELDS
}
df_data["field_of_study"] = [pub.fields_of_study[0] for pub in pubs]
df = pd.DataFrame(df_data)

In [None]:
# Keep only the publications whose field of study is one of the included fields
is_included = df["field_of_study"].isin(config["included_fields"])
df = df.loc[is_included]

In [None]:
# Get one-hot-encodings for columns
one_hot = pd.get_dummies(df["field_of_study"]).astype(int)
df = pd.concat(
    # Drop indicator columns left over from an earlier run of this cell, so
    # re-running it does not create duplicate column names (which would turn
    # df[<field>] into a DataFrame instead of a Series).
    [df.drop(columns=one_hot.columns, errors="ignore"), one_hot],
    axis=1,
)

# Data split


In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Validation and training split
# Seeded with the configured seed so the same partition is produced on every run.
inds_train, inds_test = train_test_split(
    df.index, test_size=0.2, random_state=config["seed"]
)

In [None]:
# Select the embedding rows for each side of the split using the dataframe's
# index labels as row positions.
# NOTE(review): this assumes df still carries its original 0..n-1 index and that
# row i of `embeddings` belongs to the i-th projected publication — confirm.
X_train, X_test = embeddings[inds_train], embeddings[inds_test]

In [None]:
# The target is the one-hot column of the field being classified
target_column = config["classification_field"]
y_train, y_test = df.loc[inds_train, target_column], df.loc[inds_test, target_column]

In [None]:
# Seed the splitter with the integer seed rather than the shared RandomState
# instance: with an int, every call to cv.split() produces the same folds, so
# all models below are cross-validated on identical splits.
cv = StratifiedKFold(n_splits=config["cv"], shuffle=True, random_state=config["seed"])

In [None]:
n, n_features = embeddings.shape
n, n_features

# Data overview


In [None]:
df["field_of_study"].value_counts()

# Task: Text Classification


In [None]:
results = {}

## Model: Baseline

The baseline model will be a weighted coin flip


### Build


In [None]:
class Baseline(BaseEstimator):
    """Weighted coin flip: predicts 1 with the positive-class rate seen in training.

    The features are ignored entirely; only the label frequencies matter.
    """

    def fit(self, X, y):
        """Store the fraction of positive labels as a binomial probability.

        Parameters
        ----------
        X : ignored
            Present only for scikit-learn API compatibility.
        y : array-like of 0/1 labels exposing ``sum()`` and ``size``.

        Returns
        -------
        self : Baseline
            Returned so calls can be chained, as scikit-learn estimators do.
        """

        self.prob_ = y.sum() / y.size

        return self

    def predict(self, X):
        """Draw one 0/1 label per row of X from Binomial(1, prob_).

        Draws come from the notebook-level ``random_state``, so repeated calls
        give different predictions.
        """

        return random_state.binomial(n=1, p=self.prob_, size=len(X))

### Evaluate


In [None]:
# Make the estimator
model_name = "baseline"
model = Baseline()

In [None]:
result = {}

In [None]:
# Crossval score
result["cross_val_score"] = cross_val_score(
    model, X_train, y_train, cv=cv, scoring=config["scoring"]
)

In [None]:
# Full prediction
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix
# Pin the label order so the matrix is always 2x2 (rows: true 0/1, columns:
# predicted 0/1), even when one class is missing from y_test and y_pred;
# otherwise the unpacking below would fail on a 1x1 matrix.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred, labels=[0, 1]
)
(tn, fp), (fn, tp) = confusion_matrix
confusion_matrix

In [None]:
# Accuracy, etc
# Recall of each class on the held-out set, and their mean (balanced accuracy)
n_positive, n_negative = tp + fn, tn + fp
recall_positive = tp / n_positive
recall_negative = tn / n_negative
balanced_accuracy = 0.5 * (recall_positive + recall_negative)
result["final_accuracy"] = balanced_accuracy
report = "".join(
    (
        f"positive recall: {recall_positive:.2f}\n",
        f"negative recall: {recall_negative:.2f}\n",
        f"balanced accuracy: {balanced_accuracy:.2f}\n",
    )
)
print(report)

In [None]:
results[model_name] = result

## Model: KNN


### Build


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Make the estimator
model_name = "KNN"
model = KNeighborsClassifier(n_neighbors=5)

### Evaluate


In [None]:
result = {}

In [None]:
# Crossval score
result["cross_val_score"] = cross_val_score(
    model, X_train, y_train, cv=cv, scoring=config["scoring"]
)

In [None]:
# Full prediction
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix
# Pin the label order so the matrix is always 2x2 (rows: true 0/1, columns:
# predicted 0/1), even when one class is missing from y_test and y_pred;
# otherwise the unpacking below would fail on a 1x1 matrix.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred, labels=[0, 1]
)
(tn, fp), (fn, tp) = confusion_matrix
confusion_matrix

In [None]:
# Accuracy, etc
# Recall of each class on the held-out set, and their mean (balanced accuracy)
n_positive, n_negative = tp + fn, tn + fp
recall_positive = tp / n_positive
recall_negative = tn / n_negative
balanced_accuracy = 0.5 * (recall_positive + recall_negative)
result["final_accuracy"] = balanced_accuracy
report = "".join(
    (
        f"positive recall: {recall_positive:.2f}\n",
        f"negative recall: {recall_negative:.2f}\n",
        f"balanced accuracy: {balanced_accuracy:.2f}\n",
    )
)
print(report)

In [None]:
results[model_name] = result

## Model: Linear Regression "Classification"

Traditional linear regression, but enabled for classification. This might actually be the same thing as logistic regression?


### Build


In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
class LinearClassification(LinearRegression):
    """Ordinary least-squares regression whose output is thresholded into labels."""

    def predict(self, X):
        """Return 1.0 where the regression output exceeds 0.5, and 0.0 elsewhere."""

        scores = super().predict(X)

        return np.where(scores > 0.5, 1.0, 0.0)

In [None]:
# Make the estimator
model_name = "linear_regression"
model = LinearClassification()

### Evaluate


In [None]:
result = {}

In [None]:
# Crossval score
result["cross_val_score"] = cross_val_score(
    model, X_train, y_train, cv=cv, scoring=config["scoring"]
)

In [None]:
# Full prediction
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix
# Pin the label order so the matrix is always 2x2 (rows: true 0/1, columns:
# predicted 0/1), even when one class is missing from y_test and y_pred;
# otherwise the unpacking below would fail on a 1x1 matrix.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred, labels=[0, 1]
)
(tn, fp), (fn, tp) = confusion_matrix
confusion_matrix

In [None]:
# Accuracy, etc
# Recall of each class on the held-out set, and their mean (balanced accuracy)
n_positive, n_negative = tp + fn, tn + fp
recall_positive = tp / n_positive
recall_negative = tn / n_negative
balanced_accuracy = 0.5 * (recall_positive + recall_negative)
result["final_accuracy"] = balanced_accuracy
report = "".join(
    (
        f"positive recall: {recall_positive:.2f}\n",
        f"negative recall: {recall_negative:.2f}\n",
        f"balanced accuracy: {balanced_accuracy:.2f}\n",
    )
)
print(report)

In [None]:
results[model_name] = result

## Model: Logistic Regression


### Build


In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Make the estimator
model_name = "logistic_regression"
model = LogisticRegression()

### Evaluate


In [None]:
result = {}

In [None]:
# Crossval score
result["cross_val_score"] = cross_val_score(
    model, X_train, y_train, cv=cv, scoring=config["scoring"]
)

In [None]:
# Full prediction
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix
# Pin the label order so the matrix is always 2x2 (rows: true 0/1, columns:
# predicted 0/1), even when one class is missing from y_test and y_pred;
# otherwise the unpacking below would fail on a 1x1 matrix.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred, labels=[0, 1]
)
(tn, fp), (fn, tp) = confusion_matrix
confusion_matrix

In [None]:
# Accuracy, etc
# Recall of each class on the held-out set, and their mean (balanced accuracy)
n_positive, n_negative = tp + fn, tn + fp
recall_positive = tp / n_positive
recall_negative = tn / n_negative
balanced_accuracy = 0.5 * (recall_positive + recall_negative)
result["final_accuracy"] = balanced_accuracy
report = "".join(
    (
        f"positive recall: {recall_positive:.2f}\n",
        f"negative recall: {recall_negative:.2f}\n",
        f"balanced accuracy: {balanced_accuracy:.2f}\n",
    )
)
print(report)

In [None]:
results[model_name] = result

## Model: Single Linear Layer

Same thing as traditional linear regression, but trained with gradient descent.


### Build


In [None]:
class TorchLinearEstimator(BaseEstimator):
    """Linear layer plus sigmoid, trained with hand-written full-batch gradient descent.

    The loss is the mean squared error between the sigmoid output and the 0/1
    targets.

    Parameters
    ----------
    lr : float
        Step size of each gradient-descent update.
    n_epoch : int
        Number of full-batch updates to perform.

    Attributes
    ----------
    w_, b_ : torch.Tensor
        Weights and bias kept after ``fit`` (see ``i_best_``).
    i_best_ : int
        Index of the last epoch before the loss first increased, or of the
        final epoch when the loss never increased.
    losses_ : np.ndarray
        Training loss recorded at every epoch.
    """

    def __init__(self, lr=1e-4, n_epoch=1000):

        self.lr = lr
        self.n_epoch = n_epoch

    def fit(self, X, y):
        """Train on features ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Returns
        -------
        self : TorchLinearEstimator
        """

        X = torch.Tensor(X).to(DEVICE)
        # np.asarray accepts both pandas Series and plain arrays
        y = torch.Tensor(np.asarray(y)).to(DEVICE)

        # Create parameters and turn on gradient tracking. The weight vector is
        # sized from the data rather than from a notebook-level global.
        w = torch.rand(X.shape[1]).to(DEVICE).requires_grad_()
        b = torch.rand(1).to(DEVICE).requires_grad_()

        # Training loop
        losses = []
        i_best = None
        w_best = b_best = None
        w_prev = b_prev = None
        for i in range(self.n_epoch):

            # Make the prediction and convert it into a probability with sigmoid
            y_pred_scaled = torch.sigmoid(self.linear_model(X, w, b))

            # Get the loss
            loss = self.loss(y_pred_scaled, y)
            loss_value = loss.item()

            # Mark when the training ceases improving and store a copy of the
            # parameters. Until the first increase, remember the parameters that
            # produced the current loss; at the first increase, keep the ones
            # from the previous epoch, which gave the lower loss.
            if i_best is None:
                if losses and loss_value > losses[-1]:
                    i_best = i - 1
                    w_best, b_best = w_prev, b_prev
                else:
                    w_prev = w.detach().clone()
                    b_prev = b.detach().clone()

            # Calculate the gradient
            loss.backward()

            # Modify the parameters
            with torch.no_grad():
                w -= self.lr * w.grad
                b -= self.lr * b.grad

            # backward() accumulates into .grad, so clear it before the next epoch
            w.grad.zero_()
            b.grad.zero_()

            # Store
            losses.append(loss_value)

        # If the loss never increased, keep the final parameters
        if i_best is None:
            i_best = len(losses) - 1
            w_best = w.detach().clone()
            b_best = b.detach().clone()

        self.i_best_ = i_best
        self.w_ = w_best
        self.b_ = b_best
        self.losses_ = np.array(losses)

        return self

    def predict(self, X):
        """Return 0/1 integer labels: 1 where the sigmoid output exceeds 0.5."""

        X = torch.Tensor(X).to(DEVICE)

        # Make the prediction; no gradients are needed at inference time
        with torch.no_grad():
            y_pred_scaled = torch.sigmoid(self.linear_model(X, self.w_, self.b_))

        # Convert the prediction into a classification
        y_pred = (y_pred_scaled.cpu().numpy() > 0.5).astype("int")

        return y_pred

    def linear_model(self, X, weights=None, bias=None):
        """The model itself: ``X @ weights + bias``."""

        return X @ weights + bias

    def loss(self, y_pred, y_actual):
        """Mean squared error between predictions and targets."""
        return ((y_pred - y_actual) ** 2.0).mean()

In [None]:
# Make the estimator
model_name = "linear_model"
model = TorchLinearEstimator()

### Evaluate


In [None]:
result = {}

In [None]:
# Crossval score
result["cross_val_score"] = cross_val_score(
    model, X_train, y_train, cv=cv, scoring=config["scoring"]
)
result["cross_val_score"]

In [None]:
# Full prediction
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix
# Pin the label order so the matrix is always 2x2 (rows: true 0/1, columns:
# predicted 0/1), even when one class is missing from y_test and y_pred;
# otherwise the unpacking below would fail on a 1x1 matrix.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred, labels=[0, 1]
)
(tn, fp), (fn, tp) = confusion_matrix
confusion_matrix

In [None]:
# Accuracy, etc
# Recall of each class on the held-out set, and their mean (balanced accuracy)
n_positive, n_negative = tp + fn, tn + fp
recall_positive = tp / n_positive
recall_negative = tn / n_negative
balanced_accuracy = 0.5 * (recall_positive + recall_negative)
result["final_accuracy"] = balanced_accuracy
report = "".join(
    (
        f"positive recall: {recall_positive:.2f}\n",
        f"negative recall: {recall_negative:.2f}\n",
        f"balanced accuracy: {balanced_accuracy:.2f}\n",
    )
)
print(report)

In [None]:
# Plot the training progress (loss curve)
fig, ax = plt.subplots()

epoch_indices = np.arange(len(model.losses_))
ax.plot(epoch_indices, model.losses_)

# Mark the epoch the estimator recorded as i_best_
best_epoch = model.i_best_
ax.scatter(best_epoch, model.losses_[best_epoch])

ax.set_xlabel("epoch")
ax.set_ylabel("mse")

In [None]:
results[model_name] = result

## Model: Single Linear Layer (with Optimizers)

With no nonlinearity between them this should be only as effective as using one linear layer.


In [None]:
class TorchEstimator(BaseEstimator):
    """scikit-learn style wrapper that trains a torch module as a binary classifier.

    The module's output is compared with the 0/1 targets through a mean squared
    error and thresholded at 0.5 for prediction.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; it is moved to ``device`` on construction.
    lr : float
        Learning rate handed to the optimizer.
    epochs : int
        Number of passes over the training data.
    batch_size : int
        Mini-batch size for the training and validation loaders.
    device : str
        Device on which the model and the data are placed.
    optimizer : callable
        Optimizer class, called as ``optimizer(params, lr=lr)``.
    warm_start : bool
        When False (default), every ``fit`` call re-initialises each submodule
        that exposes ``reset_parameters()``, so repeated fits are independent.
        This matters for cross-validation, where clones of an already-trained
        estimator would otherwise start from weights that have seen the
        held-out fold. Set to True to continue from the current weights.

    Attributes
    ----------
    losses_ : list of float
        Mean training loss per epoch.
    losses_val_ : list of float
        Mean validation loss per epoch; empty when no validation data is given.
    """

    def __init__(
        self,
        model,
        lr: float = 1e-4,
        epochs: int = 100,
        batch_size: int = 64,
        device: str = DEVICE,
        optimizer=optim.Adam,
        warm_start: bool = False,
    ):

        self.model = model.to(device)
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.device = device
        self.optimizer = optimizer
        self.warm_start = warm_start

    def fit(
        self,
        X: np.ndarray,
        y: pd.Series,
        X_val: np.ndarray = None,
        y_val: pd.Series = None,
    ) -> "TorchEstimator":
        """Train the model, optionally tracking the loss on validation data.

        Parameters
        ----------
        X, y : training features and 0/1 targets.
        X_val, y_val : optional validation features and targets; the validation
            loss is recorded only when both are given.

        Returns
        -------
        self : TorchEstimator
        """

        # Start from fresh weights unless asked to continue training
        if not self.warm_start:
            for module in self.model.modules():
                reset = getattr(module, "reset_parameters", None)
                if callable(reset):
                    reset()

        self.model.train()

        # Prep data (np.asarray accepts both pandas Series and plain arrays)
        X = torch.Tensor(X).to(self.device)
        y = torch.Tensor(np.asarray(y)).to(self.device)
        dataset = TensorDataset(X, y)
        dataloader = DataLoader(dataset, batch_size=self.batch_size)

        # Prep validation data
        has_validation = X_val is not None and y_val is not None
        self.losses_val_ = []
        if has_validation:
            X_val = torch.Tensor(X_val).to(self.device)
            y_val = torch.Tensor(np.asarray(y_val)).to(self.device)
            dataset_val = TensorDataset(X_val, y_val)
            dataloader_val = DataLoader(dataset_val, batch_size=self.batch_size)

        params = [param.requires_grad_() for param in self.model.parameters()]
        optimizer = self.optimizer(params, lr=self.lr)

        # Training loop
        self.losses_ = []
        for _ in tqdm.tqdm(range(self.epochs)):
            self.model.train()
            loss = 0.0
            for X_j, y_j in dataloader:

                # Make the prediction
                pred_j = self.model(X_j)

                # Get the loss
                loss_j = self.loss_fn(pred_j, y_j)

                # Backpropagation
                optimizer.zero_grad()
                loss_j.backward()
                optimizer.step()

                # Weight by batch size so the epoch value is a per-sample mean
                loss += loss_j.item() * len(y_j)
            # Store for later use
            loss /= len(y)
            self.losses_.append(loss)

            # Evaluation for validation data
            self.model.eval()
            if has_validation:
                loss_val = 0.0
                # No gradients are needed to score the validation set
                with torch.no_grad():
                    for X_val_j, y_val_j in dataloader_val:

                        # Make the prediction
                        pred_val_j = self.model(X_val_j)

                        # Get the loss
                        loss_val_j = self.loss_fn(pred_val_j, y_val_j)

                        loss_val += loss_val_j.item() * len(y_val_j)

                # Store for later use
                loss_val /= len(y_val)
                self.losses_val_.append(loss_val)

        return self

    def predict(self, X):
        """Return 0/1 integer labels: 1 where the model output exceeds 0.5.

        A single-output model yields a 1-D array of length ``len(X)``.
        """

        X = torch.Tensor(X).to(self.device)

        # Make the prediction
        self.model.eval()
        with torch.no_grad():
            pred = self.model(X)

        # Convert the prediction into a classification
        y_pred = (pred.cpu().numpy() > 0.5).astype("int")

        # Flatten the trailing singleton dimension of a single-output model
        if y_pred.ndim == 2 and y_pred.shape[1] == 1:
            y_pred = y_pred.ravel()

        return y_pred

    def loss_fn(self, pred, y_actual):
        """Mean squared error between predictions and targets.

        A ``(batch, 1)`` prediction is reshaped to the ``(batch,)`` target shape
        first; subtracting them directly would broadcast to ``(batch, batch)``
        and average over every prediction/target pair.
        """
        if pred.shape != y_actual.shape and pred.numel() == y_actual.numel():
            pred = pred.reshape(y_actual.shape)
        return ((pred - y_actual) ** 2.0).mean()

### Build


In [None]:
# Make the estimator
model_name = "optim_linear_model"
net = nn.Sequential(
    nn.Linear(n_features, 1),
    nn.Sigmoid(),
)
model = TorchEstimator(model=net, optimizer=optim.SGD)

### Evaluate


In [None]:
result = {}

In [None]:
# Full prediction
model.fit(X_train, y_train, X_val=X_test, y_val=y_test)
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix
# Pin the label order so the matrix is always 2x2 (rows: true 0/1, columns:
# predicted 0/1), even when one class is missing from y_test and y_pred;
# otherwise the unpacking below would fail on a 1x1 matrix.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=y_test, y_pred=y_pred, labels=[0, 1]
)
(tn, fp), (fn, tp) = confusion_matrix
confusion_matrix

In [None]:
# Accuracy, etc
# Recall of each class on the held-out set, and their mean (balanced accuracy)
n_positive, n_negative = tp + fn, tn + fp
recall_positive = tp / n_positive
recall_negative = tn / n_negative
balanced_accuracy = 0.5 * (recall_positive + recall_negative)
result["final_accuracy"] = balanced_accuracy
report = "".join(
    (
        f"positive recall: {recall_positive:.2f}\n",
        f"negative recall: {recall_negative:.2f}\n",
        f"balanced accuracy: {balanced_accuracy:.2f}\n",
    )
)
print(report)

In [None]:
# Plot the training progress (loss curve)
fig, ax = plt.subplots()

# Training curve first, then the validation curve in red
curves = (
    (model.losses_, dict(label="losses")),
    (model.losses_val_, dict(color="r", label="validation losses")),
)
for values, line_kwargs in curves:
    ax.plot(range(len(values)), values, **line_kwargs)

ax.legend()

ax.set_xlabel("epoch")
ax.set_ylabel("loss")

In [None]:
# Crossval score
result["cross_val_score"] = cross_val_score(
    model, X_train, y_train, cv=cv, scoring=config["scoring"]
)
result["cross_val_score"]

In [None]:
results[model_name] = result

## Compare Models


In [None]:
# Format data
# One small frame per model, tagged with the model's name. The frames get their
# own names so the publication dataframe `df` built earlier is not overwritten,
# which would break any earlier cell that is re-run afterwards.
result_frames = []
for name, model_result in results.items():
    result_frames.append(pd.DataFrame(model_result).assign(model_name=name))
results_df = pd.concat(result_frames)

In [None]:
# One point per cross-validation fold, grouped by model; the figure widens with
# the number of models compared.
fig, ax = plt.subplots(figsize=(len(results) * 2, 2))

sns.swarmplot(
    data=results_df,
    x="model_name",
    y="cross_val_score",
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)

# The y axis shows the cross-validation scoring metric
ax.set_ylabel(config["scoring"])