# Deep learning notebook

This notebook contains the deep learning part of the project. The idea is to obtain an n-dimensional representation (embedding) of each text and then average these representations over a given time period in the latent space (z-space); the averaged vectors can then be used for prediction.

In [1]:
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from sklearn.decomposition import PCA
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the pretrained Swedish BERT tokenizer and the matching encoder weights.
tok = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="KB/bert-base-swedish-cased"
)
model = BertModel.from_pretrained(
    pretrained_model_name_or_path="KB/bert-base-swedish-cased"
)

Import data

In [2]:
# Read the raw dataset; later cells use its "text" and "date" columns.
df = pd.read_csv(filepath_or_buffer="../dataset/lawline_data.csv")

Remove newlines (and any other repeated whitespace) by splitting each text on whitespace and re-joining the pieces with single spaces. Then turn each document into a list of sentences.

In [None]:
# Collapse every run of whitespace (newlines included) into a single space.
# Each value goes through str() first, so non-string cells are stringified.
df["text_clean"] = list(map(lambda raw: " ".join(str(raw).split()), df["text"]))

In [None]:
# Break every cleaned text into sentence fragments on the literal ". " separator.
doc_as_list = [str.split(cleaned, ". ") for cleaned in df["text_clean"]]

In [None]:
# Normalise sentence endings after the ". " split.
# - Empty fragments (from an empty text or consecutive separators) are dropped;
#   indexing them with [-1] would raise IndexError.
# - A period is appended only when the sentence does not already end in
#   terminal punctuation, so the last sentence of a text (which keeps its own
#   ".") does not become "..", and re-running this cell is a no-op.
# A document consisting solely of empty fragments becomes an empty list.
doc_as_list = [
    [
        sent if sent.endswith((".", "!", "?")) else sent + "."
        for sent in doc
        if sent
    ]
    for doc in doc_as_list
]

Turn each document into a single embedding vector using BERT: every sentence is embedded separately and the sentence embeddings are averaged.

In [None]:
def get_embedding(document, tokenizer, model):
    """Embed a document as the mean of its per-sentence embeddings.

    Parameters
    ----------
    document : list of str
        The sentences of one document. They are tokenized together as a
        single batch, truncated to at most 512 tokens each and padded to a
        common length.
    tokenizer : callable
        Called as ``tokenizer(document, max_length=512, truncation=True,
        padding=True, return_tensors="pt")``; its result is unpacked as
        keyword arguments into ``model``.
    model : callable
        Element ``[1]`` of its output is taken as the per-sentence embedding
        matrix (presumably BERT's pooled output -- confirm against the model
        in use).

    Returns
    -------
    numpy.ndarray
        Mean over axis 0 (the sentence axis) of the per-sentence embeddings.

    Raises
    ------
    ValueError
        If ``document`` is an empty list or tuple, since there is nothing to
        average.
    """
    if isinstance(document, (list, tuple)) and not document:
        raise ValueError("document must contain at least one sentence")

    encoded = tokenizer(
        document, max_length=512, truncation=True, padding=True, return_tensors="pt"
    )

    # Inference only: disabling autograd avoids building a computation graph
    # that would be thrown away immediately by .detach().
    # NOTE(review): all sentences of the document go through the model in one
    # batch; very long documents may need chunking -- confirm memory limits.
    with torch.no_grad():
        sentence_embs = model(**encoded)[1].detach().numpy()

    return np.mean(sentence_embs, axis=0)

In [None]:
# Embed every document (one vector per document, in the order of doc_as_list).
emb_from_doc = list(
    map(lambda sentences: get_embedding(sentences, tok, model), doc_as_list)
)

In [None]:
# Stack the per-document vectors into one array (one row per document).
emb_array = np.asarray(emb_from_doc)

In [None]:
# Wrap the embedding matrix in a DataFrame so it can be written to disk.
emb_df = pd.DataFrame(data=emb_array)

In [None]:
# Persist the full-size embeddings; the row index is not written.
emb_df.to_csv(path_or_buf="../dataset/embeddings.csv", index=False)

Perform PCA to reduce dimensions of BERT embeddings.

In [None]:
# Build and fit the PCA in one step (fit() returns the estimator itself).
# A float n_components between 0 and 1 asks scikit-learn to keep as many
# components as are needed to explain that share of the variance (here 95 %).
pca = PCA(n_components=0.95).fit(emb_array)

# Variance captured by each retained component
print(pca.explained_variance_)

# Project the embeddings onto the retained components
pca_small = pca.transform(emb_array)

In [None]:
# Hold the PCA-reduced embeddings in a DataFrame (one row per document).
df_reduced = pd.DataFrame(data=pca_small)

In [None]:
# Display (rows, columns) of the reduced frame; the column count is the number
# of components the PCA kept.
df_reduced.shape

Add the dates to the reduced DataFrame so that the embeddings can be down-sampled to monthly means.

In [None]:
# Parse the day/month/year strings of the source frame into datetime objects
# and attach them, row for row, to the reduced embeddings.
df_reduced["date"] = list(
    map(lambda raw_date: datetime.strptime(raw_date, "%d/%m/%Y"), df["date"])
)

In [None]:
# Down-sample to one row per month: group the rows by the "date" column with
# the "M" frequency alias and average every component within each group.
# Working on a copy leaves df_reduced untouched.
# NOTE(review): recent pandas releases deprecate the "M" alias in favour of
# "ME" -- confirm against the pinned pandas version before changing it.
data_down = df_reduced.copy().resample("M", on="date").mean()

In [None]:
# The index holds the resampled dates, so it is written out explicitly.
data_down.to_csv(path_or_buf="../dataset/emb_down.csv", index=True)