In [4]:
from transformers import BertTokenizer, BertModel
import torch
import json
import pandas as pd



# Gather data, put into pandas dataframe

In [5]:
TRAIN_DATA_PATH = "./data/train.json"
with open(TRAIN_DATA_PATH, 'r') as file:
    data = json.load(file)

In [12]:
cumulative_texts = []
cumulative_labels = []
document_id = []
document_meta = []
sentence_id = []
start_loc = []
end_loc = []

for document in data:
    for key, value in document.items():
        current_id = document['id']
        current_meta = document['meta']['group']
        for annotation in document['annotations']:
            for sentence in annotation['result']:

                sent_id = sentence['id']

                text = sentence['value']['text']            #remove newline characters for a cleaner text
                cleaned_text = text.replace("\n","") 

                labels = sentence['value']['labels'][0]
                start = sentence['value']['start']
                end = sentence['value']['end']

                cumulative_texts.append(cleaned_text)
                cumulative_labels.append(labels)
                document_id.append(current_id)
                document_meta.append(current_meta)
                sentence_id.append(sent_id)
                start_loc.append(start)
                end_loc.append(end)

                

                
df = pd.DataFrame({'document_id': document_id,'sentence_id':sentence_id,'text': cumulative_texts, 'labels': cumulative_labels,'start': start_loc,'end': end_loc, 'meta':document_meta})

    
    

In [13]:
df

Unnamed: 0,document_id,sentence_id,text,labels,start,end,meta
0,1735,d7a902fe9c23417499a7ef782f9fbdeb,"IN THE HIGH COURT OF KARNATAKA, ...",PREAMBLE,0,116,Criminal
1,1735,8d41599e98424d9480c25109556a7d14,BEFORETHE HON'BLE MR.JUSTICE ANAND BYRAR...,PREAMBLE,116,678,Criminal
2,1735,e501424117da40a7935c2d9f2fb2fe38,This Criminal Appeal is filed under Section 37...,PREAMBLE,678,964,Criminal
3,1735,4825806388fe43d39f73354b10b5b32d,This appeal coming on for hearing this ...,PREAMBLE,964,1093,Criminal
4,1735,d6893a25f82948f8be17fc9e876fb716,Heard the learned Counsel for the appel...,NONE,1093,1180,Criminal
...,...,...,...,...,...,...,...
115939,4090,d20ea0cee1e1491498f05197f9ef19e0,So Section 132 of the Evidence Act sufficient...,RATIO,21494,21601,Criminal
115940,4090,1ec9d9a5c1a64a8bb423c5c8d6497892,"For the reasons aforesaid, the appeal is allo...",RPC,21601,21652,Criminal
115941,4090,d1f14db1ebff454ba2f13fdc90dc875f,"The judgment and order dated April 27, 1987 pa...",RPC,21653,21761,Criminal
115942,4090,8adf1005d8c7458093a3f5599df282ca,R.S.S.,NONE,21761,21769,Criminal


In [25]:
df['text'][0]

'      IN THE HIGH COURT OF KARNATAKA,          CIRCUIT BENCH AT GULBARGADATED THIS THE 22ND DAY OF FEBRUARY, 2013'

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

### Grab text from DF, push to list

In [35]:
sentence_data = []

for text in df['text']:
    sentence_data.append(text)

### Tokenize text through BERT's tokenizer

In [36]:
tokenized_inputs = tokenizer(sentence_data, return_tensors='pt', padding=True, truncation=True)


In [39]:
tokenized_inputs

{'input_ids': tensor([[  101,  1999,  1996,  ...,     0,     0,     0],
        [  101,  2077, 10760,  ...,     0,     0,     0],
        [  101,  2023,  4735,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  8689,  ...,     0,     0,     0],
        [  101,  1054,  1012,  ...,     0,     0,     0],
        [  101,  5574,  3039,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [45]:
len(tokenized_inputs['input_ids'])

115944

In [46]:
with torch.no_grad():
    outputs = model(**tokenized_inputs)

: 

In [None]:
embeddings = outputs.last_hidden_state[:, 0, :]