In [1]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer

import matplotlib.pyplot as plt
import seaborn as sns
# apply seaborn's default theme so every matplotlib figure below uses it
sns.set()

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# pd.options.display.max_columns = 999
# pd.options.display.max_rows = 999

In [2]:
# defining a function to remove line breaks (\n); note: it does not strip HTML tags
def text_cleaner(text):
    """Collapse a multi-line string onto a single line.

    The text is split at its line boundaries and the pieces are joined back
    together with one space between them. Nothing else is altered: empty
    lines therefore show up as consecutive spaces, and a trailing line break
    is simply dropped.
    """
    return " ".join(text.splitlines())

# setting up tokenizer: with gaps=True the pattern describes the separators
# between tokens, so text is split on runs of whitespace.
# The pattern is a raw string so that `\s` reaches the regex engine as-is
# instead of being parsed as an (invalid) Python string escape.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

In [3]:
def get_data(state):
    """Load one state's cases and return them as a DataFrame with token counts.

    Reads the xz-compressed JSON-lines file at ``<state>/data/data.jsonl.xz``
    (one JSON document per line), sorts the cases by ``decision_date`` and
    converts that column to datetime. For every case it extracts the judges,
    attorneys, head matter and the text of the first opinion, then tokenizes
    the head matter and opinion text with the module-level ``tokenizer``.

    Parameters
    ----------
    state : str
        Directory prefix of the data file; the path is built as
        ``state + '/data/data.jsonl.xz'``.

    Returns
    -------
    pandas.DataFrame
        The parsed columns ``judges``, ``attorneys``, ``headnotes`` and
        ``opinions`` (the latter two as token lists) merged by index with the
        original case columns, plus ``headnotes_num_tokens`` and
        ``opinions_num_tokens``.

    Notes
    -----
    Only the first entry of a case's ``opinions`` list is used. A case with
    no opinions gets an empty opinion text (zero tokens) rather than
    inheriting the text of the previously processed case.
    """
    # Position of the column holding the nested case record (a dict with a
    # 'data' key). Presumably this is the 'casebody' column -- TODO confirm
    # and select it by name, since a positional index breaks if the JSON
    # schema gains or loses a field.
    casebody_col = 15

    # reading json files: the archive is opened in binary mode, so each line
    # is decoded explicitly before parsing
    with lzma.open(state + '/data/data.jsonl.xz', 'r') as jsonl_file:
        cases = [json.loads(str(line, 'utf-8')) for line in jsonl_file]

    df = pd.DataFrame(cases).sort_values('decision_date').reset_index(drop=True)
    df['decision_date'] = pd.to_datetime(df['decision_date'])

    # parsing data
    storage = []
    for casebody in df.iloc[:, casebody_col]:
        data = casebody['data']

        # Default to an empty text when the case has no opinions; otherwise
        # the value from the previous iteration would be reused (or the name
        # would be unbound on the very first row).
        case_opinions = data['opinions']
        opinions = case_opinions[0]['text'] if case_opinions else ''

        storage.append({'judges': data['judges'],
                        'attorneys': data['attorneys'],
                        'headnotes': text_cleaner(data['head_matter']),
                        'opinions': text_cleaner(opinions)})

    # df was re-indexed 0..n-1 above and storage follows the same row order,
    # so an index-on-index merge lines the parsed fields up with their cases
    df_parsed = pd.DataFrame(storage)
    df = df_parsed.merge(df, left_index=True, right_index=True)

    # tokenizing headnotes and opinions
    df['headnotes'] = df['headnotes'].apply(tokenizer.tokenize)
    df['opinions'] = df['opinions'].apply(tokenizer.tokenize)
    df['headnotes_num_tokens'] = df['headnotes'].map(len)
    df['opinions_num_tokens'] = df['opinions'].map(len)

    return df

In [None]:
# States to load; get_data uses each name as the directory prefix of its data file.
states = ['Arkansas', 'Illinois', 'New Mexico', 'North Carolina']

# Load every state in list order, then bind the per-state names to the same frames.
dfs = [get_data(state_name) for state_name in states]
df_ar, df_il, df_nm, df_nc = dfs

In [1]:
# One column of panels per state: headnote token counts on the top row and
# opinion token counts on the bottom row, both plotted against decision date.
# squeeze=False keeps `ax` two-dimensional even if only one state is listed.
f, ax = plt.subplots(2, len(states), figsize=[18, 8], squeeze=False)
plt.subplots_adjust(hspace=0.4)
for c in range(len(states)):
    df = dfs[c]
    ax[0][c].plot(df.decision_date, df.headnotes_num_tokens, alpha=0.5)
    ax[0][c].set_title('Headnotes: ' + states[c], fontsize=20)
    ax[0][c].set_xlabel('Year of Decision', fontsize=16)
    ax[0][c].set_ylabel('Token Count', fontsize=16)

    ax[1][c].plot(df.decision_date, df.opinions_num_tokens)
    ax[1][c].set_title('Opinions: ' + states[c], fontsize=20)
    ax[1][c].set_xlabel('Year of Decision', fontsize=16)
    ax[1][c].set_ylabel('Token Count', fontsize=16)

# Render once, after every panel has been drawn; showing inside the loop
# would display the figure with only the first state's panels filled in.
plt.show()