# Opinion Text Clean

## Load data

In [1]:
import json
# Parse the whole JSON file into memory. The context manager closes the file
# handle as soon as parsing is done instead of leaving it to the garbage collector.
with open('decision_text.json', 'r') as json_file:
    full_data = json.load(json_file)

In [2]:
# Display the top-level keys of the loaded JSON object.
full_data.keys()

dict_keys(['author', 'decision', 'opinion', 'cleaned_text'])

In [3]:
# Snapshot of whatever is stored under 'cleaned_text' in the file as loaded.
# NOTE(review): raises KeyError if the JSON file has no 'cleaned_text' entry yet — confirm the key exists on a first run.
clean_text = full_data['cleaned_text']

In [3]:
# Opinion texts; these are the inputs passed to clean_citation further down.
op_text = full_data['opinion']

In [4]:
# Peek at the first 100 characters of the first opinion.
op_text[0][:100]

'Apple appeals from the final decision of the International Trade Commission (ITC) that the asserted '

## Clean Text

### Remove Citations

In [6]:
import re
from reporters_db import EDITIONS
# Register a space-free alias for every entry in EDITIONS (key and value both
# have their spaces stripped). Iterate over a snapshot of the keys because the
# dict gains entries inside the loop.
for key in tuple(EDITIONS):
    EDITIONS[key.replace(" ", "")] = EDITIONS[key].replace(" ", "")

# Verbose-mode regex for a reporter citation: a leading delimiter (or start of
# line), a volume number, one of the EDITIONS keys, any run of whitespace or
# letters, and a trailing number.
CITATION_PTN = r"""
(?:[\s,:\(]|^)
(
(\d+)\s+
({reporters})(\s|[a-z])+
(\d+)
)
""".format(reporters='|'.join(map(re.escape, EDITIONS)))
CITATION_PTN_RE = re.compile(
    CITATION_PTN,
    flags=re.VERBOSE | re.DOTALL | re.MULTILINE | re.IGNORECASE,
)

In [7]:
# Verbose-mode regex for statutory / regulatory citations: a leading number,
# "USC" or "CFR" (periods optional, case-insensitive), an optional "Sec",
# "Sec.", "Section" or "§" marker, then an identifier that starts with digits
# and may continue with letters, digits or hyphens.
REGULATION_PTN = r"""
(
(\d+)\s*
(U\.?S\.?C\.?|C\.?F\.?R\.?)\s*
(Sec(?:tion|\.)?|§)?\s*
(\d+[\da-zA-Z\-]*)
)"""
REGULATION_PTN_RE = re.compile(
    REGULATION_PTN,
    flags=re.VERBOSE | re.DOTALL | re.MULTILINE | re.IGNORECASE,
)

# Verbose-mode regex for session-law citations. Two alternatives:
#   * "Pub. L." / "Public Law", optionally followed by "No" or "No.", then a
#     hyphenated number such as 94-553;
#   * a Statutes-at-Large cite such as "90 Stat. 2541".
# The period after "No" is escaped so it matches a literal "." only; left
# unescaped it is a wildcard and also accepts strings such as "Nox".
PUBLIC_LAW_PTN = r"""
(
Pub(?:lic|\.)\s+L(?:aw|\.)(?:\s+No\.?)?\s+\d+\-\d+
|
\d+\s+Stat\.\s+[\d-]+
)
"""
PUBLIC_LAW_PTN_RE = re.compile(PUBLIC_LAW_PTN, 
                               re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
# Captures the first hyphenated number (e.g. "94-553") that follows at least
# one other character.
# NOTE(review): not referenced elsewhere in the visible notebook — confirm it is still needed.
PUBLIC_LAW_SUB_RE = re.compile(r'.+?(\d+\-\d+)',
                               re.MULTILINE | re.DOTALL)




In [8]:
# Matches the court/year parenthetical that trails a citation, e.g.
# "(Fed.Cir.2010)" or "(Fed. Cir. 2010)". The periods are literal and optional
# and whitespace is allowed between the parts; previously the bare "." was a
# single-character wildcard, so the spaced form with periods never matched.
FED_CIR_RE = re.compile(r'\(Fed\.?\s*Cir\.?\s*\d*\)', re.MULTILINE | re.DOTALL)

In [9]:
# Matches the publisher footer "© <digits> Thomson Reuters" plus the single
# character that follows it.
# NOTE(review): the trailing "." is an unescaped wildcard (and, with DOTALL,
# also matches a newline) — confirm whether a literal period was intended.
MARK_RE = re.compile(
    r'©\s\d+\sThomson Reuters.',
    flags=re.DOTALL | re.MULTILINE,
)

In [10]:
# Ordered (pattern, replacement) pairs applied by clean_citation.
# Order matters: the citation-style patterns must run before punctuation and
# digits are blanked out, otherwise they would no longer match.
replacements = [
    (CITATION_PTN_RE, ''),
    (REGULATION_PTN_RE, ''),
    (PUBLIC_LAW_PTN_RE, ''),
    (FED_CIR_RE, ''),
    (MARK_RE, ''),
    (r'[^\w]', ' '),   # every non-word character becomes a space
    (r'\d+', ' '),     # remaining digit runs become a space
    # Drop stand-alone single letters. The replacement is a space (not '') so
    # the neighbouring words are not glued together ("is a test" must not
    # become "istest"), and the trailing whitespace is only looked at, not
    # consumed, so consecutive single letters ("a b c") are all removed.
    (r'\s+[A-Za-z](?=\s)', ' '),
    (r'\s+', ' ')      # collapse whitespace runs to one space
]

def clean_citation(input_str):
    """Strip citations and noise from one opinion text.

    Applies every (pattern, replacement) pair in ``replacements`` in order
    using ``re.sub``.

    Parameters
    ----------
    input_str : str
        Text to clean. Must be a string; callers are expected to skip None.

    Returns
    -------
    str
        The text after all substitutions have been applied.
    """
    for old, new in replacements:
        input_str = re.sub(old, new, input_str)
    return input_str

In [11]:
# Number of entries in op_text.
len(op_text)

854

In [12]:
# Clean every opinion while keeping list positions aligned with op_text;
# entries that are None are left as None.
cleaned_text = [
    None if opinion is None else clean_citation(opinion)
    for opinion in op_text
]

## Write to JSON file

In [14]:
# Store the cleaned texts under the 'cleaned_text' key, replacing any previous value.
full_data['cleaned_text'] = cleaned_text

In [15]:
# Display the keys again to check the structure before writing the file.
full_data.keys()

dict_keys(['author', 'decision', 'opinion', 'cleaned_text'])

In [16]:
# Overwrite the source JSON file with the updated data. The context manager
# guarantees the handle is flushed and closed once the dump completes.
with open('decision_text.json', 'w') as json_file:
    json.dump(full_data, json_file)

## Write to CSV

In [5]:
import pandas as pd

In [7]:
# Build the export frame from full_data['cleaned_text'] so the CSV carries the
# same values as the JSON file, rather than the separate `clean_text` snapshot
# that was taken right after loading (before any cleaning in this run).
text_import = pd.DataFrame({'cleaned_text': full_data['cleaned_text']})


In [8]:
# Display (rows, columns) of the export frame.
text_import.shape

(854, 1)

In [9]:
# Write the frame to disk. Note: fields are tab-separated (sep='\t') even
# though the file name ends in .csv.
text_import.to_csv('cleaned_text.csv',sep='\t')