## This notebook contains the code for preprocessing the citation data. The preprocessed data will be used by the node2vec algorithm to generate citation embeddings.

In [1]:
# this file is used to preprocess some data we will use later
import os
import pandas as pd
import numpy as np
import time
import zipfile

In [2]:
# Locations used throughout the notebook: the zipped citation dump, the folder
# of raw citation files that convert_to_binary_citation_data reads, and the
# folder / CSV file holding the processed citation table.
citation_data_path = '../all_citations'
citation_data_path_zipped = '/data/Dropbox/judge_embedding_data_sp18/citation.zip'
processed_data_path = '/data/Dropbox/judge_embedding_data_sp18/processed_citations'
# The combined CSV lives directly inside processed_data_path.
processed_citation_path = processed_data_path + '/citation_data.csv'

In [3]:
def convert_to_binary_citation_data(data_folder_path,processed_data_folder_path,
                                    data_binary_name="citation_data",
                                    data_count_limit = 999999999,verbose=1):
    """Collect every citation line found under ``data_folder_path`` into one DataFrame.

    Layout this function reads: ``data_folder_path`` holds one sub-folder per
    year whose name ends in the four-digit year (e.g. ``citation_1892``). Each
    sub-folder holds text files named
    ``<prefix>_<caseid>_<case_type>_<judge_name>[_...]`` plus a four-character
    extension such as ``.txt``, with one citation per line.

    Parameters
    ----------
    data_folder_path : str
        Root folder containing the per-year sub-folders.
    processed_data_folder_path : str
        Not used by this function; kept so existing callers keep working.
    data_binary_name : str
        Not used by this function; kept so existing callers keep working.
    data_count_limit : int
        Soft cap on the number of citation rows (useful for quick tests). The
        cap is checked after each file, so the result can exceed it by the
        contents of the last file read.
    verbose : int
        When greater than 0, print one progress line per year folder.

    Returns
    -------
    pandas.DataFrame
        One row per citation line with columns ``caseid``, ``year``,
        ``judge_last_name``, ``case_type`` and ``citation_name`` (all strings).
    """
    start_time = time.time()
    data_count = 0
    data_list = []
    limit_reached = False

    for folder_name in sorted(os.listdir(data_folder_path)): # for each year folder
        if limit_reached:
            break

        folder_path = os.path.join(data_folder_path,folder_name)
        if not os.path.isdir(folder_path):
            # stray files next to the year folders would make os.listdir raise
            continue

        if verbose > 0:
            print("now process:",folder_name,"current data count:",data_count,"time used:",time.time()-start_time)

        year = folder_name[-4:]

        # sorted so that the row order (and every index derived from it later)
        # does not depend on the filesystem's listing order
        for file_name in sorted(os.listdir(folder_path)): # for each file
            file_name_without_txt = file_name[:-4]
            file_name_tokens = file_name_without_txt.split("_")

            # tokens 1..3 are read below, so at least four tokens are required
            if len(file_name_tokens)<4:
                print("file format incorrect at file:",file_name)
                continue

            caseid = file_name_tokens[1]
            case_type = file_name_tokens[2]
            judge_name = file_name_tokens[3] # we get the judge's name from the file name

            file_path = os.path.join(folder_path,file_name)
            # context manager: the handle is closed even if reading fails
            with open(file_path,"r") as fpt:
                for line in fpt:
                    citation_name = line.strip()
                    data_list.append([caseid,year,judge_name,case_type,citation_name])
                    data_count += 1

            if data_count > data_count_limit: # for debugging purposes
                limit_reached = True
                break

    df = pd.DataFrame(data_list,columns = ["caseid","year","judge_last_name","case_type","citation_name"])

    return df

In [4]:
# uncomment if you don't have preprocessed data
# total data is around 450,000, of total size about 6GB
#df_ready_to_csv = convert_to_binary_citation_data(citation_data_path, processed_data_path,verbose=1)
#df_ready_to_csv.to_csv(os.path.join(processed_data_path,"citation_data.csv"))


now process: citation_1891 current data count: 0 time used: 0.006191253662109375
now process: citation_1892 current data count: 0 time used: 0.03186917304992676
now process: citation_1893 current data count: 5 time used: 0.37296319007873535
now process: citation_1894 current data count: 11 time used: 0.6360573768615723
now process: citation_1895 current data count: 32 time used: 0.7228295803070068
now process: citation_1896 current data count: 39 time used: 0.8064494132995605
now process: citation_1897 current data count: 63 time used: 0.8965437412261963
now process: citation_1898 current data count: 109 time used: 1.0640411376953125
now process: citation_1899 current data count: 133 time used: 1.154466152191162
now process: citation_1900 current data count: 156 time used: 1.2659878730773926
now process: citation_1901 current data count: 194 time used: 1.3966481685638428
now process: citation_1902 current data count: 220 time used: 1.495680332183838
now process: citation_1903 current d

now process: citation_1991 current data count: 1339270 time used: 251.6859905719757
now process: citation_1992 current data count: 1414220 time used: 259.365531206131
now process: citation_1993 current data count: 1496670 time used: 267.97916531562805
now process: citation_1994 current data count: 1579884 time used: 275.5684063434601
now process: citation_1995 current data count: 1667299 time used: 286.1555709838867
now process: citation_1996 current data count: 1751136 time used: 293.7046115398407
now process: citation_1997 current data count: 1834992 time used: 300.60383319854736
now process: citation_1998 current data count: 1918420 time used: 306.9119622707367
now process: citation_1999 current data count: 2003764 time used: 314.20783710479736
now process: citation_2000 current data count: 2090495 time used: 321.1722490787506
now process: citation_2001 current data count: 2179997 time used: 326.87858390808105
now process: citation_2002 current data count: 2274894 time used: 333.643

In [5]:
# Load the processed citation table; the first CSV column is used as the row index.
df_citation_data = pd.read_csv(filepath_or_buffer=processed_citation_path, index_col=0)

  mask |= (ar1 == a)


In [6]:
# Preview the first five rows of the loaded citation table.
df_citation_data.head(n=5)

Unnamed: 0,caseid,year,judge_last_name,case_type,citation_name
0,XFL742,1892,THAYER,contentMajOp,48 F. 62
1,XFLJLG,1892,THAYER,contentMajOp,16 F. 348
2,XFLJLG,1892,THAYER,contentMajOp,36 F. 668
3,XFL7H7,1892,MORROW,contentMajOp,49 F. 723
4,X9T9H7,1892,DEADY,contentMajOp,38 F. 789


In [8]:
# Same five-row preview of the citation table as the previous cell.
df_citation_data.head(n=5)

Unnamed: 0,caseid,year,judge_last_name,case_type,citation_name
0,XFL742,1892,THAYER,contentMajOp,48 F. 62
1,XFLJLG,1892,THAYER,contentMajOp,16 F. 348
2,XFLJLG,1892,THAYER,contentMajOp,36 F. 668
3,XFL7H7,1892,MORROW,contentMajOp,49 F. 723
4,X9T9H7,1892,DEADY,contentMajOp,38 F. 789


In [7]:
# Distinct case ids, in order of first appearance; the cell displays how many there are.
case_id_list_unique = df_citation_data['caseid'].unique()
len(case_id_list_unique)

281593

In [8]:
# Distinct citation names, in order of first appearance; the cell displays how many there are.
citation_name_unique = df_citation_data['citation_name'].unique()
len(citation_name_unique)

350840

In [9]:
# Keep only the two columns that define an edge: the citing case and the cited name.
df_citation_data = df_citation_data.loc[:, ['caseid', 'citation_name']]

In [10]:
# Dump the raw (caseid, citation_name) pairs as a tab-separated edge list.
# NOTE(review): a later cell reopens 'citation.edgelist' in 'w' mode and
# rewrites it with integer ids, so this raw dump does not survive a full run.
# temp_str only receives whatever to_csv returns and is not read in the cells
# shown in this notebook.
temp_str = df_citation_data.to_csv(path_or_buf='citation.edgelist', sep='\t', header=False, index=False)

In [11]:
# Map every distinct case id to its position in case_id_list_unique (0-based).
case_id_to_index = {case_id: position for position, case_id in enumerate(case_id_list_unique)}

In [12]:
# Spot check: look at the case id stored at position 5.
case_id_list_unique[5]

'XEIVTFQNB5G0'

In [13]:
# Spot check: the mapping should send that case id back to position 5.
case_id_to_index['XEIVTFQNB5G0']

5

In [14]:
# Map every distinct citation name to its position in citation_name_unique (0-based).
citation_name_to_index = {name: position for position, name in enumerate(citation_name_unique)}

In [16]:
# Spot check: look at the citation name stored at position 5.
citation_name_unique[5]

'48 F. 21'

In [17]:
# Spot check: the mapping should send that citation name back to position 5.
citation_name_to_index['48 F. 21']

5

In [20]:
# Preview the first few edges: print the raw pair, then the
# "<case index> <citation index>" line built from the two mappings.
for row_label, record in df_citation_data.iterrows():
    case_id, citation = record['caseid'], record['citation_name']
    print(' '.join((case_id, citation)))

    line = '{} {}\n'.format(case_id_to_index[case_id], citation_name_to_index[citation])
    print(line)
    # stop once the row label has passed 5
    if row_label > 5:
        break

XFL742 48 F. 62
0 0

XFLJLG 16 F. 348
1 1

XFLJLG 36 F. 668
1 2

XFL7H7 49 F. 723
2 3

X9T9H7 38 F. 789
3 4

XEIVMJQNB5G0 48 F. 21
4 5

XEIVTFQNB5G0 51 F. 130
5 6



In [21]:
# Write the case -> citation edge list: one "<case index> <citation index>"
# line per row of df_citation_data.
# Both id columns are translated with vectorised Series.map calls instead of a
# Python-level iterrows() loop, which is very slow on millions of rows.
# NOTE(review): case indices and citation indices are both numbered from 0, so
# the same integer names a case in column 1 and a citation in column 2 --
# confirm that the node2vec consumer expects the two id spaces to overlap.
case_to_citation_edges = pd.DataFrame({
    'case_index': df_citation_data['caseid'].map(case_id_to_index),
    'citation_index': df_citation_data['citation_name'].map(citation_name_to_index),
})
# A key missing from a mapping shows up as NaN and would silently turn the ids
# into floats; fail loudly instead (the row-wise loop raised KeyError here).
if case_to_citation_edges.isna().values.any():
    raise KeyError("a caseid or citation_name is missing from the index mappings")
case_to_citation_edges.to_csv('citation.edgelist', sep=' ', header=False, index=False)
print("done")
    

done


In [31]:
# Write the citation -> case edge list: one "<citation index> <case index>"
# line per row of df_citation_data (columns reversed relative to
# 'citation.edgelist').
# Both id columns are translated with vectorised Series.map calls instead of a
# Python-level iterrows() loop, which is very slow on millions of rows.
# NOTE(review): citation indices and case indices are both numbered from 0, so
# the same integer names a citation in column 1 and a case in column 2 --
# confirm that the node2vec consumer expects the two id spaces to overlap.
citation_to_case_edges = pd.DataFrame({
    'citation_index': df_citation_data['citation_name'].map(citation_name_to_index),
    'case_index': df_citation_data['caseid'].map(case_id_to_index),
})
# A key missing from a mapping shows up as NaN and would silently turn the ids
# into floats; fail loudly instead (the row-wise loop raised KeyError here).
if citation_to_case_edges.isna().values.any():
    raise KeyError("a caseid or citation_name is missing from the index mappings")
citation_to_case_edges.to_csv('cases_citegraph.edgelist', sep=' ', header=False, index=False)
print("done")
    

done


In [None]:
# List the parent directory (shell command run through IPython's `!` escape).
!ls ../

In [22]:
import pickle

# Persist the caseid -> index mapping so the integer node ids in the edge lists
# can be translated back to case ids later.
with open('case_id_to_index.pickle', mode='wb') as pickle_file:
    pickle.Pickler(pickle_file, protocol=pickle.HIGHEST_PROTOCOL).dump(case_id_to_index)
print("done")

done


In [25]:
# List what is already stored in the shared node2vec data folder.
!ls /data/Dropbox/judge_embedding_data_sp18/citation_graph_data_node2vec

case_id_to_index.pickle  citation.edgelist  citation_name_to_index.pickle


In [30]:
# Copy the locally written case-id mapping into the shared node2vec data folder.
!scp case_id_to_index.pickle /data/Dropbox/judge_embedding_data_sp18/citation_graph_data_node2vec/.

In [40]:
import pickle

# Persist the caseid -> index mapping under the file name that advertises it.
# This cell used to dump citation_name_to_index into 'case_id_to_index.pickle',
# which silently replaced the case mapping with the citation mapping; the
# citation mapping has its own file, 'citation_name_to_index.pickle'.
with open('case_id_to_index.pickle', 'wb') as handle:
    pickle.dump(case_id_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("done")

done


In [23]:
import pickle

# Persist the citation_name -> index mapping so the integer node ids in the
# edge lists can be translated back to citation names later.
with open('citation_name_to_index.pickle', mode='wb') as pickle_file:
    pickle.Pickler(pickle_file, protocol=pickle.HIGHEST_PROTOCOL).dump(citation_name_to_index)
print("done")

done


# Producing Node2Vec
To produce the node2vec embeddings, install the node2vec package from https://github.com/aditya-grover/node2vec.

Run `python node2vec_path/src/main.py --input cases_citegraph.edgelist --output citation_embeddings.emd`, where `node2vec_path` is the location of the node2vec checkout.