In [12]:
# Phu, Andrea and Watcher
# 2018 Spring
import torch
import torch.nn as nn
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torch import FloatTensor, LongTensor
import numpy as np
import pandas as pd
import time
import os
import pickle
import string
import torch.utils.data as data_utils
import psutil
from random import shuffle
from sklearn.utils import shuffle as skshuffle
import random

torch.manual_seed(44)
random.seed(44)
np.random.seed(44)

## We first read in the data

In [2]:
processed_data_path = '/data/Dropbox/judge_embedding_data_sp18'
all_data_save_path = os.path.join(processed_data_path,"finalized_all_data","all_data_dict.pkl")
all_data_support_save_path = os.path.join(processed_data_path,"finalized_all_data","all_data_dict_support.pkl")
all_data_df_save_path = os.path.join(processed_data_path,"finalized_all_data","all_data_df.h5")
finished_embedding_folder_path = os.path.join(processed_data_path,'finished_judge_embedding')


In [3]:
all_data_dict = pickle.load(open(all_data_save_path,"rb"))

In [4]:
all_data_dict.keys()

dict_keys(['README', 'data_df', 'judge_id_to_index', 'judge_index_to_id', 'topic_glove_emb', 'citation_emb', 'case_to_citation_emb_index', 'citation_name_to_emb_index'])

In [5]:
topic_glove_emb = all_data_dict['topic_glove_emb'][0]
judgeId2Index = all_data_dict['judge_id_to_index'][0]
judgeIndex2Id = all_data_dict['judge_index_to_id'][0]
all_data_df = all_data_dict['data_df']
all_data_df

Unnamed: 0,caseid,year,middle_part,opinion_text,songername,judgeidentificationnumber,topic,geniss,opinion_vector,judge_embed_index,Circuit,date,affirmdummy,judge_decision
0,XB0PMNQNB5G0,1891,contentMajOp,"[0, 202, 1268, 13, 0, 4600, 3, 0, 247954, 128,...","PARDEE, DON A.",1830.0,Civil Procedure,4.0,"[-0.07445931306292275, 0.05843820821632856, -0...",0,5,1891-12-07,0.0,0
1,XFL757,1891,contentMajOp,"[15, 116, 4, 9238, 7, 22547, 3, 4092, 1219, 0,...","BROWN, HENRY BILLINGS",284.0,Civil Procedure,4.0,"[-0.07101038203845275, 0.029874279749715626, -...",1,6,1891-10-06,0.0,0
2,XB0PMRQNB5G0,1891,contentMajOp,"[215, 6, 68193, 3527, 255, 9670, 5, 12375, 3, ...","PARDEE, DON A.",1830.0,Contracts,7.0,"[-0.059691082627907024, 0.10039284279069777, -...",0,5,1891-12-07,0.0,0
3,XFL7KI,1891,contentMajOp,"[5, 0, 935, 4951, 5, 8151, 0, 4490, 3, 0, 305,...","LOCKE, JAMES WILLIAM",1410.0,Civil Procedure,4.0,"[-0.13210430623456784, 0.12352108086419746, -0...",2,5,1891-11-27,0.0,0
4,XB0OI7QNB5G0,1891,contentMajOp,"[5214, 10, 894, 2091, 4297, 21, 0, 14485, 1268...","NELSON, SUSAN RICHARD",3339.0,,,"[-0.09034438144397865, 0.06449958495506274, -0...",3,8,,0.0,0
5,XB0PHFQNB5G0,1891,contentMajOp,"[0, 377, 202, 3, 14963, 386, 92, 3, 745, 98, 0...","PARDEE, DON A.",1830.0,Torts,7.0,"[-0.06601334912248623, 0.019215826325411318, -...",0,5,1891-11-27,0.0,0
6,XB0PJRQNB5G0,1891,contentMajOp,"[202, 1268, 21, 0, 14485, 38, 14, 60389, 187, ...","BRUCE, JOHN",292.0,Corporate Law,7.0,"[-0.11053901197255082, 0.10235485285668748, -0...",4,5,1891-12-07,1.0,1
7,XFL6PE,1891,contentMajOp,"[15, 116, 4, 18428, 0, 305, 4, 0, 1399, 202, 3...","BROWN, HENRY BILLINGS",284.0,Civil Procedure,4.0,"[-0.05526258051118215, 0.049538111821086264, -...",1,6,1891-10-06,0.0,0
8,XFL6LS,1891,contentMajOp,"[3, 133, 10550, 10, 1609, 400000, 0, 58, 84, 4...","SAGE, GEORGE READ",2089.0,Patent Law,7.0,"[-0.09545400352665524, 0.10511163229642662, -0...",5,6,1891-10-06,0.0,0
9,XB0PNFQNB5G0,1891,contentMajOp,"[0, 2536, 3, 323, 1853, 128, 3, 50, 196, 7091,...","PARDEE, DON A.",1830.0,Mortgages & Liens,7.0,"[-0.08174452291734276, 0.08172237472717521, -0...",0,5,1891-12-07,1.0,1


## We will do a normalization of doc vectors, using center by topic-year
## That means for each doc vector, we reduce from it the topic-year vector it is related to.
## E.g. if a doc vector is from 1981 and topic of civil procedure, then we reduce it by the average vector of all vectors that are from 1981 and of topic civil procedure.

In [6]:
def get_year_topic_dict(df):
    # given all data df, give the year-topic average vectors
    # in the form of a year-topic dictionary
    # key is year-topic string, and value is a 300 dim vector
    # which is the average of all the cases with that year and topic
    
    yearTopicDict = {}
    yearTopicCount = {}
    
    n_instance = df.shape[0]
    for i in range(n_instance):
        case = df.iloc[i]
        year = case['year']
        topic = case['topic']
        yearTopic = str(year)+"-"+str(topic)
        opinion_vector = case['opinion_vector']
        
        if yearTopic not in yearTopicDict:
            yearTopicDict[yearTopic] = np.zeros(300) + opinion_vector
            yearTopicCount[yearTopic] = 1
        else:
            yearTopicDict[yearTopic] += opinion_vector
            yearTopicCount[yearTopic] += 1
            
    # now we accumulated all the cases
    # we do an average for each dictionary entry
    for k in yearTopicDict.keys():
        yearTopicDict[k] /= yearTopicCount[k]
        
    return yearTopicDict

def add_centered_vec_to_df(df,year_topic_dict,verbose=0):
    starttime = time.time()
    df['centered_opinion_vec'] = None
    n_instance = df.shape[0]
    for i in range(n_instance):
        if verbose and (i==2 or i%int(n_instance/20)==0 ) :
            print(i,time.time()-starttime)
        case = df.iloc[i]
        year = case['year']
        topic = case['topic']
        yearTopic = str(year)+"-"+str(topic)
        meanTopicYear = year_topic_dict[yearTopic]
        opinion_vector = case['opinion_vector']    
        centered_vec = opinion_vector - meanTopicYear
        
        df.iat[i,df.columns.get_loc('centered_opinion_vec')] = centered_vec
    

In [7]:
year_topic_dict = get_year_topic_dict(all_data_df)

In [8]:
add_centered_vec_to_df(all_data_df,year_topic_dict,verbose=1)

0 0.007372617721557617
2 0.00828862190246582
14775 4.25502610206604
29550 8.351835012435913
44325 12.538181781768799
59100 16.870078086853027
73875 20.86382532119751
88650 24.983389616012573
103425 29.175971508026123
118200 33.41242718696594
132975 37.516276121139526
147750 41.65259909629822
162525 45.983999490737915
177300 50.15865993499756
192075 54.2484712600708
206850 58.447778940200806
221625 62.58998370170593
236400 66.68147850036621
251175 70.73505401611328
265950 74.99031376838684
280725 79.15841269493103
295500 83.12067317962646


## Now we have the data in memory, we first produce the naive judge embedding

In [9]:
def get_naive_judge_embedding(all_data_df,judgeIndex2Id, start_year, end_year, verbose=0):
    # init empty judge embedding
    # will only count year in [start_year,end_year)
    num_judge = len(judgeIndex2Id.keys())
    naive_judge_embedding = np.zeros((num_judge,300))
    for index in range(num_judge): # for each judge index
        if verbose==1 and (index+1)%300==0:
            print(index)
        cases_of_this_judge = all_data_df.loc[all_data_df['judge_embed_index']==index]
        # only count cases in the year range
        cases_of_this_judge = cases_of_this_judge.loc[lambda df: (df['year'] >= start_year) & (df['year'] < end_year)]
        num_cases = cases_of_this_judge.shape[0]
        for c in range(num_cases):
            # use iloc to iterate through all cases of this judge
            #WE NOW USE CENTERED OPINION VECTOR
            case_opinion_vector = cases_of_this_judge.iloc[c]['centered_opinion_vec'] # (300,)
            naive_judge_embedding[index,:] += case_opinion_vector # add to the judge embedding vector
        # now we added all cases, we need get average
        if num_cases != 0:
            naive_judge_embedding[index,:] /= num_cases
    return naive_judge_embedding

In [10]:
# get naive judge emb for all data points
naive_judge_embedding = get_naive_judge_embedding(all_data_df,judgeIndex2Id,0,3000,1)

299
599
899
1199
1499
1799


In [79]:
# save embedding to disk
centered_naive_emb_path = os.path.join(finished_embedding_folder_path,"centered_naive_emb.pkl")
pickle.dump(naive_judge_embedding,open(centered_naive_emb_path,"wb"))

## Now we will go for more sophisticated embeddings we will convert the data into numpy matrix and get ready for training.
## we first split the data

In [80]:


def train_val_test_split(data_df,number_judges,train_ratio=0.8,val_ratio=0.1,verbose=0,toshuffle=True):
    starttime= time.time()
    sorted_all_data = data_df.sort_values(by='judge_embed_index')
    train_indexes = []
    val_indexes = []
    test_indexes = []
    currentiloc = 0
    for judge_index in range(number_judges):
        if verbose and judge_index%500 == 0:
            print(judge_index,time.time()-starttime)
        
        cases_of_this_judge = sorted_all_data.loc[sorted_all_data['judge_embed_index'] == judge_index]
        number_cases = cases_of_this_judge.shape[0]
        n_of_train = int(number_cases*train_ratio)
        n_of_val = int(number_cases*val_ratio)
        
        nextiloc = currentiloc+number_cases
        
        indexes = [i for i in range(currentiloc, nextiloc)]
        shuffle(indexes)
        
        train_indexes += indexes[:n_of_train]
        val_indexes += indexes[n_of_train:n_of_train+n_of_val]
        test_indexes += indexes[n_of_train+n_of_val:]
        
        currentiloc = nextiloc
    return skshuffle(data_df.loc[train_indexes]),skshuffle(data_df.loc[val_indexes]),skshuffle(data_df.loc[test_indexes])

In [81]:
data_train, data_val, data_test = train_val_test_split(all_data_df,2099,verbose=1)

0 0.15186810493469238
500 0.6101148128509521
1000 1.0499610900878906
1500 1.4779243469238281
2000 1.9162797927856445


## Now we get opinion vector, ruling label and topic vector to make data into X, y numpy matrix forms

In [82]:
data_train.shape

(235456, 15)

In [84]:
def df_to_Tensor(df,topic_glove_emb,verbose=0):
    # use this to convert a dataframe to torch tensor
    feature_dim = 300+300+2
    X = np.zeros((df.shape[0],feature_dim))
    y = np.zeros(df.shape[0])
    
    for i in range(df.shape[0]):
        if verbose and i%10000==0:
            print(i)
        
        data_entry = df.iloc[i]
        
        X[i,:300] = data_entry['centered_opinion_vec']
        topic = data_entry['topic']
        topic = str.lower(str(topic)) 
        
        if topic not in topic_glove_emb: # deal with any unknown topic
            topic = "<UNK>"
            
        X[i,300:600] = topic_glove_emb[topic]
        decision = data_entry['judge_decision']
        X[i,600+decision] = 1 # one hot representation for judge decision
        y[i] = data_entry['judge_embed_index']
        
    return FloatTensor(X),LongTensor(y)

In [85]:
X_train, y_train = df_to_Tensor(data_train,topic_glove_emb,1)
X_val, y_val = df_to_Tensor(data_val,topic_glove_emb,1)
X_test, y_test = df_to_Tensor(data_test,topic_glove_emb,1)


0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
0
10000
20000
0
10000
20000
30000


## Now we have X_train, y_train and in Tensor form, we can now create the model and start our training

In [89]:
BATCH_SIZE = 128
train_dataset = data_utils.TensorDataset(data_tensor=X_train,target_tensor=y_train)
train_loader = data_utils.DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True)


In [104]:
class Judge_emb_model(nn.Module):
    def __init__(self, input_dim, hidden_layer_dim, embedding_dim, num_judges):
        super(Judge_emb_model,self).__init__()
        # input is m x D
        self.linear1 = nn.Linear(input_dim,hidden_layer_dim) # D x H 
        self.dropout1 = nn.Dropout(p=0.5)
        self.linear2 = nn.Linear(hidden_layer_dim,hidden_layer_dim) # H x H
        self.dropout2 = nn.Dropout(p=0.5)
        
        self.judge_embedding = nn.Linear(embedding_dim,num_judges) # H x J
        # the output is m x J
        
        self.init_weights()

    def forward(self, X):
        out = F.relu(self.linear1(X))
        out = self.dropout1(out)
        out = F.relu(self.linear2(out))
        out = self.dropout2(out)
        out = self.judge_embedding(out)
        
        # now we have m x J matrix, for m data points, we can do log softmax
        log_prob = F.log_softmax(out,dim=1)
        return log_prob # for each opinion data, this is probability of which judge writes this opinion
    
    def init_weights(self):
        linear_layers = [self.linear1,self.linear2,self.judge_embedding]
        for layer in linear_layers:
            layer.weight.data.normal_(0.0,0.1)


In [105]:
INPUT_DIM = 602
HIDDEN_DIM = 300
EMBED_DIM = 300
number_judges = 2099
model = Judge_emb_model(input_dim=INPUT_DIM,hidden_layer_dim=HIDDEN_DIM,
                        embedding_dim=EMBED_DIM,num_judges=number_judges)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.005)

In [106]:
N_EPOCH = 1
TRAIN_SIZE = train_dataset.data_tensor.shape[0]
print("Training data size",TRAIN_SIZE)
train_losses = []
val_losses = []

X_val_var = Variable(X_val)
y_val_var = Variable(y_val)
model.eval()
y_pred_val = model.forward(X_val_var)
val_loss = criterion(y_pred_val,y_val_var)
print("initial val loss",val_loss.data[0])
startTime = time.time()
model.train()

for i_epoch in range(N_EPOCH):
    epoch_train_loss = 0
    num_batches_per_epoch = int(TRAIN_SIZE/BATCH_SIZE)
    for i_batch,(X_batch, y_batch) in enumerate(train_loader):
        optimizer.zero_grad()
        
        X_var, y_var = Variable(X_batch),Variable(y_batch)
        
        y_pred = model.forward(X_var)
        loss = criterion(y_pred,y_var)
        loss.backward()
        
        optimizer.step()
#         if i_batch % 2000 == 0:
#             print(i_epoch,i_batch,loss.data[0])
        epoch_train_loss += loss.data[0]
        
    # after each epoch
    
    X_val_var = Variable(X_val)
    y_val_var = Variable(y_val)
    model.eval()
    y_pred_val = model.forward(X_val_var)
    val_loss = criterion(y_pred_val,y_val_var)
    ave_train_loss = epoch_train_loss/num_batches_per_epoch
    print("epoch",i_epoch,"ave_train_loss",
          ave_train_loss,"validation loss:",val_loss.data[0],time.time()-startTime)
    val_losses.append(val_loss.data[0])
    train_losses.append(ave_train_loss)
    model.train()

Training data size 235456
initial val loss 9.389286041259766
epoch 0 ave_train_loss 6.479643431223236 validation loss: 6.383147239685059 86.94050216674805


In [107]:
trained_emb = model.judge_embedding.weight.data.numpy()

In [108]:
trained_emb.shape

(2099, 300)

In [109]:
# trained_emb_path = os.path.join(finished_embedding_folder_path,"trained_emb_May1.pkl")
trained_emb_path = os.path.join(finished_embedding_folder_path,"centered_trained_emb_May13.pkl")
pickle.dump(trained_emb,open(trained_emb_path,"wb"))