**Author: Binqian Zeng**

In [1]:
import numpy as np
import pandas as pd
import re
import gensim
from collections import namedtuple
import time

# Train model to extract "ModelOfConviction" feature for each text. 


### Preparing Dataset

In [2]:
def _clean_text(data):
    """
    Clean text data. Remove '\n' and '\t'
    @data: text data read from .txt files
    """
    data = ' '.join(data.copy())
    data = data.replace('\n', '')
    data = data.replace('\t', '')
    return data 

def _read_text(txtpath, txtfilename):
    """
    read .txt files by filenames with valid labeled ModeOfConviction 
    @txtpath: directory path of .txt files
    @txtfilename: name of the .txt file
    """
    filepathname = txtpath + txtfilename
    # Using the newer with construct to close the file automatically.
    with open(filepathname) as f:
        data = f.readlines()
        f.close()
    return data  #return text from .txt file

def _text_preprocessing(df):
    """
    Data preprocessing:
    **1. Remove special characters like "[]" to avoid unnecessary nosies** <br/>
    **2. Remove words of labels "Nonjury trial', 'plea of guilty' and 'jury verdict'**<br/>
    **Note:** The reason of removing words of labels is to avoid being affected directly by words of labels. 
    Most of of unlabled data do not directly contain words of labels like 'nonjury trial' 'plea of guilty' or 
    'jury verdict'. 
    
    """
    df = df[df['Text'].notnull()].copy()
    df['Text'] = df['Text'].map(lambda x: x.replace('[','').replace(']','').replace("(",'').replace(')',''))
    df['Text'] = df['Text'].map(lambda x: re.sub(" +", ' ', x))
    #remove label words
    df['Text'] = df['Text'].map(lambda x: x.replace('nonjury trial','').replace('plea of guilty','').replace('jury verdict',''))
    return df

In [3]:
# Read CSV
csv_2017 = pd.read_csv("/Users/EricTseng/Desktop/PoliticalLab0211/ParseResult/2017.csv", sep=',', encoding='latin-1')
csv_2017.head()

#Extract filenames with valid ModeOfConviction
oridata = csv_2017[csv_2017['ModeOfConviction'].notnull()].copy()

#Extract the pair of filename and ModeOfConviction
"""
@FCT_data: the data frame contains filenames, corresponding ModeOfConviction and corresponding text
"""
FCT_data = pd.DataFrame(columns=['File', 'Text', 'ModeOfConviction'])
FCT_data.File = oridata['File']
FCT_data.ModeOfConviction = oridata['ModeOfConviction']

#initialize the directory path of .txt files
txtpath = "/Users/EricTseng/Desktop/PoliticalLab0211/NY-Appellate-Scraping/2017-09-10/courtdoc/txt/"
txtfilename_ls = list(FCT_data['File'])

#Open text files by list of names and put text into FCT_data dataframe for preparing dataset
Text_ls = []
for filename in txtfilename_ls:
    oritxt = _read_text(txtpath, filename)
    texttxt = _clean_text(oritxt)
    Text_ls.append(texttxt)

FCT_data['Text'] = Text_ls # Store list of txt text into FCT_data

FCT_data.reset_index(drop=True, inplace=True) #reset index

display(FCT_data.head())

print("Unique labels of Mode Of Conviction {}".format(FCT_data['ModeOfConviction'].unique()))


Unnamed: 0,File,Text,ModeOfConviction
0,2017_00030.htm.txt,People v Suazo 2017 NY Slip Op 00030 [146 AD3...,nonjury trial
1,2017_00033.htm.txt,People v Crawford 2017 NY Slip Op 00033 [146 ...,plea of guilty
2,2017_00046.htm.txt,People v Ryder 2017 NY Slip Op 00046 [146 AD3...,nonjury trial
3,2017_00077.htm.txt,People v Saunders 2017 NY Slip Op 00077 [146 ...,plea of guilty
4,2017_00134.htm.txt,People v Miller 2017 NY Slip Op 00134 [146 AD...,plea of guilty


Unique labels of Mode Of Conviction ['nonjury trial' 'plea of guilty' 'jury verdict']


### Data Preprocessing

**1. Remove special characters like "[]" to avoid unnecessary nosies** <br/>
**2. Remove words of labels "Nonjury trial', 'plea of guilty' and 'jury verdict'**<br/>
**Note:** The reason of removing words of labels is to avoid being affected directly by words of labels. Most of of unlabled data do not directly contain words of labels like 'nonjury trial' 'plea of guilty' or 'jury verdict'.<br\>
**3. Digitizing labels, 'nonjury trial'->1; 'plea of guilty'->2; 'jury verdict'->3**

In [4]:
FCT_data = _text_preprocessing(FCT_data) #preprocessing text
FCT_data.loc[FCT_data['ModeOfConviction'] == 'nonjury trial', 'Label'] = '1'
FCT_data.loc[FCT_data['ModeOfConviction'] == 'plea of guilty', 'Label'] = '2'
FCT_data.loc[FCT_data['ModeOfConviction'] == 'jury verdict', 'Label'] = '3'

display(FCT_data.head())

Unnamed: 0,File,Text,ModeOfConviction,Label
0,2017_00030.htm.txt,People v Suazo 2017 NY Slip Op 00030 146 AD3d...,nonjury trial,1
1,2017_00033.htm.txt,People v Crawford 2017 NY Slip Op 00033 146 A...,plea of guilty,2
2,2017_00046.htm.txt,People v Ryder 2017 NY Slip Op 00046 146 AD3d...,nonjury trial,1
3,2017_00077.htm.txt,People v Saunders 2017 NY Slip Op 00077 146 A...,plea of guilty,2
4,2017_00134.htm.txt,People v Miller 2017 NY Slip Op 00134 146 AD3...,plea of guilty,2


### Doc2Vec

Using the Doc2Vec library in Gensim to represent text for future use

In [8]:
#Time it
startTime = time.time()

rs_ls = []
#Load Data 
for (text, label, i) in zip(FCT_data['Text'], FCT_data['ModeOfConviction'], range(0,FCT_data.shape[0])):
    Sent_ls = text.lower().split(".")
    docs = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    
    #Transform data (you can add more data preprocessing steps) 
    for Sent in Sent_ls:
        words = Sent.split()
        label = label
        docs.append(analyzedDocument(words, label))
        
    # Train model (set min_count = 1, if you want the model to work with the provided example data set)
    doc2vec_md = gensim.models.doc2vec.Doc2Vec(docs, size = 100, window = 300, min_count = 1, workers = 4)
    
    #rs_ls.append(doc2vec_md.docvecs[0])
    rs_ls.append(doc2vec_md.docvecs[0])
FCT_data['DocVec'] = rs_ls

print("Time for Doc2Vec cost: ",time.time() - startTime)


Time for Doc2Vec cost:  23.30015802383423


In [9]:
display(FCT_data.head())

Unnamed: 0,File,Text,ModeOfConviction,Label,DocVec
0,2017_00030.htm.txt,People v Suazo 2017 NY Slip Op 00030 146 AD3d...,nonjury trial,1,"[-0.100383, 0.0640729, -0.0230143, 0.0469773, ..."
1,2017_00033.htm.txt,People v Crawford 2017 NY Slip Op 00033 146 A...,plea of guilty,2,"[-0.00689806, -0.00291594, -0.00776605, 0.0049..."
2,2017_00046.htm.txt,People v Ryder 2017 NY Slip Op 00046 146 AD3d...,nonjury trial,1,"[-2.93095, 0.914522, -1.05463, 1.06198, -1.152..."
3,2017_00077.htm.txt,People v Saunders 2017 NY Slip Op 00077 146 A...,plea of guilty,2,"[-0.00788608, -0.00238305, -0.00857779, 0.0060..."
4,2017_00134.htm.txt,People v Miller 2017 NY Slip Op 00134 146 AD3...,plea of guilty,2,"[-0.0679377, 0.0224821, -0.0638266, 0.046354, ..."


### Model Approach

In [10]:
from sklearn.cross_validation import train_test_split



In [11]:
X = np.array(FCT_data['DocVec'])
y = np.array(FCT_data['Label'])
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train =  np.array([list(e) for e in X_train])
X_test = np.array([list(e) for e in X_test])
#y_train = np.array(list(e) for e in y_train)
#y_test = np.array(list(e) for e in y_test)

#### K-neighbours

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [26]:
# instantiate learning model (k = 3)
knn = KNeighborsClassifier(n_neighbors=5)
# fitting the model
knn.fit(X_train, y_train)
# predict the response
pred = knn.predict(X_test)

# evaluate accuracy
print ("The accuracy for the K-neighbours model is : {}".format(accuracy_score(y_test, pred)))

The accuracy for the K-neighbours model is : 1.0


**Definitly some bug here or this approach is not appropriate. No model's accuracy can reach 100%. However, not enough time to find where mistakes are.**

The K-nearest neighbours approach is based on the thoughts that similar documents should be close to each other. 

### Future Work 

**Thoughts from different perspectives are described below:**

**Representation:**<br/>
1. PMI(Pointwise mutual information) can be used as good representation of words.
2. GloVe can be used as good representation of words. 

**Model:**<br/>
1. Neural Network with attention mechanism and RNN are helpful for understanding the text
2. Classification with appropriate kernel function will be helpful to capture high-dimentioanal features.

