# NLP Final Project

In [1]:
import json
import jieba
import numpy as np

In [2]:
from keras_preprocessing.text import Tokenizer, text_to_word_sequence
from keras_preprocessing.sequence import pad_sequences


In [3]:
# Set jieba's log level to "WARN" (presumably to hide its start-up info messages).
jieba.setLogLevel("WARN")

In [4]:
# Mount Google Drive at /content/drive; every data path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

This section covers the data preparation for the project. The `data_transform` class holds all of the text-processing functions. The members of the class are defined below:


1.   datapath :path to the files
2.   data : the data read from the files
3.   text cut: text after cutting
4.   tokenizer: tokenizer object for text2sequence
5.   label set : dictionary of labels
6.   extraction : data extracted from the original files
7.   fact pad sequence: fact sequence pad to the same length

A list of the functions is presented as the following:

1.   read_data: read the data from the raw files
2.   extract_data: extract the requested information from the data and store it in `extraction`
3. cut_texts: cut the facts into words using jieba
4. text2seq: convert the cut texts to padded sequences
5. create_label_set: create the label set from the label files
6. create_label: create the one-hot vector for the labels of one sample
7. create_labels: create the one-hot label vectors for all samples



In [None]:
class data_transform():
  '''
  Text and label preparation steps for the case data.

  Members:
  datapath, path passed to the last read_data call
  data, list of records parsed by read_data
  text_cut, result of the last cut_texts call
  tokenizer, tokenizer used by the last text2seq call
             (tokenizer_fact is kept as an alias for older code)
  label_set, dictionary: information name -> array of label strings
  extraction, dictionary: information name -> extracted values
  fact_pad_seq, padded sequences produced by text2seq
  '''

  def __init__(self):
    self.datapath = None
    self.data = None
    self.text_cut = None
    self.tokenizer = None
    self.tokenizer_fact = None
    self.label_set = {}
    self.extraction = {}
    self.fact_pad_seq = None

  def read_data(self, path = None):
    '''
    Function: read_data
    Reads a file holding one JSON object per line into self.data.
    Parameters: path, path of the file to read
    Return: None
    '''
    self.datapath = path
    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r', encoding = 'utf-8') as f:
      # Blank lines carry no record and would make json.loads raise.
      self.data = [json.loads(line) for line in f if line.strip()]

  def extract_data(self, name = "fact"):
    '''
    Function: extract_data
    Stores the extracted values in self.extraction[name].
    Parameters: name, the target information name; one of "fact",
    "accusation", "relevant_articles" or "imprisonment"
    (the old spelling "relevent_articles" is still accepted)
    Return: None
    Raises: ValueError when name is not one of the names above
    '''
    data = self.data
    if name == "fact":
      extraction = [record["fact"] for record in data]
    elif name in ("accusation", "relevant_articles", "relevent_articles"):
      # The name is used unchanged as the key inside "meta".
      extraction = [record["meta"][name] for record in data]
    elif name == "imprisonment":
      extraction = []
      for record in data:
        term = record["meta"]["term_of_imprisonment"]
        # 500 and 400 are the codes create_label_set reserves for
        # death penalty and life imprisonment.
        if term["death_penalty"]:
          extraction.append([500])
        elif term["life_imprisonment"]:
          extraction.append([400])
        else:
          # Stored as found in the record; create_label accepts a bare value.
          extraction.append(term["imprisonment"])
    else:
      raise ValueError("unknown information name: %r" % (name,))
    self.extraction.update({name : extraction})

  def cut_texts(self, texts = None, need_cut = True, word_len = 1):
    '''
    Function: cut_texts
    Parameters: texts, texts for cutting
    need_cut, cut the texts with jieba when True; when False the texts
    are taken as already cut
    word_len, when above 1, words shorter than this are dropped
    Return: text cuts (also stored in self.text_cut)
    '''
    if need_cut:
      if word_len > 1:
        texts_cut = [[word for word in jieba.lcut(one_text) if len(word) >= word_len] for one_text in texts]
      else:
        texts_cut = [jieba.lcut(one_text) for one_text in texts]
    else:
      if word_len > 1:
        texts_cut = [[word for word in one_text if len(word) >= word_len] for one_text in texts]
      else:
        texts_cut = texts
    self.text_cut = texts_cut
    return texts_cut

  def text2seq(self, texts_cut = None, tokenizer_fact = None, num_words = 2000, maxlen = 30, batch_size = 10000):
    '''
    Function: text2seq
    Stores the padded sequences in self.fact_pad_seq.
    Parameters: texts_cut, the cut texts
    tokenizer_fact, tokenizer object; a new one is fitted on texts_cut
    when None
    num_words, number of words in a newly created tokenizer
    maxlen, length every sequence is padded (with 0, at the end) to
    batch_size, number of texts handled per fitting / padding step
    Return: None
    '''
    texts_cut_len = len(texts_cut)
    if tokenizer_fact is None:
      tokenizer_fact = Tokenizer(num_words = num_words)
      if texts_cut_len > batch_size:
        print("Too much text")
      n = 0
      # Fit in slices so progress can be reported on large inputs.
      while n < texts_cut_len:
        tokenizer_fact.fit_on_texts(texts = texts_cut[n:n + batch_size])
        n += batch_size
        if n < texts_cut_len:
          print("tokenizer finish fit %d samples" % n)
        else:
          print("tokenizer finish fit %d samples" % texts_cut_len)
    # self.tokenizer is the member declared in __init__; tokenizer_fact is
    # the attribute name earlier versions of this method wrote to.
    self.tokenizer = tokenizer_fact
    self.tokenizer_fact = tokenizer_fact

    fact_seq = tokenizer_fact.texts_to_sequences(texts = texts_cut)
    print("finish texts to sequence")

    fact_pad_seq = []
    n = 0
    while n < texts_cut_len:
      fact_pad_seq.extend(pad_sequences(fact_seq[n: n + batch_size], maxlen = maxlen, padding = "post", value = 0, dtype = "int"))
      n += batch_size
      if n < texts_cut_len:
        print("finish pad sequences %d samples" % n)
      else:
        print("finsh pad sequences %d samples" % texts_cut_len)
    self.fact_pad_seq = fact_pad_seq

  def create_label_set(self, name, label_dir = "/content/drive/My Drive/Colab Notebooks"):
    '''
    Function: create_label_set
    Stores the label set in self.label_set[name] as an array of strings.
    Parameters: name, information name; "accusation" reads accu.txt,
    "relevant_articles" (or the old spelling "relevent_articles") reads
    law.txt, any other name gives the imprisonment label set
    label_dir, folder holding accu.txt and law.txt
    Return: None
    '''
    label_files = {"accusation": "accu",
                   "relevant_articles": "law",
                   "relevent_articles": "law"}
    if name in label_files:
      with open("%s/%s.txt" % (label_dir, label_files[name]), encoding = "utf-8") as f:
        # Only the line ending is removed, so a last line without a
        # newline keeps its final character. One label per line.
        label_set = [line.rstrip("\r\n") for line in f]
    else:
      # Codes 400 and 500 first, then the values 1 .. 25*12.
      # Kept as strings because create_label compares string forms.
      label_set = [str(i) for i in [400, 500] + list(range(1, 25 * 12 + 1))]
    self.label_set.update({name : np.array(label_set)})

  def create_label(self, label, label_set):
    '''
    Function: create_label
    Parameters: label, the label values of one sample (a list, or one
    bare value)
    label_set, all possible labels
    Return: vector of len(label_set) with 1 at every label present and
    0 elsewhere
    '''
    # A bare value (e.g. one number) counts as a one-element list.
    if isinstance(label, (str, bytes)) or not hasattr(label, "__iter__"):
      label = [label]
    label_str = [str(i) for i in label]
    label_zero = np.zeros(len(label_set))
    if label_str:
      # Both sides are compared by their string form.
      label_zero[np.isin(np.asarray(label_set).astype(str), label_str)] = 1
    return label_zero

  def create_labels(self, label_set = None, labels = None, name = "accusation"):
    '''
    Function: create_labels
    Parameters: label_set, all possible labels; self.label_set[name]
    when None
    labels, label values per sample; self.extraction[name] when None
    name, name of the information
    Return: list with one create_label vector per sample
    '''
    if label_set is None:
      label_set = self.label_set[name]
    if labels is None:
      labels = self.extraction[name]
    return [self.create_label(label = one_label, label_set = label_set) for one_label in labels]
        
      
        


Below is the process of preparing the fact and label data from the raw files using the functions in data_transform. The fact data is saved as fact_accu.npy and the label data is saved as label_accusation.npy.

In [None]:
# Parse the training file into data_trans.data.
data_trans = data_transform()
data_trans.read_data(path = "/content/drive/My Drive/Colab Notebooks/data_train.json")

In [None]:
# Build the accusation label vectors for every record and save them to Drive.
data_trans.extract_data(name = "accusation")
accusations = data_trans.extraction["accusation"]
# The label set comes from accu.txt; create_labels marks each sample's
# accusations against it.
data_trans.create_label_set(name="accusation")
accu_labels = data_trans.create_labels(name = "accusation")
np.save("/content/drive/My Drive/Colab Notebooks/label_accusation.npy",accu_labels)

In [None]:
# Build the padded fact sequences and save them to Drive.
# The training file is read again into a fresh data_transform object.
data_trans = data_transform()
data_trans.read_data(path = "/content/drive/My Drive/Colab Notebooks/data_train.json")
data_trans.extract_data(name = "fact")
facts = data_trans.extraction["fact"]
# Cut every fact with jieba, keeping words of any length (word_len = 1).
texts_cut = data_trans.cut_texts(texts = facts, word_len =1, need_cut = True)
# Fit a new tokenizer (num_words = 40000) and pad every sequence to 400 entries.
data_trans.text2seq(texts_cut= texts_cut, tokenizer_fact = None, num_words = 40000, maxlen = 400)
fact_seq = data_trans.fact_pad_seq
np.save("/content/drive/My Drive/Colab Notebooks/fact_accu.npy",fact_seq)


Too much text
tokenizer finish fit 10000 samples
tokenizer finish fit 20000 samples
tokenizer finish fit 30000 samples
tokenizer finish fit 40000 samples
tokenizer finish fit 50000 samples
tokenizer finish fit 60000 samples
tokenizer finish fit 70000 samples
tokenizer finish fit 80000 samples
tokenizer finish fit 90000 samples
tokenizer finish fit 100000 samples
tokenizer finish fit 110000 samples
tokenizer finish fit 120000 samples
tokenizer finish fit 130000 samples
tokenizer finish fit 140000 samples
tokenizer finish fit 150000 samples
tokenizer finish fit 154592 samples
finish texts to sequence
finish pad sequences 10000 samples
finish pad sequences 20000 samples
finish pad sequences 30000 samples
finish pad sequences 40000 samples
finish pad sequences 50000 samples
finish pad sequences 60000 samples
finish pad sequences 70000 samples
finish pad sequences 80000 samples
finish pad sequences 90000 samples
finish pad sequences 100000 samples
finish pad sequences 110000 samples
finish 

# TextCNN Model

Below is the process of building the TextCNN model. TextCNN is a convolutional neural network for processing text. Information about the model is presented in the output.
The TextCNN_model function contains two convolutional layers and a max pooling layer. They are put in a separate function because the whole network has three parts with three different kernel sizes.

In [5]:
import time 
import pandas as pd
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, Dropout,BatchNormalization,Concatenate
from keras.layers import Conv1D,GlobalMaxPooling1D,MaxPooling1D, Bidirectional,GRU,Flatten,Activation
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [6]:
# Same values as the num_words / maxlen passed to text2seq when the fact
# sequences were built.
num_words = 40000
maxlen = 400
# Number of filters handed to every TextCNN_model branch.
filters = 256

In [7]:
facts = np.load("/content/drive/My Drive/Colab Notebooks/fact_accu.npy")
labels = np.load("/content/drive/My Drive/Colab Notebooks/label_accusation.npy")
# Split facts and labels in one call: both get the same shuffled indices, so
# rows stay aligned, and a length mismatch between the two files raises here
# instead of silently pairing facts with the wrong labels.
fact_train, fact_test, labels_train, labels_test = train_test_split(
    facts, labels, test_size = 0.5, random_state = 1)

# The full arrays are no longer needed once the halves exist.
del facts
del labels


In [None]:
def TextCNN_model(data_input = None, kernel_size =1, filters = 256):
  '''
  Function: TextCNN_model
  Builds one convolutional branch on top of data_input.
  Parameters: data_input, output of the embedding layer
  kernel_size, kernel size used by both convolutions
  filters, number of filters used by both convolutions
  Return: output tensor of the branch after global max pooling
  '''
  branch = data_input

  # Two identical stages: convolution -> batch normalization -> relu
  for _ in range(2):
    branch = Conv1D(filters, kernel_size = [kernel_size], strides = 1, padding = "same")(branch)
    branch = BatchNormalization()(branch)
    branch = Activation(activation = "relu")(branch)

  return GlobalMaxPooling1D()(branch)


In [None]:
#Building the entire neural network

# Keras expects `shape` as a tuple without the batch dimension.
input_layer = Input(shape = (fact_train.shape[1],))
embedder = Embedding(input_dim = num_words +1,
                     input_length = maxlen,
                     output_dim = 512,
                     mask_zero  = False,
                     name = "Embedding")(input_layer)
# Three branches over the same embedding, one per kernel size.
cnn1 = TextCNN_model(embedder,1,filters)
cnn2 = TextCNN_model(embedder,2,filters)
cnn3 = TextCNN_model(embedder,3,filters)

cnn = Concatenate(axis=1)([cnn1,cnn2,cnn3])
cnn = BatchNormalization()(cnn)
cnn = Dense(500,activation="relu")(cnn)
# One sigmoid unit per label column, read from the label matrix so the output
# width always matches the training targets.
num_labels = labels_train.shape[1]
cnn = Dense(num_labels,activation = "sigmoid")(cnn)

opt = Adam(0.01)
model = Model(inputs= input_layer, outputs = cnn)
# Sigmoid outputs with binary cross-entropy score every label independently.
model.compile(loss ="binary_crossentropy", optimizer= opt, metrics = ["accuracy"])
model.summary()

print("Training Starts")
model.fit(x = fact_train, y = labels_train, batch_size = 512, epochs =5, verbose =1)


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 400)]        0                                            
__________________________________________________________________________________________________
Embedding (Embedding)           (None, 400, 512)     20480512    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 400, 256)     131328      Embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 400, 256)     262400      Embedding[0][0]                  
______________________________________________________________________________________________

<keras.callbacks.History at 0x7ff352c27a10>

In [16]:
# Save the trained model to Drive; the evaluation section loads it from this path.
# NOTE(review): the recorded output of this cell is a NameError — presumably it
# was run in a session where `model` had not been built; re-run after training.
model.save("/content/drive/My Drive/Colab Notebooks/model.h5")

NameError: ignored

# Evaluations

This section contains the evaluation methods of the model predicting the test set.

In [None]:
# f1_score is part of the public sklearn.metrics API; sklearn.metrics.scorer
# was a private module that current scikit-learn releases no longer ship.
from sklearn.metrics import f1_score

In [None]:
def f1_avg(y_pred, y_true):
  '''
  Function: f1_avg, mean of the micro-averaged and macro-averaged f1 scores
  Parameters: y_pred, predicted labels
  y_true, the true labels
  Return: (micro f1 + macro f1) / 2
  '''
  scores = [
    f1_score(y_pred = y_pred, y_true = y_true, pos_label = 1, average = mode, zero_division = 0)
    for mode in ("micro", "macro")
  ]
  return sum(scores) / 2



In [None]:
def predict2(predictions):
  '''
  Function: predict2, threshold the predictions at 0.5
  Parameter: predictions, predicted scores (array or nested list)
  Return: array of the same shape holding 1.0 where the score is above
  0.5 and 0.0 elsewhere (a score of exactly 0.5 gives 0.0)
  '''
  # One vectorized comparison replaces the per-row loop and keeps the
  # input shape, including for an empty batch.
  return np.where(np.asarray(predictions) > 0.5, 1.0, 0.0)


Below are the evaluation results of the model.

In [None]:
# Reload the model saved after training.
best_model = load_model("/content/drive/My Drive/Colab Notebooks/model.h5")

In [None]:
# Predict on the held-out facts, then threshold the predictions at 0.5
# into 0/1 vectors with predict2.
y_pred = best_model.predict(fact_test[:])
print(y_pred)
y1 = predict2(y_pred)
print(y1)

In [None]:
# True label vectors of the test set, for comparison with y1.
print(labels_test)

In [None]:
# Share of test samples whose predicted vector equals the true vector in every position.
s1 = [(labels_test[row] == predicted).min() for row, predicted in enumerate(y1)]
print(sum(s1)/ len(s1))

In [None]:
# Mean of micro and macro f1 between the thresholded predictions and the true labels.
s2 =f1_avg(y_pred = y1, y_true = labels_test)
print(s2)


In [None]:
# NOTE(review): distance_score is not defined in the visible part of this
# notebook — confirm where it comes from before running this cell.
s3 = distance_score(y_true= labels_test, y_pred= y1)
print(s3)