# Predicting with model

## Text transformation

In [10]:
# from tqdm import tqdm
# import os
# import json
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
# from official.nlp import optimization
import numpy as np
import ast

 # Import necessary libraries
# import matplotlib.pyplot as plt
# import seaborn as sns
import warnings
import nltk, time
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Import Dictionary
from gensim.corpora.dictionary import Dictionary
from keras.utils.np_utils import to_categorical
import collections, itertools

In [74]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.preprocessing import (LabelBinarizer, OrdinalEncoder,LabelEncoder,MinMaxScaler)

In [28]:
data = pd.read_csv('../../../../datasets/sherloc/sherloc_court_cases_7.csv')
data.head()

Unnamed: 0,text,crime_types,sentence
0,Mr. Solomon Sauls ran an illegal enterprise wi...,money laundry,1
1,SummaryHarmony Gold Mine (Pty) Limited is a mi...,money laundry,1
2,SummaryThe three defendants were found guilty ...,money laundry,1
3,Johannes Erasmus van Staden was a Cape Town bu...,money laundry,1
4,Juan Hattingh was a young practicing attorney ...,money laundry,1


In [16]:
# Function to tokenize the tweets
def custom_tokenize(text):
    """Function that tokenizes text"""
    from nltk.tokenize import word_tokenize
    if not text:
        print('The text to be tokenized is a None type. Defaulting to blank string.')
        text = ''
    return word_tokenize(text)

In [17]:
def clean_up(data):
    """Function that cleans up the data into a shape that can be further used for modeling"""
    # english = data[data['lang']=='en'] # extract only tweets in english language
    data.drop_duplicates() # drop duplicate tweets
    data['text'].dropna(inplace=True) # drop any rows with missing tweets
    tokenized = data['text'].apply(custom_tokenize) # Tokenize tweets
    lower_tokens = tokenized.apply(lambda x: [t.lower() for t in x]) # Convert tokens into lower case
    alpha_only = lower_tokens.apply(lambda x: [t for t in x if t.isalpha()]) # Remove punctuations
    no_stops = alpha_only.apply(lambda x: [t for t in x if t not in stopwords.words('english')]) # remove stop words
    # no_stops.apply(lambda x: [x.remove(t) for t in x if t=='rt']) # remove acronym "rt"
    return no_stops

In [18]:
cleaned_text_chunk = clean_up(data)

In [19]:
print(cleaned_text_chunk.shape)
print(cleaned_text_chunk.str.len().max())
print(cleaned_text_chunk.str.len())

(1250,)

In [22]:
# Create a Dictionary from the tweets
dictionary = Dictionary(cleaned_text_chunk)

In [23]:
# Create corpus for bag of words (token IDs of each word with their frequencies)
corpus = cleaned_text_chunk.apply(lambda x: dictionary.doc2bow(x))

In [24]:
corpus

0       [(0, 8), (1, 4), (2, 1), (3, 1), (4, 1), (5, 1...
1       [(3, 2), (6, 2), (12, 1), (15, 1), (32, 3), (3...
2       [(0, 6), (1, 1), (2, 1), (3, 1), (7, 4), (23, ...
3       [(1, 1), (12, 2), (15, 2), (23, 1), (29, 2), (...
4       [(15, 2), (23, 1), (31, 1), (44, 2), (51, 3), ...
                              ...                        
1245    [(1, 3), (26, 2), (43, 1), (58, 1), (70, 1), (...
1246    [(235, 1), (346, 1), (791, 1), (1521, 1), (176...
1247    [(1, 1), (26, 1), (48, 1), (58, 1), (70, 1), (...
1248    [(58, 1), (124, 1), (143, 1), (195, 1), (252, ...
1249    [(1, 1), (26, 3), (27, 1), (37, 3), (43, 2), (...
Name: text, Length: 1250, dtype: object

In [30]:
data.rename(columns={'text':'raw_text'}, inplace=True)
data.head(2)

Unnamed: 0,raw_text,crime_types,sentence
0,Mr. Solomon Sauls ran an illegal enterprise wi...,money laundry,1
1,SummaryHarmony Gold Mine (Pty) Limited is a mi...,money laundry,1


In [31]:
df = pd.concat([data,cleaned_text_chunk],axis=1)
df.head(2)

Unnamed: 0,raw_text,crime_types,sentence,text
0,Mr. Solomon Sauls ran an illegal enterprise wi...,money laundry,1,"[solomon, sauls, ran, illegal, enterprise, pur..."
1,SummaryHarmony Gold Mine (Pty) Limited is a mi...,money laundry,1,"[summaryharmony, gold, mine, pty, limited, min..."


In [32]:
df.rename(columns={'text':'tokenized_cleaned_text'}, inplace=True)
df.head(2)


Unnamed: 0,raw_text,crime_types,sentence,tokenized_cleaned_text
0,Mr. Solomon Sauls ran an illegal enterprise wi...,money laundry,1,"[solomon, sauls, ran, illegal, enterprise, pur..."
1,SummaryHarmony Gold Mine (Pty) Limited is a mi...,money laundry,1,"[summaryharmony, gold, mine, pty, limited, min..."


In [26]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [33]:
# Lemmatize tokens
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['tokenized_cleaned_text'].apply(lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x])

29.560123682022095

In [34]:
df.head(2)

Unnamed: 0,raw_text,crime_types,sentence,tokenized_cleaned_text,lemmatized
0,Mr. Solomon Sauls ran an illegal enterprise wi...,money laundry,1,"[solomon, sauls, ran, illegal, enterprise, pur...","[solomon, saul, ran, illegal, enterprise, purp..."
1,SummaryHarmony Gold Mine (Pty) Limited is a mi...,money laundry,1,"[summaryharmony, gold, mine, pty, limited, min...","[summaryharmony, gold, mine, pty, limited, min..."


In [45]:
# lemmatized text back to tokenized
# m= ast.literal_eval(f) # convert string to list just like json.loads()
# df['tokens_back_to_text'] = [' '.join(map(str, ast.literal_eval(l))) for l in df['lemmatized']]
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmatized']]
df.head(2)

Unnamed: 0,raw_text,crime_types,sentence,tokenized_cleaned_text,lemmatized,tokens_back_to_text
0,Mr. Solomon Sauls ran an illegal enterprise wi...,money laundry,1,"[solomon, sauls, ran, illegal, enterprise, pur...","[solomon, saul, ran, illegal, enterprise, purp...",solomon saul ran illegal enterprise purpose po...
1,SummaryHarmony Gold Mine (Pty) Limited is a mi...,money laundry,1,"[summaryharmony, gold, mine, pty, limited, min...","[summaryharmony, gold, mine, pty, limited, min...",summaryharmony gold mine pty limited mining co...


## model data

In [76]:

# class for Y convertions
class TextLabelEncoderDummy:

    def labelencoder(y_df):
        encoder = LabelEncoder()
        encoder.fit(y_df)
        encoded_Y = encoder.transform(y_df)
        return encoded_Y, encoder


    def encoded_to_dummy(encoded_Y):
        # convert encoder variable to dummy variable
        uniques, ids = np.unique(encoded_Y, return_inverse=True)
        dummy_y = to_categorical(ids, len(uniques))
        # dummy_y = np_utils.to_categorical(encoded_Y)
        return dummy_y, uniques


    def reverse_dummy_to_encoded(y_test):
        reverse_dummy = uniques[y_test.argmax(1)]
        return reverse_dummy


    def reverse_encoded_to_text(reverse_dummy):
        reverse_encoded = encoder.inverse_transform(reverse_dummy)
        return reverse_encoded

### categorized y

In [88]:
# y = df.sentence.values
y = [1,0]

In [89]:
# One-hot encoding of labels
encoded_Y, encoder = TextLabelEncoderDummy.labelencoder(y)
dummy_y, uniques = TextLabelEncoderDummy.encoded_to_dummy(encoded_Y)

In [90]:
encoded_Y, uniques, dummy_y

(array([1, 0]),
 array([0, 1]),
 array([[0., 1.],
        [1., 0.]], dtype=float32))

In [92]:
# X = df.tokens_back_to_text.values[0]
X = ['''solomon saul ran illegal enterprise purpose poach sell abalone haliotis midae south africa group\ 
abalone diver work supply illegally harvest abalone accuse bribed official department agriculture forestry \
fishery prevent confiscate abalone buy back abalone already seize authority corrupt official face charge separate \
march police search accuse house encounter cash accuse confess proceeds illegal activity february defendant plead guilty \
count involve run illegal enterprise corruption money laundering possess transport illegally harvest aggravate sentence \
seriousness corrupt government official engagement illegal abalone trade commercial scale financial accuse involve similar \
crime past previously receive prison sentence involvement another illegal enterprise focus abalone found responsible run illegal \
abalone business different area hand current trial solomon saul sentence year imprisonment run concurrently result effective sentence \
year determine run concurrently previous sentence year''']

In [59]:
# prepare tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [93]:
X

['solomon saul ran illegal enterprise purpose poach sell abalone haliotis midae south africa group\\ \nabalone diver work supply illegally harvest abalone accuse bribed official department agriculture forestry fishery prevent confiscate abalone buy back abalone already seize authority corrupt official face charge separate march police search accuse house encounter cash accuse confess proceeds illegal activity february defendant plead guilty count involve run illegal enterprise corruption money laundering possess transport illegally harvest aggravate sentence seriousness corrupt government official engagement illegal abalone trade commercial scale financial accuse involve similar crime past previously receive prison sentence involvement another illegal enterprise focus abalone found responsible run illegal abalone business different area hand current trial solomon saul sentence year imprisonment run concurrently result effective sentence year determine run concurrently previous sentence

In [61]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 23 unique tokens.


In [62]:
# integer encode the documents
sequences = tokenizer.texts_to_sequences(X)

In [65]:
# pad documents to a max length of 14 words
maxlen = 25
X = pad_sequences(sequences, maxlen=maxlen)

In [66]:
X.shape

(999, 25)

In [67]:
X

array([[0, 0, 0, ..., 0, 0, 6],
       [0, 0, 0, ..., 0, 0, 8],
       [0, 0, 0, ..., 0, 0, 5],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 3]], dtype=int32)

In [68]:
from keras.models import load_model
lstm = load_model('lstm.h5')
warnings.filterwarnings("ignore")

2023-05-10 10:11:55.083303: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-10 10:11:55.084939: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-10 10:11:55.086165: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [48]:
lstm.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 25, 32)            360352    
                                                                 
 lstm_6 (LSTM)               (None, 25, 20)            4240      
                                                                 
 dropout_6 (Dropout)         (None, 25, 20)            0         
                                                                 
 lstm_7 (LSTM)               (None, 25, 10)            1240      
                                                                 
 dropout_7 (Dropout)         (None, 25, 10)            0         
                                                                 
 lstm_8 (LSTM)               (None, 5)                 320       
                                                                 
 dense_3 (Dense)             (None, 2)                

In [38]:
# list(itertools.chain.from_iterable(X_test))
Counter(list(itertools.chain.from_iterable(y_test)))

NameError: name 'y_test' is not defined

In [82]:
predictions = lstm.predict(X)

2023-05-10 11:08:19.793180: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-10 11:08:19.794912: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-10 11:08:19.796134: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [83]:
reverse_dummy_predicted =TextLabelEncoderDummy.reverse_dummy_to_encoded(predictions)
print(pd.unique(reverse_dummy_predicted).shape)

# There will be no need for this
# reverse_encoded_y_predicted = TextLabelEncoderDummy.reverse_encoded_to_text(reverse_dummy_predicted)
# reverse_encoded_y_predicted

(1,)


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

### ----End-----

In [84]:
# len(reverse_encoded_y_predicted)
Counter(list(itertools.chain(reverse_encoded_y_predicted)))

Counter({1: 999})

In [81]:
len(predictions)

999