# Notebook 1: Preprocessing of text

This notebook presents the preprocessing of the document texts, as explained in Section 3.2 of the thesis.

Table of Contents:
* [1.1 Load documents](#load)
* [1.2 Preprocessing of document contents](#preprocessing)
* [1.3 Export preprocessed text](#export)

In [None]:
# loading modules

from IPython.core.display import display, HTML
import pandas as pd
# Raise the pandas display limits so large frames and long text cells are not
# truncated in the notebook output.
pd.set_option('display.max_rows', 600)
# NOTE: 'display.height' is intentionally not set. It is not a registered option
# in current pandas releases (set_option raises OptionError for it); the number
# of rows shown is governed by 'display.max_rows' above.
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_columns', 200)
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from tqdm import tqdm_notebook
import seaborn as sns
from collections import Counter
from fuzzywuzzy import fuzz, process
from operator import itemgetter
import string
import nltk
from nltk.corpus import stopwords, words, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus.reader.wordnet import WordNetError
# Fetch the NLTK corpora used in this notebook (no-op when already present).
nltk.download('stopwords')  # stopword lists used by text_process
nltk.download('wordnet')    # WordNet data used by the lemmatizer
nltk.download('words')      # English word list read via words.words()
# NOTE(review): word_tokenize and pos_tag also need tokenizer/tagger models;
# their resource ids differ between NLTK versions, so download whichever
# resource NLTK reports as missing on first use.
from sklearn import preprocessing
import pickle
import spacy
from copy import deepcopy

In [None]:
#define functions for saving and loading objects
def save_pickle(objectname, picklename):
    """Serialize an object to disk with pickle and print a confirmation.

    Example: save_pickle(contracts_labeled, 'Pickles/contracts_labeled.pickle')

    Args:
        objectname: The Python object to serialize.
        picklename: Path of the target file (conventionally ending in .pickle).

    Raises:
        OSError: If the target file cannot be opened for writing.
        pickle.PicklingError: If the object cannot be pickled.
    """
    # The context manager closes the file even when pickle.dump raises,
    # so no file handle is leaked on failure.
    with open(picklename, "wb") as pickle_out:
        pickle.dump(objectname, pickle_out)
    print(picklename, 'successfully pickled.')

def load_pickle(picklename):
    """Load and return the object stored in a pickle file.

    Example: contracts_labeled = load_pickle('Pickles/contracts_labeled.pickle')

    Args:
        picklename: Path of the pickle file to read.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If the file cannot be opened for reading.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source (e.g. save_pickle in this notebook).
    # The context manager guarantees the file handle is closed after reading.
    with open(picklename, "rb") as pickle_in:
        return pickle.load(pickle_in)

# 1.1 Load documents <a id="load"></a>

In [None]:
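# TODO: load the `contracts_labeled` DataFrame here (the cells below read its 'TEXT' and 'CON_TYPE' columns)
# TODO: load the `contracts_unlabeled` DataFrame here (the cells below read its 'TEXT' column)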

# 1.2 Preprocessing of document contents <a id="preprocessing"></a>

In [None]:
# Preprocessing helpers: clean a text and return it as a list of word strings

lem = WordNetLemmatizer()  # load lemmatizer
eng_vocab = set(words.words())  # load English vocabulary
nlp = spacy.load("de_core_news_sm")  # load German language model
# Lower-cased strings of the German model's vocabulary. Kept as a set so the
# per-token membership test in text_process is a hash lookup rather than a
# linear scan over the whole vocabulary.
ger_vocab = {x.lower() for x in nlp.vocab.strings}

def lemmatize(word):
    """Return the lemma of a single word, chosen by its part of speech.

    The word is tokenized and POS-tagged; the lower-cased first character of
    the first token's tag selects how the word is lemmatized.
    """
    tag = pos_tag(word_tokenize(word))[0][1]
    pos_label = tag[0].lower()

    # WordNet labels adjectives 'a', whereas the tagger's adjective tags start with 'j'
    if pos_label == 'j':
        pos_label = 'a'

    # Adverbs: look up the pertainym of the first adverb synset; keep the word
    # unchanged when WordNet has no such synset or pertainym.
    if pos_label == 'r':
        try:
            adverb_synset = wordnet.synset(word + '.r.1')
            return adverb_synset.lemmas()[0].pertainyms()[0].name()
        except (IndexError, WordNetError):
            return word

    # (Satellite) adjectives and verbs: lemmatize with the explicit POS label
    if pos_label in ('a', 's', 'v'):
        return lem.lemmatize(word, pos=pos_label)

    # Nouns and everything else: use the lemmatizer's default POS
    return lem.lemmatize(word)
        
def text_process(text):
    """Clean a raw text and return its lemmatized vocabulary words.

    Steps: remove punctuation and digits, lower-case the words, drop English
    and German stopwords, lemmatize, and keep only lemmas longer than one
    character that occur in ``eng_vocab`` or ``ger_vocab``.

    Args:
        text: The raw document text as a string.

    Returns:
        A list of cleaned, lemmatized word strings (possibly empty).
    """
    # Build the stopword set once per call; calling stopwords.words() inside
    # the per-word filter re-reads both lists for every single word.
    stop_words = set(stopwords.words('english')) | set(stopwords.words('german'))

    # Remove punctuation and digits in a single pass over the characters
    cleaned = ''.join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )

    # Lower-case BEFORE the stopword check: the stopword lists are lower-case,
    # so capitalised stopwords (e.g. "The", "Der") would otherwise slip through.
    lowered = (word.lower() for word in cleaned.split())
    tokens = [word for word in lowered if word not in stop_words]

    lemmas = [lemmatize(word) for word in tokens]  # lemmatize words

    # Keep only real words and drop single-character "words"
    return [
        lemma for lemma in lemmas
        if len(lemma) > 1 and (lemma in eng_vocab or lemma in ger_vocab)
    ]

In [None]:
# Preprocess the 'TEXT' column of both document sets and store the cleaned,
# space-joined words in a new 'TEXT_PROCESSED' column.
for frame, progress_label in (
    (contracts_labeled, "Text Processing Labeled"),
    (contracts_unlabeled, "Text Processing Unlabeled"),
):
    frame['TEXT_PROCESSED'] = ""  # create new column for preprocessed text
    processed_texts = []
    for raw_text in tqdm_notebook(frame['TEXT'], progress_label):
        # str() guards against non-string cell values before preprocessing
        processed_texts.append(' '.join(text_process(str(raw_text))))
    frame['TEXT_PROCESSED'] = processed_texts

# 1.3 Export preprocessed text <a id="export"></a>

In [None]:
# Feature frames: keep only the preprocessed text and renumber rows from 0
data_labeled_X = pd.DataFrame(contracts_labeled[['TEXT_PROCESSED']])
data_labeled_X = data_labeled_X.reset_index(drop=True)

data_unlabeled_X = pd.DataFrame(contracts_unlabeled[['TEXT_PROCESSED']])
data_unlabeled_X = data_unlabeled_X.reset_index(drop=True)

# Targets: map the categorical document classes to numerical labels.
# NOTE: despite its name, one_hot_encoder is a LabelEncoder, fitted on the
# fixed list of the seven known contract types.
one_hot_encoder = preprocessing.LabelEncoder()
one_hot_encoder.fit(['Agreement', 'Amendment', 'Attachment', 'LOI', 'NDA', 'Offer', 'SOW'])
data_labeled_y = one_hot_encoder.transform(contracts_labeled['CON_TYPE'])

In [None]:
# Write the feature frames and the label array to disk as pickles
for dataset, target_path in (
    (data_labeled_X, 'Pickles/1_labeled_X.pickle'),
    (data_unlabeled_X, 'Pickles/1_unlabeled_X.pickle'),
    (data_labeled_y, 'Pickles/1_labeled_y.pickle'),
):
    save_pickle(dataset, target_path)