In [None]:
#!/usr/bin/my-virtualenv-name python3
import os
import pandas as pd
import re
import nltk
from pathlib import Path
import json
import subprocess
import sys
import pickle
import csv
from pandas import DataFrame

from sklearn.utils import shuffle

from pre_processing_functions import *

## Paths

In [None]:
#Different paths for data
#Project root: the part of the working directory that precedes 'Master-Thesis',
#with Windows separators normalised to forward slashes
base = os.getcwd().split('Master-Thesis')[0].replace('\\', '/')
#Guarantee a trailing separator. Without it, starting the notebook from a
#directory that has no 'Master-Thesis' component glued the folder name below
#onto the last path component (e.g. '/home/user/projectaktes')
if not base.endswith('/'):
    base += '/'

#Location of the pdf files
path_aktes = base + 'aktes'

#Location of where the annotations are saved to annotations.csv. This is set in pdfannots.py under save_filename
path_all_annotations = 'annotations.csv'

#Name of the excel sheet with annotations from the system
path_annotations_excel = 'notation_list.xlsx'

## Processing highlights

In [None]:
'''Reads all pdf's and extracts highlights and saves them to annotations.csv'''

#Saves all filenames and failed files
all_filenames = []
all_fails = []

for filename in os.listdir(path_aktes):
    if not filename.endswith(".pdf"):
        continue

    #Spaces in the filename are replaced by underscores on disk. The rename is
    #skipped when the name contains no spaces, so such files are left untouched
    clean_name = filename.replace(' ', '_')
    if clean_name != filename:
        os.rename(path_aktes + '/' + filename, path_aktes + '/' + clean_name)
    all_filenames.append(clean_name)

    try:
        #The captured stdout is not used, so it is no longer decoded; a decoding
        #error could previously mark a successful extraction as failed
        subprocess.check_output(["python", "pdfannots.py", path_aktes + '/' + clean_name])
    except (subprocess.CalledProcessError, OSError):
        #Only a failing or unstartable extraction process is recorded here.
        #KeyboardInterrupt and programming errors are no longer swallowed
        all_fails.append(clean_name)

In [None]:
#Load the extracted highlights from the csv file and give the columns fixed names
#NOTE(review): read_csv uses the first csv row as header before it is renamed
#here - confirm that pdfannots.py writes a header row
annotation_columns = ['page', 'text_anno', 'file']
df_anno = pd.read_csv(path_all_annotations, encoding='latin-1')
df_anno.columns = annotation_columns

In [None]:
'''Perform regex on the filenames so that essential information is extracted and the data 
from excel can be linked to the individual pdf files.'''

def _highlight_to_tokens(highlight):
    #Strip markup from one highlight, then pass the plain text through the
    #project helpers process_input -> remove_stopwords -> stemmer
    plain_text = BeautifulSoup(highlight, 'html.parser').get_text()
    return stemmer(remove_stopwords(process_input(plain_text)))

#Every derived column depends on exactly one source column, so the functions are
#applied to that column directly instead of building a row object per record
df_anno['filename'] = df_anno['file'].apply(lambda name: name.replace(' ', '_'))
df_anno['date'] = df_anno['file'].apply(extract_year)
df_anno['text_tokenized_highlight'] = df_anno['text_anno'].apply(_highlight_to_tokens)

In [None]:
#Write the annotation dataframe to disk as a pickle file
pd.to_pickle(df_anno, 'complete_annotations.pickle')

## Process text from files

In [None]:
#Process the documents using the pdfminer package and saves them in dataframe.
#The unique file names are passed in order of first appearance. The earlier
#set() round-trip yielded a different order on every kernel start (string
#hashing is randomised), so the order in which files were handed over could
#change between runs
files_with_annotations = df_anno['file'].drop_duplicates().tolist()
df_pdfminer = process_docs_and_label(df_anno, path_aktes, files_with_annotations)

## Create dataframe with pages 

In [None]:
#Replace the index by a fresh 0..n-1 range; the columns are left untouched
df_pdfminer = df_pdfminer.reset_index(drop=True)

#Duplicate files in system are removed. '.pdf_' is matched literally (as a regex
#the dot matched any character); na=True keeps dropping rows without a filename,
#exactly as the former '== False' comparison did
is_duplicate_file = df_pdfminer['filename'].str.contains('.pdf_', regex=False, na=True).astype(bool)
df_pdfminer = df_pdfminer[~is_duplicate_file]

#Only overlap with annotations
df_overlap = df_pdfminer[df_pdfminer['filename'].isin(df_anno['file'])]

#All relevant pages first
df_overlap = df_overlap.sort_values('relevant', ascending=False)

#Make string of tokenized text to find duplicates
df_overlap['text_string'] = df_overlap['text_tokenized'].apply(' '.join)

#Drop pages with identical text. Relevant pages were sorted to the front, so
#keep='first' prefers the relevant copy of a duplicated page
df_overlap = df_overlap.drop_duplicates(subset=['text_string'], keep='first')

#Lower the filenames for both dataframes
df_overlap['filename'] = df_overlap['filename'].str.lower().str.strip()
df_anno['filename'] = df_anno['filename'].str.lower().str.strip()

#Shuffle the dataframe; the seed is fixed so the same input gives the same order
df_overlap = df_overlap.sample(frac=1, random_state=42).reset_index(drop=True)

## Create merged dataframe of individual pages and highlights

In [None]:
#Cast the page numbers to int in both dataframes so the merge keys share one dtype
df_overlap['page'] = df_overlap['page'].astype(int)
df_anno['page'] = df_anno['page'].astype(int)
#Merge the individual highlights with the pages; how='left' keeps pages without a highlight
df_merge_col = pd.merge(df_overlap, df_anno, on=['page', 'filename'], how='left')
#Drop duplicates
df_merge_col = df_merge_col.drop_duplicates(subset=['filename', 'page', 'text_anno', 'text_normal'])
#Select the relevant columns. The helper 'index' column that used to be added
#before this step was never part of the selection, so it is no longer created.
#.copy() returns an independent frame, so adding columns to it afterwards does
#not raise SettingWithCopyWarning
relevant_columns = ['filename', 'text_tokenized', 'text_normal', 'page', 'relevant', 'text_anno', 'text_string']
df_merge_col = df_merge_col[relevant_columns].copy()

In [None]:
#Create year column from filename
df_merge_col['year'] = df_merge_col['filename'].apply(extract_year).str[:4]
#Number of distinct tokens per row
df_merge_col['unique_words'] = df_merge_col['text_tokenized'].apply(lambda tokens: len(set(tokens)))
#One 0/1 indicator column per document type, set when the type name occurs in
#the lower-cased filename. A single vectorised loop replaces three copy-pasted
#row-wise lambdas
filename_lower = df_merge_col['filename'].str.lower()
for doc_type in ('uitgifte', 'splitsing', 'levering'):
    df_merge_col[doc_type] = filename_lower.str.contains(doc_type, regex=False).astype('int64')

## Filter on document types and check highlights

In [None]:
#Remove certain document types based on the filename: rows whose filename
#contains one of these words are dropped, one word at a time
excluded_doc_types = ['conversie', 'rectificatie', 'besluit', 'wijziging', 'samenvoeging']
for doc_type in excluded_doc_types:
    has_doc_type = df_merge_col['filename'].str.contains(doc_type)
    df_merge_col = df_merge_col[~has_doc_type]

In [None]:
#Check if highlight is unique on page and relates to zoning plan
#remove_inconsistencies is not defined in this notebook; presumably it comes
#from the star import of pre_processing_functions. It is called once per row
#(axis=1) and its return value is stored in the new 'check_relevant' column.
#NOTE(review): the return type is not visible here - confirm it is boolean
df_merge_col['check_relevant'] = df_merge_col.apply(remove_inconsistencies,axis=1)

In [None]:
#Sort all pages based on whether the highlight is related to zoning plan
df_merge_col = df_merge_col.sort_values(by='check_relevant', ascending=True)
#Drop the duplicate pages and when possible keep the page that is labelled as True.
#With an ascending sort True ends up after False, so keep='last' selects it.
#NOTE(review): assumes check_relevant is boolean; missing values would sort
#last and win over True - confirm they cannot occur
df_merge_col = df_merge_col.drop_duplicates(subset=['filename', 'page'], keep='last')
#Drop duplicate pages that have exactly the same content (even though filename is different)
df_merge_col = df_merge_col.drop_duplicates(subset=['text_string'], keep='last')

#Shuffle with a fixed seed so the same input gives the same row order on every run
df_merge_col = df_merge_col.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#Remove empty strings and words with one character or less.
#Both filters run in a single pass over the token list of each row; the
#truthiness test comes first so empty/None entries never reach len()
df_merge_col['text_tokenized'] = df_merge_col['text_tokenized'].apply(
    lambda tokens: [word for word in tokens if word and len(word) > 1]
)

In [None]:
#Create different texts for the different vectorization methods
def _normal_text_to_tokens(text_normal):
    #Strip markup from the untokenized text, then pass the plain text through
    #the project helpers process_input -> remove_names
    plain_text = BeautifulSoup(text_normal, 'html.parser').get_text()
    return remove_names(process_input(plain_text))

#Each new column depends on a single existing column, so the functions are
#applied to that column directly instead of to whole rows

#Token list joined into one space-separated string
df_merge_col['text_tokenized_joined'] = df_merge_col['text_tokenized'].apply(' '.join)

#Token list rebuilt from the untokenized text
df_merge_col['tokenized_unstemmed_unstopwords_all'] = df_merge_col['text_normal'].apply(_normal_text_to_tokens)

#The rebuilt token list joined into one space-separated string
df_merge_col['text_ngrams'] = df_merge_col['tokenized_unstemmed_unstopwords_all'].apply(' '.join)

In [None]:
#Select the pages of certain document types and shuffle the dataframe randomly
is_selected_type = (
    (df_merge_col['levering'] == 1)
    | (df_merge_col['splitsing'] == 1)
    | (df_merge_col['uitgifte'] == 1)
)
df_merge_col = df_merge_col[is_selected_type]
#random_state pins the shuffle so the same input gives the same row order on every run
df_merge_col = shuffle(df_merge_col, random_state=42)

In [None]:
#Write the final dataframe to disk as a pickle file
pd.to_pickle(df_merge_col, 'final_dataset.pickle')