In [30]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import sys
import re
import itertools
import operator
from tqdm import tqdm
import os
import pickle
import seaborn as sns

from gensim.sklearn_api import D2VTransformer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

# Part of the current working directory that precedes the 'Master-Thesis' folder,
# written with forward slashes (the whole directory when that name is absent).
base = os.getcwd().partition('Master-Thesis')[0].replace('\\', '/')
# Put the project's pre-processing folder first on the module search path.
sys.path.insert(0, ''.join([base, '/Master-Thesis/research/pre-processing']))

from pre_processing_functions import *
from model_functions import *

In [31]:
# Locations of the pickled annotations and of the pickled final dataset,
# both built by appending a fixed relative path to `base`.
path_df_anno, path_dataset = (
    base + relative_path
    for relative_path in (
        '/Master-Thesis/research/pre-processing/complete_annotations.pickle',
        '/Master-Thesis/research/pre-processing/final_dataset.pickle',
    )
)

In [32]:
# Load the annotations and the complete dataset from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df_annotations = pd.read_pickle(path_df_anno)
df_final = pd.read_pickle(path_dataset)

# The test set is the tail of the dataset, starting 80% of the way through its rows.
test_start = int(len(df_final) * 0.8)
df_test = df_final[test_start:]

In [33]:
def _first_row_without_leading_column(csv_path):
    """Read ``csv_path`` and return its first data row with the first column dropped."""
    table = np.array(pd.read_csv(csv_path))
    return table[0][1:]


# Predictions saved by RQ3.ipynb, one CSV file per vectorization method
prediction_ngram = _first_row_without_leading_column('predictionngram.csv')
prediction_tfidf = _first_row_without_leading_column('predictiontfidf.csv')
prediction_d2v = _first_row_without_leading_column('predictiond2v.csv')
# Ground-truth labels of the test set.
# NOTE(review): this Series keeps df_test's original index labels; confirm that
# TPFN addresses it by position rather than by label.
labels_test = df_test['check_relevant']

## False Negatives and True Positives

In [None]:
# Run TPFN once per vectorization method against the test labels; each call
# yields a pair that is unpacked as (true-positive indexes, false-negative indexes).
(TP_ngram, FN_ngram), (TP_tfidf, FN_tfidf), (TP_d2v, FN_d2v) = [
    TPFN(prediction, labels_test)
    for prediction in (prediction_ngram, prediction_tfidf, prediction_d2v)
]

In [None]:
# Keep only the indexes that all three vectorization methods have in common:
# the false negatives shared by every method, and likewise the true positives.
# The result is sorted because iterating a set is not guaranteed to yield the
# indexes in ascending order; sorting keeps the rows later selected from df_test
# in their original order.
all_FN_list = sorted(set(FN_d2v) & set(FN_tfidf) & set(FN_ngram))
all_TP_list = sorted(set(TP_d2v) & set(TP_tfidf) & set(TP_ngram))

In [None]:
# Pull the test-set rows at the shared positions, separately for FN and TP
# (positional lookup, so the lists must hold row positions within df_test)
all_FN_df = df_test.iloc[all_FN_list]
all_TP_df = df_test.iloc[all_TP_list]

In [None]:
# Flag columns that mark the document type of a page
_DOCUMENT_TYPE_COLUMNS = ('levering', 'splitsing', 'uitgifte')


def _rows_of_single_type(pages, wanted):
    """Return the rows of ``pages`` whose ``wanted`` flag equals 1 while every
    other document-type flag is not 1."""
    keep = pages[wanted] == 1
    for column in _DOCUMENT_TYPE_COLUMNS:
        if column != wanted:
            keep = keep & (pages[column] != 1)
    return pages[keep]


# Separate dataframes per document type, for the TP frame ...
all_TP_df_uitgifte = _rows_of_single_type(all_TP_df, 'uitgifte')
all_TP_df_splitsing = _rows_of_single_type(all_TP_df, 'splitsing')
all_TP_df_levering = _rows_of_single_type(all_TP_df, 'levering')

# ... and for the FN frame
all_FN_df_uitgifte = _rows_of_single_type(all_FN_df, 'uitgifte')
all_FN_df_splitsing = _rows_of_single_type(all_FN_df, 'splitsing')
all_FN_df_levering = _rows_of_single_type(all_FN_df, 'levering')

In [None]:
# Normalise the annotation filenames (surrounding whitespace removed, lower-cased)
# before they are used as a join key.
# NOTE(review): only the annotation side is normalised here; presumably the
# dataset's filenames are already in this form - verify.
df_annotations['filename'] = df_annotations['filename'].map(
    lambda name: name.strip().lower()
)
# Attach the individual highlights to their pages: inner join on filename + page
mergedFN = all_FN_df.merge(df_annotations, how='inner', on=['filename', 'page'])
mergedTP = all_TP_df.merge(df_annotations, how='inner', on=['filename', 'page'])

## Plots for error analysis

In [None]:
# Global plot styling: seaborn's whitegrid theme with enlarged fonts, then
# LaTeX text rendering and a serif font family for every matplotlib figure.
# (usetex needs a working LaTeX installation on the machine running the notebook.)
sns.set(font_scale=1.9, style="whitegrid")
plt.rcParams.update({'text.usetex': True, 'font.family': 'serif'})

In [None]:
#Plots the distributions of the pages for TP and FN
# An explicit figure/axes pair is used instead of pyplot's implicit "current
# figure", so the plot cannot end up on top of a figure left open by another cell.
fig, ax = plt.subplots()
sns.distplot(all_FN_df.page, label='False Negatives', ax=ax)
sns.distplot(all_TP_df.page, label='True Positives', ax=ax)

ax.set_ylabel('Density', fontsize=25)
# NOTE(review): y=-2 is kept from the original; matplotlib appears to reposition
# axis labels itself when drawing, so it may have no visible effect - confirm.
ax.set_xlabel('Page', fontsize=22, y=-2)
ax.tick_params(axis='both', labelsize=15)
ax.legend(loc='upper right', fontsize=15)
# Only the first 50 pages are shown
ax.set_xlim(0, 50)
fig.savefig('page_EA.png', bbox_inches='tight')

In [None]:
#Plots the distributions of the amount of unique words for TP and FN
# An explicit figure/axes pair is used instead of pyplot's implicit "current
# figure", so the plot cannot end up on top of a figure left open by another cell.
fig, ax = plt.subplots()
sns.distplot(all_FN_df.unique_words, label='False Negatives', ax=ax)
sns.distplot(all_TP_df.unique_words, label='True Positives', ax=ax)

ax.set_ylabel('Density', fontsize=25)
# NOTE(review): y=-2 is kept from the original; matplotlib appears to reposition
# axis labels itself when drawing, so it may have no visible effect - confirm.
ax.set_xlabel('Unique Words', fontsize=22, y=-2)
ax.tick_params(axis='both', labelsize=15)
ax.legend(loc='upper left', fontsize=15)
fig.savefig('uniquewords_EA.png', bbox_inches='tight')

In [None]:
#Count the merged highlight rows per date, separately for FN and TP
# (these frames are not used by the plot in this cell; they are kept because
# other cells may rely on the names)
count_FN_year = mergedFN.groupby(['date']).size().to_frame('count').reset_index()
count_TP_year = mergedTP.groupby(['date']).size().to_frame('count').reset_index()

#Plot the distribution of the year column of the FN and TP pages;
#entries equal to the placeholder 'No y' are skipped, the rest are cast to int
# An explicit figure/axes pair is used instead of pyplot's implicit "current
# figure", so the plot cannot end up on top of a figure left open by another cell.
fig, ax = plt.subplots()
sns.distplot([int(x) for x in all_FN_df.year if x != 'No y'],
             label='False Negatives', ax=ax)
sns.distplot([int(x) for x in all_TP_df.year if x != 'No y'],
             label='True Positives', ax=ax)

ax.set_ylabel('Density', fontsize=25)
# NOTE(review): y=-2 is kept from the original; matplotlib appears to reposition
# axis labels itself when drawing, so it may have no visible effect - confirm.
ax.set_xlabel('Year', fontsize=22, y=-2)
ax.tick_params(axis='both', labelsize=15)
ax.legend(loc='upper left', fontsize=15)
fig.savefig('year_EA.png', bbox_inches='tight')

In [None]:
#Count the merged rows (one per highlight) for every (filename, page) pair
count_FN_highlight = mergedFN.groupby(['filename','page']).size().to_frame('count').reset_index()
count_TP_highlight = mergedTP.groupby(['filename','page']).size().to_frame('count').reset_index()

#Plot distribution amount of highlights for FN and TP
# An explicit figure/axes pair is used instead of pyplot's implicit "current
# figure", so the plot cannot end up on top of a figure left open by another cell.
fig, ax = plt.subplots()
sns.distplot(np.array(count_FN_highlight['count']), label='False Negatives',
             bins=10, ax=ax)
sns.distplot(np.array(count_TP_highlight['count']), label='True Positives',
             bins=13, ax=ax)

ax.set_ylabel('Density', fontsize=25, fontweight='bold')
# NOTE(review): y=-2 is kept from the original; matplotlib appears to reposition
# axis labels itself when drawing, so it may have no visible effect - confirm.
ax.set_xlabel('Amount of Highlights on a page', fontsize=22, y=-2)
ax.tick_params(axis='both', labelsize=15)
ax.legend(loc='upper right', fontsize=15)
fig.savefig('highlights_EA.png', bbox_inches='tight')