In [None]:
import itertools
import json
import operator
import os
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rc, rcParams
from tqdm import tqdm

from gensim.sklearn_api import D2VTransformer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.utils import shuffle

# Everything in the current working directory path that precedes the 'Master-Thesis'
# checkout, with Windows back-slashes normalised to forward slashes.
# NOTE: if the working directory does not contain 'Master-Thesis', `base` is simply
# the whole working directory path.
base = os.getcwd().split('Master-Thesis')[0].replace('\\', '/')

# Make the project's pre-processing helper modules importable. The directory is derived
# from `base` instead of a user-specific absolute path, so the notebook also runs on
# machines other than the original author's (launch it from inside the checkout).
sys.path.insert(0, os.path.join(base, 'Master-Thesis', 'research', 'pre-processing'))

from pre_processing_functions import *
from model_functions import *

In [None]:
# Input locations: the pickled final dataset (inside the repository checkout) and the
# pickled RQ4 results (relative to the notebook's working directory).
path_dataset = f"{base}/Master-Thesis/research/pre-processing/final_dataset.pickle"
path_q4_results = "q4.pickle"

In [None]:
# Load the final dataset from its pickle file.
df_final = pd.read_pickle(path_dataset)

# 80/20 split by row order: the leading 80% of rows form the train/validation frame,
# the remaining 20% are held out as the test frame.
split_at = int(0.8 * len(df_final))
df_final_validation, df_test = df_final[:split_at], df_final[split_at:]

# Ground-truth labels of the held-out rows.
labels_test = df_test['check_relevant']

In [None]:
# Load the stored RQ4 results.
q4 = pd.read_pickle(path_q4_results)

# Render figure text through LaTeX and use a serif font family.
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
})

## Plots for RQ 4

In [None]:
# Paper figure for RQ4: precision per document type, with one bar per vectorization method.
sns.set(font_scale=1.9, style="whitegrid")
g = sns.catplot(
    data=q4,
    x="Type",
    y="precision",
    hue="Vectorization Method",
    kind="bar",
    palette="muted",
    height=6,
)
g.despine(left=True)

# Enlarge the tick labels on both axes.
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)

# Axis titles, then write the figure to disk.
g.set_xlabels("Type", fontsize=27)
g.set_ylabels("Precision", fontsize=30)
g.savefig("q4_precision.png")

## Plots for Chi2 features

In [None]:
# Plot the 20 word features with the highest chi-squared scores under TF-IDF vectorization.
vectorizer = TfidfVectorizer()

# Keep only rows flagged uitgifte == 1 whose levering and splitsing flags are not 1.
df_uitgifte = df_final[(df_final.levering != 1) & (df_final.splitsing != 1) & (df_final.uitgifte == 1)]

X_train_features = vectorizer.fit_transform(df_uitgifte.text_tokenized_joined)
y_train_labels = df_uitgifte.check_relevant

ch2 = SelectKBest(chi2, k=20)
X_train_features = ch2.fit_transform(X_train_features, y_train_labels)

# Rank the selected features by descending score and use the SAME ordering for scores and
# terms. (Previously the scores were sorted by value while the terms stayed in vocabulary
# order, so the bar labels did not belong to the bars they were drawn under.)
selected_idx = ch2.get_support(indices=True)
ranked_idx = selected_idx[np.argsort(-ch2.scores_[selected_idx], kind='mergesort')]

# `get_feature_names` was removed in newer scikit-learn releases; prefer its replacement
# when it is available and fall back to the old accessor otherwise.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = np.asarray(get_names())

top_ranked_scores = list(ch2.scores_[ranked_idx])
top_ranked_terms = list(feature_names[ranked_idx])

labels = top_ranked_terms
values = top_ranked_scores
indexes = np.arange(len(labels))

width = 0.5
plt.figure(figsize=(20, 5))
plt.bar(indexes, values, width)

# Bars are centred on `indexes`, so the ticks go there too; a single call sets positions,
# labels and styling (the earlier vertical-rotation call was immediately overridden).
plt.xticks(indexes, labels, fontsize=35, rotation=45, ha='right')
plt.yticks(fontsize=30)

plt.xlabel('Term', fontsize=40)
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
plt.ylabel(r"$\chi^2$ Score", fontsize=40)

plt.savefig('q4_uitgifte_chi.png', bbox_inches='tight')
plt.show()

### EDA plots for final dataset

In [None]:
# EDA: distribution of the `page` feature in the processed dataset, drawn separately for
# each value of the check_relevant label. Another feature (e.g. unique words) can be
# plotted the same way by swapping the column.
sns.set(font_scale=1.9, style="whitegrid")

df_check_true = df_final[df_final['check_relevant'] == True]
df_check_false = df_final[df_final['check_relevant'] == False]

number_of_pages = sorted(np.array(df_check_true['page']))
number_of_pagest = sorted(np.array(df_check_false['page']))

# Draw FALSE first and TRUE second so the drawing order stays as before.
for page_values, legend_label in ((number_of_pagest, 'FALSE'), (number_of_pages, 'TRUE')):
    sns.distplot(page_values, label=legend_label)

# Axis titles and tick styling.
plt.xlabel('Page Number', fontsize=22, y=-2)
plt.ylabel('Density', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

# Limit the visible x-range to 0..150, add the legend and save the figure.
plt.xlim(0, 150)
plt.legend(loc='upper right', fontsize=15)
plt.savefig('eda_page.png', bbox_inches='tight')