In [None]:
import itertools
import json
import operator
import os
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.sklearn_api import D2VTransformer
from matplotlib import rc,rcParams
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.utils import shuffle
from tqdm import tqdm

# Everything that precedes the first 'Master-Thesis' in the working directory is
# taken as the parent folder of the checkout; backslashes become forward slashes.
cwd_prefix = os.getcwd().partition('Master-Thesis')[0]
base = cwd_prefix.replace('\\', '/')

# Put the pre-processing folder at the front of the module search path.
sys.path.insert(0, base + '/Master-Thesis/research/pre-processing')

from pre_processing_functions import *
from model_functions import *

In [None]:
# Path of the pickled final dataset, built from the checkout parent folder `base`.
dataset_suffix = '/Master-Thesis/research/pre-processing/final_dataset.pickle'
path_dataset = base + dataset_suffix

In [None]:
# Load the pickled final dataset.
df_final = pd.read_pickle(path_dataset)

# Positional 80/20 cut, rows taken in their stored order: the leading 80% is the
# slice the models are fitted on, the trailing 20% is held out as the test set.
split_at = int(0.8 * len(df_final))
df_final_validation = df_final[:split_at]
df_test = df_final[split_at:]

# Ground-truth labels of the held-out rows.
labels_test = df_test.check_relevant

## RQ3 Different vectorization methods with categorical and numerical features

### TF-IDF

In [None]:
# TF-IDF hyperparameters: k_value is the `k` handed to SelectKBest and C_value
# the `C` handed to LogisticRegression when the pipeline is built.
k_value, C_value = 2500, 10

In [None]:
# TF-IDF model: three feature branches are concatenated by a FeatureUnion and
# fed to a logistic-regression classifier.

# Numeric branch: median imputation, then standardisation.
tfidf_numeric = ColumnTransformer([
    ("numerical",
     Pipeline(steps=[
         ("impute_stage", SimpleImputer(missing_values=np.nan, strategy="median")),
         ('scaler', StandardScaler()),
     ]),
     ["page", 'unique_words']),
])

# Categorical branch: constant-value imputation, then one-hot encoding that
# ignores categories not seen during fitting.
tfidf_categorical = ColumnTransformer([
    ("type",
     Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='constant', missing_values=np.nan)),
         ("ohe", OneHotEncoder(handle_unknown="ignore")),
     ]),
     ["uitgifte", 'splitsing', 'levering', 'year']),
])

# Text branch: TF-IDF over the "text_tokenized_joined" column, reduced to the
# k_value features with the highest chi2 score.
tfidf_text = ColumnTransformer([
    ("title_vec",
     Pipeline(steps=[
         ("tfidf", TfidfVectorizer()),
         ('dimred', SelectKBest(chi2, k=k_value)),
     ]),
     "text_tokenized_joined"),
])

model_pipeline_tfidf = Pipeline(steps=[
    ("features", FeatureUnion([
        ("numerical_features", tfidf_numeric),
        ("categorical_features", tfidf_categorical),
        ("text_features", tfidf_text),
    ])),
    ("classifiers", LogisticRegression(C=C_value)),
])


In [None]:
# Fit the TF-IDF pipeline on the training slice and predict the whole test set.
model_pipeline_tfidf.fit(df_final_validation, df_final_validation.check_relevant)
prediction = model_pipeline_tfidf.predict(df_test)

# Score against df_test's own labels instead of the notebook-global `labels_test`:
# that name is rebound to per-document-type subsets further down, so re-running
# this cell afterwards would compare the predictions with the wrong labels.
test_labels = df_test.check_relevant
f1 = f1_score(test_labels, prediction, pos_label=True)
recall = recall_score(test_labels, prediction, average="binary", pos_label=True)
precision = precision_score(test_labels, prediction, average="binary", pos_label=True)

# Persist the raw predictions (the array is wrapped in a list, i.e. one CSV row).
pd.DataFrame([prediction]).to_csv('predictiontfidf.csv')

# Show the scores as the cell output; f1/recall/precision are reassigned by the
# following evaluation cells, so this is the only place they are visible.
pd.Series({'precision': precision, 'recall': recall, 'f1': f1}, name='TF-IDF')

### Ngram

In [None]:
# Character-trigram hyperparameters: k_value is the `k` handed to SelectKBest and
# C_value the `C` handed to LogisticRegression when the pipeline is built.
k_value, C_value = 2500, 100

In [None]:
# Character-trigram model: three feature branches are concatenated by a
# FeatureUnion and fed to a logistic-regression classifier.

# Numeric branch: median imputation, then standardisation.
ngram_numeric = ColumnTransformer([
    ("numerical",
     Pipeline(steps=[
         ("impute_stage", SimpleImputer(missing_values=np.nan, strategy="median")),
         ('scaler', StandardScaler()),
     ]),
     ["page", 'unique_words']),
])

# Categorical branch: constant-value imputation, then one-hot encoding that
# ignores categories not seen during fitting.
ngram_categorical = ColumnTransformer([
    ("type",
     Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='constant', missing_values=np.nan)),
         ("ohe", OneHotEncoder(handle_unknown="ignore")),
     ]),
     ["uitgifte", 'splitsing', 'levering', 'year']),
])

# Text branch: TF-IDF over character 3-grams of the "text_ngrams" column,
# reduced to the k_value features with the highest chi2 score.
ngram_text = ColumnTransformer([
    ("title_vec",
     Pipeline(steps=[
         ('tfidf', TfidfVectorizer(ngram_range=(3, 3), analyzer='char')),
         ("dimred", SelectKBest(chi2, k=k_value)),
     ]),
     "text_ngrams"),
])

model_pipeline_ngram_f_dimred = Pipeline(steps=[
    ("features", FeatureUnion([
        ("numerical_features", ngram_numeric),
        ("categorical_features", ngram_categorical),
        ("text_features", ngram_text),
    ])),
    ("classifiers", LogisticRegression(C=C_value)),
])

In [None]:
# Fit the character-trigram pipeline on the training slice and predict the whole test set.
model_pipeline_ngram_f_dimred.fit(df_final_validation, df_final_validation.check_relevant)
prediction = model_pipeline_ngram_f_dimred.predict(df_test)

# Score against df_test's own labels instead of the notebook-global `labels_test`:
# that name is rebound to per-document-type subsets further down, so re-running
# this cell afterwards would compare the predictions with the wrong labels.
test_labels = df_test.check_relevant
f1 = f1_score(test_labels, prediction, pos_label=True)
recall = recall_score(test_labels, prediction, average="binary", pos_label=True)
precision = precision_score(test_labels, prediction, average="binary", pos_label=True)

# Persist the raw predictions (the array is wrapped in a list, i.e. one CSV row).
pd.DataFrame([prediction]).to_csv('predictionngram.csv')

# Show the scores as the cell output; f1/recall/precision are reassigned by the
# following evaluation cells, so this is the only place they are visible.
pd.Series({'precision': precision, 'recall': recall, 'f1': f1}, name='TRIGRAM')

### Doc2vec

In [None]:
# Doc2Vec hyperparameters: sample_value is meant for the transformer's `sample`
# argument, C_value is the `C` handed to LogisticRegression.
sample_value, C_value = 0.13, 1000

In [None]:
# Doc2Vec model: numeric, categorical and Doc2Vec text branches are concatenated
# by a FeatureUnion and fed to a logistic-regression classifier.
model_pipeline_d2v_features = Pipeline(steps=[
    ("features", FeatureUnion([
        # Numeric branch: median imputation, then standardisation.
        ("numerical_features", ColumnTransformer([
            ("numerical",
             Pipeline(steps=[
                 ("impute_stage", SimpleImputer(missing_values=np.nan, strategy="median")),
                 ('scaler', StandardScaler()),
             ]),
             ["page", 'unique_words']),
        ])),

        # Categorical branch: constant-value imputation, then one-hot encoding
        # that ignores categories not seen during fitting.
        ("categorical_features", ColumnTransformer([
            ("type",
             Pipeline(steps=[
                 ('imputer', SimpleImputer(strategy='constant', missing_values=np.nan)),
                 ("ohe", OneHotEncoder(handle_unknown="ignore")),
             ]),
             ["uitgifte", 'splitsing', 'levering', 'year']),
        ])),

        # Text branch: unlike the other two it is not wrapped in a ColumnTransformer,
        # so the transformer is handed the complete input frame.
        # NOTE(review): Doc2VecTransformer is not imported by name in this notebook;
        # presumably it comes from the wildcard import of model_functions - confirm.
        ("text_features", Pipeline(steps=[
            # Was `sample = .sample_value`, which is a syntax error; the
            # hyperparameter defined in the preceding cell is passed here.
            ("doc2vec", Doc2VecTransformer(sample=sample_value)),
        ])),
    ])),

    ("classifiers", LogisticRegression(C=C_value)),
])

In [None]:
# Fit the Doc2Vec pipeline on the training slice and predict the whole test set.
model_pipeline_d2v_features.fit(df_final_validation, df_final_validation.check_relevant)
prediction = model_pipeline_d2v_features.predict(df_test)

# Score against df_test's own labels instead of the notebook-global `labels_test`:
# that name is rebound to per-document-type subsets further down, so re-running
# this cell afterwards would compare the predictions with the wrong labels.
test_labels = df_test.check_relevant
f1 = f1_score(test_labels, prediction, pos_label=True)
recall = recall_score(test_labels, prediction, average="binary", pos_label=True)
precision = precision_score(test_labels, prediction, average="binary", pos_label=True)

# Persist the raw predictions (the array is wrapped in a list, i.e. one CSV row).
pd.DataFrame([prediction]).to_csv('predictiond2v.csv')

# Show the scores as the cell output; f1/recall/precision are reassigned by the
# following evaluation cells, so this is the only place they are visible.
pd.Series({'precision': precision, 'recall': recall, 'f1': f1}, name='DBOW')

## Question 4: Performance across different document types

In [None]:
# Accumulator for the per-document-type scores: one (initially empty) list per
# column of the comparison table that is plotted later.
compare_columns = ('precision', 'recall', 'f1', 'Vectorization Method', 'Type')
dict_compare = {column: [] for column in compare_columns}

### Uitgifte

In [None]:
# Test rows flagged as uitgifte and as neither levering nor splitsing.
uitgifte_only = (
    (df_test.levering != 1)
    & (df_test.splitsing != 1)
    & (df_test.uitgifte == 1)
)
df_test_uitgifte = df_test[uitgifte_only]

# Labels and tag used by the evaluation cells of this section.
labels_test = df_test_uitgifte.check_relevant
doc_type = 'uitgifte'

#### TF-IDF

In [None]:
# TF-IDF pipeline scored on the uitgifte-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_tfidf.predict(df_test_uitgifte)
subset_labels = df_test_uitgifte.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('uitgifte')
dict_compare['Vectorization Method'].append('TF-IDF')

#### Ngram

In [None]:
# Character-trigram pipeline scored on the uitgifte-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_ngram_f_dimred.predict(df_test_uitgifte)
subset_labels = df_test_uitgifte.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('uitgifte')
dict_compare['Vectorization Method'].append('TRIGRAM')

#### D2V

In [None]:
# Doc2Vec pipeline scored on the uitgifte-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_d2v_features.predict(df_test_uitgifte)
subset_labels = df_test_uitgifte.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('uitgifte')
dict_compare['Vectorization Method'].append('DBOW')

### Splitsing

In [None]:
# Test rows flagged as splitsing and as neither levering nor uitgifte.
splitsing_only = (
    (df_test.levering != 1)
    & (df_test.splitsing == 1)
    & (df_test.uitgifte != 1)
)
df_test_splitsing = df_test[splitsing_only]

# Labels and tag used by the evaluation cells of this section.
labels_test = df_test_splitsing.check_relevant
doc_type = 'splitsing'

#### TF-IDF

In [None]:
# TF-IDF pipeline scored on the splitsing-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_tfidf.predict(df_test_splitsing)
subset_labels = df_test_splitsing.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('splitsing')
dict_compare['Vectorization Method'].append('TF-IDF')

#### Ngram

In [None]:
# Character-trigram pipeline scored on the splitsing-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_ngram_f_dimred.predict(df_test_splitsing)
subset_labels = df_test_splitsing.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('splitsing')
dict_compare['Vectorization Method'].append('TRIGRAM')

#### D2V

In [None]:
# Doc2Vec pipeline scored on the splitsing-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_d2v_features.predict(df_test_splitsing)
subset_labels = df_test_splitsing.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('splitsing')
dict_compare['Vectorization Method'].append('DBOW')

### Levering

In [None]:
# Test rows flagged as levering and as neither splitsing nor uitgifte.
levering_only = (
    (df_test.levering == 1)
    & (df_test.splitsing != 1)
    & (df_test.uitgifte != 1)
)
df_test_levering = df_test[levering_only]

# Labels and tag used by the evaluation cells of this section.
labels_test = df_test_levering.check_relevant
doc_type = 'levering'

#### TF-IDF

In [None]:
# TF-IDF pipeline scored on the levering-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_tfidf.predict(df_test_levering)
subset_labels = df_test_levering.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('levering')
dict_compare['Vectorization Method'].append('TF-IDF')

#### Ngram

In [None]:
# Character-trigram pipeline scored on the levering-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_ngram_f_dimred.predict(df_test_levering)
subset_labels = df_test_levering.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('levering')
dict_compare['Vectorization Method'].append('TRIGRAM')

#### D2V

In [None]:
# Doc2Vec pipeline scored on the levering-only test subset.
# Labels and document type come from this subset directly rather than from the
# notebook-global `labels_test` / `doc_type`, which every document-type section
# rebinds; the cell therefore stays correct when sections are re-run out of order.
prediction = model_pipeline_d2v_features.predict(df_test_levering)
subset_labels = df_test_levering.check_relevant
f1 = f1_score(subset_labels, prediction, pos_label=True)
recall = recall_score(subset_labels, prediction, average="binary", pos_label=True)
precision = precision_score(subset_labels, prediction, average="binary", pos_label=True)

# Add one row to the comparison table.
# NOTE: these are appends, so re-running the cell adds a duplicate row.
dict_compare['precision'].append(precision)
dict_compare['recall'].append(recall)
dict_compare['f1'].append(f1)
dict_compare['Type'].append('levering')
dict_compare['Vectorization Method'].append('DBOW')

In [None]:
# Turn the collected scores into a table (one column per dict_compare key)
# and store it on disk for plotting.
q4 = pd.DataFrame(dict_compare)
q4.to_pickle('q4.pickle')