In [None]:
import itertools
import json
import operator
import os
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rc,rcParams
from tqdm import tqdm

from gensim.sklearn_api import D2VTransformer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.utils import shuffle

#Everything in the current working directory path that precedes 'Master-Thesis', with
#backslashes replaced by forward slashes. If the working directory does not contain
#'Master-Thesis', this is simply the whole working directory path.
base = os.getcwd().split('Master-Thesis')[0].replace('\\', '/')
#Put the pre-processing folder first on the module search path
#(presumably where the two modules imported below live - confirm)
sys.path.insert(0, base + '/Master-Thesis/research/pre-processing')

#NOTE(review): star imports hide where names come from. gridSearch_to_df and Doc2VecTransformer,
#used further down, are not defined in this notebook and presumably come from these modules - confirm
from pre_processing_functions import *
from model_functions import *

In [None]:
#Path of the pickled final dataset, built from the repository root prefix `base`
path_dataset = ''.join((base, '/Master-Thesis/research/pre-processing/final_dataset.pickle'))

In [None]:
#Reading and defining data
#NOTE(review): loading a pickle can execute arbitrary code - only read files from a trusted source
df_final = pd.read_pickle(path_dataset)

#80/20 split by row position: the first 80% of the rows are the training/validation part that
#the grid searches are fitted on, the remaining 20% are held out as the test set.
#NOTE(review): the rows are split in their stored order (no shuffling here) - confirm the
#dataset was shuffled before it was pickled
split_index = int(0.8 * len(df_final))
df_final_val = df_final.iloc[:split_index]    #name used by the grid search cells
df_final_validation = df_final_val            #original name, kept as an alias
df_test = df_final.iloc[split_index:]
labels_test = df_test.check_relevant

In [None]:
#Settings shared by the GridSearchCV calls in this notebook
scoring = ['f1', 'recall', 'precision']   #metrics computed for every parameter combination
cv = 4                                    #number of cross-validation folds
refit = 'f1'                              #metric used to select and refit the best estimator

# TFIDF Grid Search

In [None]:
#Search space for the TFIDF grid search
#Candidate numbers of features to keep
N_FEATURES_OPTIONS = [
    100, 200, 500,
    1000, 2500, 5000,
]
#Candidate values of the classifier's C parameter
C_OPTIONS = [
    0.1, 1, 10,
    100, 1000, 10000,
]

In [None]:
#Pipeline: TFIDF vectorisation -> chi2 selection of the k best features -> logistic regression
pipe = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("dimred", SelectKBest(score_func=chi2)),
    ("logistic", LogisticRegression()),
])

#Grid: C of the logistic regression x number of features k kept by SelectKBest
parameters = [
    dict(logistic__C=C_OPTIONS, dimred__k=N_FEATURES_OPTIONS),
]

In [None]:
#Grid search and save results
grid_tfidf = GridSearchCV(pipe, param_grid=parameters, scoring=scoring, cv=cv, refit=refit)
#Fit on the training/validation part of the data and its check_relevant labels.
#NOTE(review): text_tokenized_joined is the column that the TFIDF + features pipeline
#vectorises - confirm it is also the intended text input for this plain TFIDF search
grid_tfidf.fit(
    np.array(df_final_validation.text_tokenized_joined),
    np.array(df_final_validation.check_relevant),
)

#Convert the search results with gridSearch_to_df and pickle them
df_idf = gridSearch_to_df(grid_tfidf)
df_idf.to_pickle('grid_tfidf.pickle')

# TFIDF with categorical and numerical features Grid Search

In [None]:
#Search space for the TFIDF + categorical/numerical features grid search
#Candidate numbers of TFIDF features to keep
N_FEATURES_OPTIONS = [
    100, 200, 500,
    1000, 2500, 5000,
]
#Candidate values of the classifier's C parameter
C_OPTIONS = [
    0.1, 1, 10,
    100, 1000, 10000,
]

In [None]:
#Pipeline: numerical, categorical and TFIDF text features are built side by side,
#concatenated by a FeatureUnion and passed to a logistic regression

#Numerical columns: fill missing values with the median, then standardise
numerical_branch = ColumnTransformer(transformers=[
    ("numerical",
     Pipeline(steps=[
         ("impute_stage", SimpleImputer(strategy="median", missing_values=np.nan)),
         ("scaler", StandardScaler()),
     ]),
     ["page", "unique_words"]),
])

#Categorical columns: fill missing values with a constant, then one-hot encode
#(categories not seen during fit are ignored when transforming)
categorical_branch = ColumnTransformer(transformers=[
    ("type",
     Pipeline(steps=[
         ("imputer", SimpleImputer(strategy="constant", missing_values=np.nan)),
         ("ohe", OneHotEncoder(handle_unknown="ignore")),
     ]),
     ["uitgifte", "splitsing", "levering", "year"]),
])

#Text column: TFIDF vectorisation, then chi2 selection of the k best features
text_branch = ColumnTransformer(transformers=[
    ("title_vec",
     Pipeline(steps=[
         ("tfidf", TfidfVectorizer()),
         ("dimred", SelectKBest(score_func=chi2)),
     ]),
     "text_tokenized_joined"),
])

model_pipeline_tfidf_f_dimred_tfidfonly = Pipeline(steps=[
    ("features", FeatureUnion(transformer_list=[
        ("numerical_features", numerical_branch),
        ("categorical_features", categorical_branch),
        ("text_features", text_branch),
    ])),
    ("classifiers", LogisticRegression()),
])

#Grid: C of the classifier x number of TFIDF features kept (only the text branch is reduced)
parameters = [{
    "classifiers__C": C_OPTIONS,
    "features__text_features__title_vec__dimred__k": N_FEATURES_OPTIONS,
}]


In [None]:
#Cross-validated grid search over the TFIDF + features pipeline
grid_tfidf_features_dimr_tfidfonly = GridSearchCV(
    model_pipeline_tfidf_f_dimred_tfidfonly,
    param_grid=parameters,
    scoring=scoring,
    cv=cv,
    refit=refit,
)
grid_tfidf_features_dimr_tfidfonly.fit(df_final_val, df_final_val['check_relevant'])

#Convert the search results with gridSearch_to_df and pickle them
df_idf_features = gridSearch_to_df(grid_tfidf_features_dimr_tfidfonly)
df_idf_features.to_pickle('grid_tfidf_features.pickle')

# Doc2vec with stemming and stop word removal Grid Search

In [None]:
#Search space for the Doc2vec grid search (stemmed text, stop words removed)
#C of the classifier is held at a single value
C_OPTIONS = [1000]
#Candidate values for the doc2vec `sample` parameter
N_SAMPLE_OPTIONS = [
    0.00001,
    0.0001,
    0.001,
    0.01,
    0.1,
]

In [None]:
#Pipeline: document vectors from the 'text_tokenized' column -> logistic regression
#(Doc2VecTransformer is not defined in this notebook; presumably it comes from model_functions)
piped2v = Pipeline(steps=[
    ("doc2vec", Doc2VecTransformer(text="text_tokenized")),
    ("logistic", LogisticRegression()),
])

#Grid: C of the classifier x `sample` of the doc2vec step
parameters = [
    dict(logistic__C=C_OPTIONS, doc2vec__sample=N_SAMPLE_OPTIONS),
]

In [None]:
#Cross-validated grid search over the Doc2vec pipeline
grid_d2v = GridSearchCV(
    piped2v,
    param_grid=parameters,
    scoring=scoring,
    cv=cv,
    refit=refit,
)
grid_d2v.fit(df_final_val, df_final_val['check_relevant'])

#Convert the search results with gridSearch_to_df and pickle them
df_d2v = gridSearch_to_df(grid_d2v)
df_d2v.to_pickle('grid_validation_d2v_stem_stop_sample.pickle')

# Doc2vec without stemming and stop word removal Grid Search

In [None]:
#Search space for the Doc2vec grid search (no stemming, stop words kept)
#C of the classifier is held at a single value
C_OPTIONS = [1000]
#Candidate values for the doc2vec `sample` parameter
N_SAMPLE_OPTIONS = [
    0.00001,
    0.0001,
    0.001,
    0.01,
    0.1,
]

In [None]:
#Pipeline: document vectors (Doc2VecTransformer with its default settings) -> logistic regression
piped2v = Pipeline(steps=[
    ("doc2vec", Doc2VecTransformer()),
    ("logistic", LogisticRegression()),
])

#Grid: C of the classifier x `sample` of the doc2vec step
parameters = [
    dict(logistic__C=C_OPTIONS, doc2vec__sample=N_SAMPLE_OPTIONS),
]


In [None]:
#Cross-validated grid search over the Doc2vec pipeline (default text column)
grid_d2v = GridSearchCV(
    piped2v,
    param_grid=parameters,
    scoring=scoring,
    cv=cv,
    refit=refit,
)
grid_d2v.fit(df_final_val, df_final_val['check_relevant'])

#Convert the search results with gridSearch_to_df and pickle them
df_d2v = gridSearch_to_df(grid_d2v)
df_d2v.to_pickle('grid_validation_d2v_NO_stem_stop_sample.05.pickle')

# Doc2vec with categorical and numerical features Grid Search

In [None]:
#Search space for the Doc2vec + categorical/numerical features grid search:
#a single value for C and a single value for the doc2vec `sample` parameter
C_OPTIONS = [
    1000,
]
N_SAMPLE_OPTIONS = [
    0.1,
]

In [None]:
#Pipeline: numerical, categorical and doc2vec text features are built side by side,
#concatenated by a FeatureUnion and passed to a logistic regression

#Numerical columns: fill missing values with the median, then standardise
numerical_branch = ColumnTransformer(transformers=[
    ("numerical",
     Pipeline(steps=[
         ("impute_stage", SimpleImputer(strategy="median", missing_values=np.nan)),
         ("scaler", StandardScaler()),
     ]),
     ["page", "unique_words"]),
])

#Categorical columns: fill missing values with a constant, then one-hot encode
#(categories not seen during fit are ignored when transforming)
categorical_branch = ColumnTransformer(transformers=[
    ("type",
     Pipeline(steps=[
         ("imputer", SimpleImputer(strategy="constant", missing_values=np.nan)),
         ("ohe", OneHotEncoder(handle_unknown="ignore")),
     ]),
     ["uitgifte", "splitsing", "levering", "year"]),
])

#Text: the doc2vec step is handed the union's full input and told to use 'text_tokenized'
text_branch = Pipeline(steps=[
    ("doc2vec", Doc2VecTransformer(text="text_tokenized")),
])

model_pipeline_d2v_f = Pipeline(steps=[
    ("features", FeatureUnion(transformer_list=[
        ("numerical_features", numerical_branch),
        ("categorical_features", categorical_branch),
        ("text_features", text_branch),
    ])),
    ("classifiers", LogisticRegression()),
])

#Grid: C of the classifier x `sample` of the doc2vec step
parameters = [{
    "classifiers__C": C_OPTIONS,
    "features__text_features__doc2vec__sample": N_SAMPLE_OPTIONS,
}]


In [None]:
#Cross-validated grid search over the Doc2vec + features pipeline
d2vgrid_features_vec_size = GridSearchCV(
    model_pipeline_d2v_f,
    param_grid=parameters,
    scoring=scoring,
    cv=cv,
    refit=refit,
)
d2vgrid_features_vec_size.fit(df_final_val, np.array(df_final_val['check_relevant']))

#Convert the search results with gridSearch_to_df and pickle them
d2vgrid_features_vec_size_df = gridSearch_to_df(d2vgrid_features_vec_size)
d2vgrid_features_vec_size_df.to_pickle('grid_d2vtt_features_sample.pickle')

# Character Ngrams Grid Search

In [None]:
#Search space for the character n-gram grid search
#Candidate numbers of features to keep
N_FEATURES_OPTIONS = [
    300, 500, 1000,
    2500, 5000,
]
#Candidate values of the classifier's C parameter
C_OPTIONS = [
    0.1, 1, 10,
    100, 1000, 10000,
]

In [None]:
#Pipeline: TFIDF over character trigrams -> chi2 selection of the k best features
#-> logistic regression
pipe_grid_ngram = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3, 3))),
    ("dimred", SelectKBest(score_func=chi2)),
    ("logistic", LogisticRegression()),
])

#Grid: C of the classifier x number of features k kept by SelectKBest
parameters = [
    dict(logistic__C=C_OPTIONS, dimred__k=N_FEATURES_OPTIONS),
]


In [None]:
#Grid search and save results
grid_ngram_kbest = GridSearchCV(pipe_grid_ngram, param_grid=parameters, scoring=scoring, cv=cv, refit=refit)
#Take the labels from the same frame as the texts so both always have the same rows
#(the bare name `labels` is not defined anywhere in this notebook)
grid_ngram_kbest.fit(df_final_val.text_ngrams, df_final_val.check_relevant)

#Convert the search results with gridSearch_to_df and pickle them
df_ngram = gridSearch_to_df(grid_ngram_kbest)
df_ngram.to_pickle('grid_ngram.pickle')

# Character Ngrams with categorical and numerical features Grid Search

In [None]:
#Search space for the character n-gram + categorical/numerical features grid search
#Candidate numbers of n-gram features to keep
N_FEATURES_OPTIONS = [
    300, 500, 1000,
    2500, 5000,
]
#Candidate values of the classifier's C parameter
C_OPTIONS = [
    0.1, 1, 10,
    100, 1000, 10000,
]

In [None]:
#Pipeline: numerical, categorical and character-trigram TFIDF features are built side by
#side, concatenated by a FeatureUnion and passed to a logistic regression

#Numerical columns: fill missing values with the median, then standardise
numerical_branch = ColumnTransformer(transformers=[
    ("numerical",
     Pipeline(steps=[
         ("impute_stage", SimpleImputer(strategy="median", missing_values=np.nan)),
         ("scaler", StandardScaler()),
     ]),
     ["page", "unique_words"]),
])

#Categorical columns: fill missing values with a constant, then one-hot encode
#(categories not seen during fit are ignored when transforming)
categorical_branch = ColumnTransformer(transformers=[
    ("type",
     Pipeline(steps=[
         ("imputer", SimpleImputer(strategy="constant", missing_values=np.nan)),
         ("ohe", OneHotEncoder(handle_unknown="ignore")),
     ]),
     ["uitgifte", "splitsing", "levering", "year"]),
])

#Text column: TFIDF over character trigrams, then chi2 selection of the k best features
text_branch = ColumnTransformer(transformers=[
    ("title_vec",
     Pipeline(steps=[
         ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3, 3))),
         ("dimred", SelectKBest(score_func=chi2)),
     ]),
     "text_ngrams"),
])

model_pipeline_ngram_f_dimred = Pipeline(steps=[
    ("features", FeatureUnion(transformer_list=[
        ("numerical_features", numerical_branch),
        ("categorical_features", categorical_branch),
        ("text_features", text_branch),
    ])),
    ("classifiers", LogisticRegression()),
])

#Grid: C of the classifier x number of n-gram features kept (only the text branch is reduced)
parameters = [{
    "classifiers__C": C_OPTIONS,
    "features__text_features__title_vec__dimred__k": N_FEATURES_OPTIONS,
}]

In [None]:
#Cross-validated grid search over the character n-gram + features pipeline
grid_ngram_features_dimr_tfidfonly = GridSearchCV(
    model_pipeline_ngram_f_dimred,
    param_grid=parameters,
    scoring=scoring,
    cv=cv,
    refit=refit,
)
grid_ngram_features_dimr_tfidfonly.fit(df_final_val, df_final_val['check_relevant'])

#Convert the search results with gridSearch_to_df and pickle them
df_ngram_grid_dimr_tfidf = gridSearch_to_df(grid_ngram_features_dimr_tfidfonly)
df_ngram_grid_dimr_tfidf.to_pickle('grid_ngram_features.pickle')