In [None]:
import itertools
import json
import operator
import os
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import rc,rcParams
from tqdm import tqdm

from gensim.sklearn_api import D2VTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.utils import shuffle

# Repository root prefix: the part of the working directory that precedes the
# first 'Master-Thesis', with backslashes normalised to forward slashes.
working_dir = os.getcwd()
base = working_dir.partition('Master-Thesis')[0].replace('\\', '/')

# Put the pre-processing folder first on the module search path so that the
# project modules imported right after this resolve from there.
sys.path.insert(0, base + '/Master-Thesis/research/pre-processing')

from pre_processing_functions import *
from model_functions import *

In [None]:
# Path to the pickled final dataset, built from the repository root prefix `base`.
dataset_location = '/Master-Thesis/research/pre-processing/final_dataset.pickle'
path_dataset = base + dataset_location

In [None]:
# Load the final dataset from the pickle at `path_dataset`.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# comes from a trusted source.
df_final = pd.read_pickle(path_dataset)

# 80/20 split by row position: the first 80% of the rows are used to fit the
# models (kept under the existing name `df_final_validation`), the remaining
# 20% are held out as the test set.
# NOTE(review): the rows are not shuffled before splitting - presumably the
# pickled frame is already in the intended order; confirm.
split_index = int(0.8 * len(df_final))
df_final_validation = df_final.iloc[:split_index]
df_test = df_final.iloc[split_index:]

# Ground-truth labels of the held-out rows, used by every evaluation cell.
labels_test = df_test.check_relevant

## RQ 2: Different Vectorization Methods

### TF-IDF

In [None]:
# Hyperparameters
# k_value is passed as `k` to the SelectKBest(chi2, ...) step of the TF-IDF pipeline,
# i.e. the number of features that the selection step keeps.
k_value = 5000

In [None]:
# TF-IDF vectorisation -> chi-squared selection of the top `k_value` features
# -> logistic regression with default settings.
tfidf_steps = [
    ('tfidf', TfidfVectorizer()),
    ('dimred', SelectKBest(score_func=chi2, k=k_value)),
    ('logistic', LogisticRegression()),
]
pipe_tfidf = Pipeline(steps=tfidf_steps)

In [None]:
# Fit the TF-IDF pipeline on the `text_tokenized_joined` column of the first
# 80% of the rows, with `check_relevant` as the target.
pipe_tfidf.fit(
    X=df_final_validation['text_tokenized_joined'],
    y=df_final_validation['check_relevant'],
)

In [None]:
# Score the fitted TF-IDF pipeline on the held-out rows.
prediction = pipe_tfidf.predict(df_test.text_tokenized_joined)

# Binary metrics for the positive class (label True). `average` is spelled out
# on all three calls so they are visibly computed the same way ("binary" is
# also the scikit-learn default).
f1 = f1_score(labels_test, prediction, average="binary", pos_label=True)
recall = recall_score(labels_test, prediction, average="binary", pos_label=True)
precision = precision_score(labels_test, prediction, average="binary", pos_label=True)

# `prediction`, `f1`, `recall` and `precision` are reassigned by the later
# evaluation cells, so keep a section-specific copy of the scores and show it
# as the cell output.
metrics_tfidf = pd.Series({'f1': f1, 'recall': recall, 'precision': precision}, name='tfidf')
metrics_tfidf

### N-grams

In [None]:
# Hyperparameters
# k_value is passed as `k` to the SelectKBest(chi2, ...) step of the n-gram pipeline,
# i.e. the number of features that the selection step keeps.
k_value = 5000

In [None]:
# Character 4-gram TF-IDF -> chi-squared selection of the top `k_value`
# features -> logistic regression with default settings.
char_ngram_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(4, 4))
pipe_grid_ngram = Pipeline(steps=[
    ('tfidf', char_ngram_vectorizer),
    ('dimred', SelectKBest(score_func=chi2, k=k_value)),
    ('logistic', LogisticRegression()),
])



In [None]:
# Fit the n-gram pipeline on the `text_ngrams` column of the first 80% of the
# rows, with `check_relevant` as the target.
pipe_grid_ngram.fit(
    X=df_final_validation['text_ngrams'],
    y=df_final_validation['check_relevant'],
)

In [None]:
# Score the fitted n-gram pipeline on the held-out rows.
prediction = pipe_grid_ngram.predict(df_test.text_ngrams)

# Binary metrics for the positive class (label True). `average` is spelled out
# on all three calls so they are visibly computed the same way ("binary" is
# also the scikit-learn default).
f1 = f1_score(labels_test, prediction, average="binary", pos_label=True)
recall = recall_score(labels_test, prediction, average="binary", pos_label=True)
precision = precision_score(labels_test, prediction, average="binary", pos_label=True)

# `prediction`, `f1`, `recall` and `precision` are shared with the other
# evaluation cells and get overwritten, so keep a section-specific copy of the
# scores and show it as the cell output.
metrics_ngram = pd.Series({'f1': f1, 'recall': recall, 'precision': precision}, name='ngram')
metrics_ngram

### Doc2Vec (D2V)

In [None]:
# Hyperparameters
# C_value is passed as `C` to the LogisticRegression step of the Doc2Vec pipeline.
# In scikit-learn, C is the inverse of the regularization strength, so a large
# value means weak regularization.
C_value = 1000

In [None]:
# Doc2Vec document embedding -> logistic regression with C = `C_value`.
# NOTE(review): `Doc2VecTransformer` is not imported by name in this notebook;
# presumably it comes from one of the project star imports - confirm.
doc2vec_step = ('doc2vec', Doc2VecTransformer(text='text_tokenized'))
classifier_step = ('logistic', LogisticRegression(C=C_value))
piped2v = Pipeline(steps=[doc2vec_step, classifier_step])

In [None]:
# Fit the Doc2Vec pipeline. The whole training frame is handed over as X.
# NOTE(review): presumably the transformer only reads the column named by its
# `text` argument, since X also contains the `check_relevant` target - confirm.
piped2v.fit(
    X=df_final_validation,
    y=df_final_validation['check_relevant'],
)

In [None]:
# Score the fitted Doc2Vec pipeline on the held-out rows (whole frame as input,
# matching how the pipeline was fitted).
prediction = piped2v.predict(df_test)

# Binary metrics for the positive class (label True). `average` is spelled out
# on all three calls so they are visibly computed the same way ("binary" is
# also the scikit-learn default).
f1 = f1_score(labels_test, prediction, average="binary", pos_label=True)
recall = recall_score(labels_test, prediction, average="binary", pos_label=True)
precision = precision_score(labels_test, prediction, average="binary", pos_label=True)

# `prediction`, `f1`, `recall` and `precision` are shared with the other
# evaluation cells and get overwritten, so keep a section-specific copy of the
# scores and show it as the cell output.
metrics_d2v = pd.Series({'f1': f1, 'recall': recall, 'precision': precision}, name='d2v')
metrics_d2v