# Notebook 3: Feature Engineering

This notebook presents the execution of the various feature engineering methods that were suggested in the thesis sections 3.3 to 3.5.

After loading the preprocessed text from notebook #1, the feature engineering techniques are applied with varying document input sizes (notebook section 3.1). Based on the results, notebook section 3.2 presents an ensemble feature selection approach to construct a robust feature set. After applying the selected TF-IDF 1 method for feature extraction, the features are selected by the Boruta method and the average importance ranking of the remaining feature selection methods. Finally, notebook section 3.2.3 presents the saving of the transformed datasets which are subsequently used for the model building in notebook #4.

The results are reported in the thesis section 4.3.1.

Table of Contents:
* [3.1 Evaluate feature engineering methods over different input sizes](#sizes)
    * [3.1.1 Define functions for analysis](#define)
    * [3.1.2 Execution of methods](#execution)
    * [3.1.3 Print results of analysis](#print)
    
* [3.2 Ensemble feature selection technique](#ensemble)    
    * [3.2.1 TF-IDF 1 text extraction](#extract)
    * [3.2.2 Ensemble feature selection](#selection)
    * [3.2.3 Saving transformed datasets as pickles](#pickles)

In [None]:
# loading required modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from tqdm import tqdm_notebook
import seaborn as sns
from collections import Counter
import string
import pickle
import re
from copy import deepcopy
from memory_profiler import memory_usage
%load_ext memory_profiler
import openpyxl
from datetime import datetime
import statistics 
from scipy.sparse import csc_matrix, csr_matrix
from scipy.stats import rankdata
import pytz

import nltk
from nltk.corpus import stopwords, words, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus.reader.wordnet import WordNetError

from sklearn import utils
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2, RFECV, SelectFromModel

import gensim
from gensim.sklearn_api import TfIdfTransformer
from gensim import corpora
from gensim.matutils import corpus2csc
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import models
from gensim.corpora import Dictionary
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

from boruta import BorutaPy

# define functions for loading and saving objects
def save_pickle(objectname, picklename):
    """Serialize ``objectname`` with pickle and write it to the file ``picklename``.

    Args:
        objectname: Any picklable Python object.
        picklename: Target path, e.g. 'Pickles/contracts_labeled.pickle'.

    Example:
        save_pickle(contracts_labeled, 'Pickles/contracts_labeled.pickle')
    """
    # the context manager closes the file even if pickle.dump raises
    with open(picklename, "wb") as pickle_out:
        pickle.dump(objectname, pickle_out)
    print(picklename, 'successfully pickled.')
def load_pickle(picklename):
    """Load and return the object stored in the pickle file ``picklename``.

    NOTE: unpickling can execute arbitrary code - only load files from trusted sources.

    Example:
        contracts_labeled = load_pickle('Pickles/contracts_labeled.pickle')
    """
    # the context manager closes the file handle, which the previous version leaked
    with open(picklename, "rb") as pickle_in:
        return pickle.load(pickle_in)

# globally silence NumPy's floating-point warnings for divisions by zero and invalid operations
np.seterr(divide='ignore', invalid='ignore')

In [None]:
# Placeholders: the statements that define `data`, `labels` and `data_unlabeled` are not part of this cell.
# NOTE(review): presumably they are loaded with load_pickle from the output of notebook #1 - confirm the paths.
# load data 
# load labels 
# load data_unlabeled 
 
# ensure correct loading: `data` and `labels` must have the same length (labels are later stored as a column of data)
print(len(data), len(labels))
print(len(data_unlabeled))

# 3.1 Evaluate feature engineering methods over different input sizes <a id="sizes"></a>

## 3.1.1 Define functions for analysis <a id="define"></a>

In [None]:
# function to return the name(s) of an object
def namestr(obj):
    """Return the names of all module-level variables that are bound to exactly the object ``obj``."""
    matches = []
    for name, value in list(globals().items()):
        if value is obj:  # identity, not equality: only aliases of the very same object count
            matches.append(name)
    return matches

# function to retrieve stratified training and validation sample for certain document size
def stratified_data(number):
    """Draw a stratified training sample and a disjoint validation sample from ``data``.

    Reads the notebook globals ``data`` and ``labels`` and, as a side effect,
    stores the labels in the column ``data['Labels']``.

    Args:
        number: Desired number of training documents. If it equals ``len(data)``,
            only 85% of every class is drawn so that the remaining documents can
            serve as validation data.

    Returns:
        Tuple ``(df_stratified, df_validation)``; both keep the index of ``data``.
    """
    data['Labels'] = labels
    use_all = number == len(data)

    counts = stratified_counts(number)  # absolute number of samples per class (stratified)
    if use_all:
        # ensure that there is still validation data if all documents are selected
        counts = {k: round(v * 0.85) for k, v in counts.items()}

    # DataFrame.append was removed in pandas 2.0 -> collect the per-class samples and concatenate once;
    # every class contributes at least one document
    train_parts = [data[data['Labels'] == label].sample(max(amount, 1)) for label, amount in counts.items()]
    df_stratified = pd.concat(train_parts) if train_parts else pd.DataFrame(columns=data.columns)

    # ensure that documents are not selected in both subsets
    data_updated = data.drop(df_stratified.index.values)
    if use_all:
        return df_stratified, data_updated  # remaining data as validation data

    # the validation sample has 15% of the size of the training sample
    counts_val = stratified_counts(number * 0.15)
    valid_parts = [
        data_updated[data_updated['Labels'] == label].sample(max(amount, 1))
        for label, amount in counts_val.items()
    ]
    df_validation = pd.concat(valid_parts) if valid_parts else pd.DataFrame(columns=data.columns)
    return df_stratified, df_validation

# function to evaluate absolute number of samples from each class under the condition of stratification
def stratified_counts(number):
    """Return ``{label: sample count}`` so that ``number`` documents follow the class distribution of ``labels``.

    Reads the notebook global ``labels``; the keys are ordered by label.
    """
    total = len(labels)
    class_counts = Counter(labels)
    # relative class frequency times the requested size, rounded to whole documents
    return {label: round((class_counts[label] / total) * number) for label in sorted(class_counts)}

# function to define minimum frequency of term in analysis to be included in vectorization
def min_df(data, percentage):
    """Return ``percentage`` of ``len(data)`` rounded to an int (absolute minimum document frequency)."""
    return int(round(len(data) * percentage))

# function to save folds for consistent usage
def folds(data_X, data_y, n):
    """Create ``n`` shuffled, stratified folds with a fixed seed.

    Returns:
        Tuple ``(trains, valids, kf)``: the lists of training and validation
        index arrays per fold and the StratifiedKFold object itself.
    """
    kf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42)
    splits = list(kf.split(data_X, data_y))
    trains = [train_ids for train_ids, _ in splits]
    valids = [valid_ids for _, valid_ids in splits]
    return trains, valids, kf

# function to retrieve the memory usage
def get_memory(var):
    """Return the first number contained in the text ``var`` as float.

    The notebook passes the string form of a ``%memit -o`` result.

    Args:
        var: Text that contains the measurement, e.g. '... 123.45 MiB ...'.

    Returns:
        The first integer or decimal number found in ``var``.

    Raises:
        ValueError: If ``var`` contains no number.
    """
    # the dot is escaped so that it only matches a decimal point (an unescaped '.' matches any character)
    match = re.search(r'\d+(?:\.\d+)?', var)
    if match is None:
        raise ValueError('no number found in {!r}'.format(var))
    return float(match.group())

In [None]:
# feature extraction using the Bag-of-Words method (thesis section 3.3.1)
def bow(column):
    """Fit a uni-/bigram count vectorizer on ``dataset[column]`` and transform both data sets.

    Reads the notebook globals ``dataset``, ``validset`` and ``min_df_c``.

    Args:
        column: Name of the text column.

    Returns:
        List ``[dataset_count, features, validset_count, count]``: the transformed
        training data, the number of features, the transformed validation data
        and the fitted vectorizer.
    """
    count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=min_df_c, max_features=None)
    dataset_count = count.fit_transform(dataset[column])
    validset_count = count.transform(validset[column])  # validation data uses the training vocabulary
    # vocabulary_ exists in every scikit-learn version, whereas get_feature_names() was removed in 1.2
    features = len(count.vocabulary_)
    return [dataset_count, features, validset_count, count]

In [None]:
# feature extraction using the TF-IDF 1 method (thesis section 3.3.2)
def tf_idf_1(column):
    """Fit a uni-/bigram TF-IDF vectorizer on ``dataset[column]`` and transform both data sets.

    Reads the notebook globals ``dataset``, ``validset`` and ``min_df_c``.

    Args:
        column: Name of the text column.

    Returns:
        List ``[dataset_tfidf, features, validset_tfidf, tfidf]``: the transformed
        training data, the number of features, the transformed validation data
        and the fitted vectorizer.
    """
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=min_df_c, max_features=None,
                            norm='l2', smooth_idf=True, sublinear_tf=True)
    dataset_tfidf = tfidf.fit_transform(dataset[column])
    validset_tfidf = tfidf.transform(validset[column])  # validation data uses the training vocabulary and idf
    # vocabulary_ exists in every scikit-learn version, whereas get_feature_names() was removed in 1.2
    features = len(tfidf.vocabulary_)
    return [dataset_tfidf, features, validset_tfidf, tfidf]

In [None]:
# feature extraction using the TF-IDF 2 method (thesis section 3.3.2)

def tf_idf_2_pivot(data_X, column):
    """Return the average number of distinct whitespace-separated terms per document.

    The notebook passes the result as ``avg_pivot`` to ``tf_idf_2``.

    Args:
        data_X: Table (e.g. DataFrame) whose ``column`` holds one text per document.
        column: Name of the text column.

    Raises:
        statistics.StatisticsError: If the column contains no documents.
    """
    # iterate over the values themselves: label lookups by 0..n-1 fail for any other index
    terms_number = [len(set(text.split())) for text in data_X[column]]
    return statistics.mean(terms_number)

def _ragged_object_array(documents):
    """Store every bag-of-words document as one element of a 1-d object array (allows indexing by fold ids)."""
    arr = np.empty(len(documents), dtype=object)
    for position, doc in enumerate(documents):
        arr[position] = doc
    return arr


def tf_idf_2(column, avg_pivot, folds_train, folds_valid):
    """Extract features with gensim's TfIdfTransformer (smartirs='Ltu') using a cross-validated slope.

    Reads the notebook globals ``dataset``, ``validset``, ``dataset_y`` and ``min_df_c``.

    Args:
        column: Name of the text column.
        avg_pivot: Pivot handed to the transformer (see ``tf_idf_2_pivot``).
        folds_train: List of training index arrays, one per fold.
        folds_valid: List of validation index arrays, one per fold.

    Returns:
        List ``[dataset_tfidf, features, validset_tfidf, best_transformer]``.
    """
    # tokenize the training texts, merge frequent word pairs into bigram tokens and build the dictionary
    token_ = [text.split() for text in dataset[column]]
    bigram_phraser = Phraser(Phrases(token_, min_count=min_df_c))
    bigram_token = [bigram_phraser[sent] for sent in token_]
    dic = Dictionary(bigram_token)
    dic.filter_extremes(no_below=min_df_c)
    # explicit object array: np.array() on ragged documents fails on recent NumPy versions
    corpus = _ragged_object_array([dic.doc2bow(text) for text in bigram_token])

    # evaluate the best slope by cross-validation
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, class_weight='balanced')
    best_score = float('-inf')  # guarantees that a transformer is selected even if every score is 0
    best_transformer = None
    for slope in np.arange(0, 1.1, 0.1):
        tfidf = TfIdfTransformer(normalize=True, smartirs='Ltu', pivot=avg_pivot, slope=slope)
        scores = list()
        for train_ids, valid_ids in zip(folds_train, folds_valid):
            cv_clf = deepcopy(clf)
            train_X, valid_X = list(corpus[train_ids]), list(corpus[valid_ids])
            train_y, valid_y = dataset_y[train_ids], dataset_y[valid_ids]
            tfidf_vectorizer = tfidf.fit(train_X)
            train_X_tfidf = corpus2csc(tfidf_vectorizer.transform(train_X), num_terms=len(dic)).T
            valid_X_tfidf = corpus2csc(tfidf_vectorizer.transform(valid_X), num_terms=len(dic)).T
            cv_clf.fit(train_X_tfidf, train_y)
            pred = cv_clf.predict(valid_X_tfidf)
            scores.append(f1_score(valid_y, pred, average='weighted', labels=np.unique(valid_y)))
        results = statistics.mean(scores)
        if results > best_score:
            # remember the score as well; otherwise every later slope replaces the best one
            best_score = results
            best_transformer = tfidf  # transformer with best slope

    # tokenize the validation texts with the phraser fitted on the training data,
    # so that both sets are mapped to the same bigram tokens of the dictionary
    token_v = [text.split() for text in validset[column]]
    bigram_token_v = [bigram_phraser[sent] for sent in token_v]
    corpus_v = [dic.doc2bow(text) for text in bigram_token_v]

    # using selected best transformer for final feature extraction (refitted on all training documents)
    corpus_all = list(corpus)
    tfidf_vectorizer_best = best_transformer.fit(corpus_all)
    dataset_tfidf = corpus2csc(tfidf_vectorizer_best.transform(corpus_all), num_terms=len(dic)).T
    validset_tfidf = corpus2csc(tfidf_vectorizer_best.transform(corpus_v), num_terms=len(dic)).T
    features = len(dic)
    return [dataset_tfidf, features, validset_tfidf, best_transformer]

In [None]:
# feature extraction using the Doc2Vec method (thesis section 3.3.3)

def d2v(column):
    """Embed the documents with a concatenated DBOW + DM Doc2Vec model.

    Reads the notebook globals ``dataset``, ``validset``, ``dataset_y`` and ``min_df_c``.

    Args:
        column: Name of the text column.

    Returns:
        List ``[predictors_data, features, predictors_valid]``: the inferred vectors
        (lists of floats) for the training documents, the vector length and the
        inferred vectors for the validation documents.
    """
    data = dataset[column]
    valid = validset[column]
    # every training document is tagged with its class label
    data_set = [TaggedDocument(sentence.split(), [tag]) for sentence, tag in zip(data, dataset_y)]

    # the keyword was misspelled as 'epchs' for the DBOW model; both models now use 10 epochs
    model_dbow = Doc2Vec(data_set, vector_size=500, min_count=min_df_c, dm=0, epochs=10)
    model_dm = Doc2Vec(data_set, vector_size=500, min_count=min_df_c, dm=1, dm_mean=1, epochs=10)
    d2v_model = ConcatenatedDoc2Vec([model_dbow, model_dm]) #concatenating dbow and dm model as recommended in paper

    # infer embeddings for data and validation set
    predictors_data = [d2v_model.infer_vector(sentence.split()).tolist() for sentence in data]
    predictors_valid = [d2v_model.infer_vector(sentence.split()).tolist() for sentence in valid]

    features = len(predictors_data[0])
    return [predictors_data, features, predictors_valid]

In [None]:
# feature projection using the Principal Component Analysis (thesis section 3.4.1)

def PrincipalComponent(data_X, variance_percentage, valid_data):
    """Project both data sets onto the principal components fitted on ``data_X``.

    Args:
        data_X: Sparse training matrix.
        variance_percentage: Passed to PCA as ``n_components``.
        valid_data: Sparse validation matrix.

    Returns:
        List ``[data_pca, number of components, valid_pca]``.
    """
    dense_train = data_X.toarray()  # PCA is fitted on dense arrays
    dense_valid = valid_data.toarray()
    reducer = PCA(n_components=variance_percentage, random_state=42)
    reducer.fit(dense_train)
    data_pca, valid_pca = reducer.transform(dense_train), reducer.transform(dense_valid)
    return [data_pca, data_pca.shape[1], valid_pca]

In [None]:
# feature projection using the Linear Discriminant Analysis (thesis section 3.4.2)

def LinDisAn(data, variance_percentage, valid):
    """Project both data sets with a Linear Discriminant Analysis fitted on ``data`` and the global ``dataset_y``.

    Returns:
        Tuple ``(data_lda, number of dimensions, valid_lda)``.
    """
    # number of dimensions required to reach the requested share of explained variance
    n_comp = LDA_select_n(data, variance_percentage)
    projector = LDA(n_components=n_comp)
    projector.fit(data, dataset_y)
    data_lda = projector.transform(data)
    return data_lda, data_lda.shape[1], projector.transform(valid)

# retrieve number of dimensions required to achieve the given percentage of explained variance
def LDA_select_n(data, goal_var):
    """Return the smallest number of LDA dimensions whose explained variance ratios sum up to ``goal_var``.

    The LDA is fitted on ``data`` and the global ``dataset_y``. If the goal is
    never reached, the number of all available dimensions is returned.
    """
    lda = LDA(n_components=None)
    lda.fit(data, dataset_y)
    ratios = lda.explained_variance_ratio_
    total_variance = 0.0
    # successively add one dimension until the goal is reached
    for n_component, explained_variance in enumerate(ratios, start=1):
        total_variance += explained_variance
        if total_variance >= goal_var:
            return n_component
    return len(ratios)

In [None]:
# feature selection using the chi-square test (thesis section 3.5.1)

def ChiSquare(data_X, data_y, folds_train, folds_valid, valid):
    """Select the features with the highest chi-square statistic; their number is chosen by cross-validation.

    Args:
        data_X: Training feature matrix.
        data_y: Training labels (indexable by the fold index arrays).
        folds_train: List of training index arrays, one per fold.
        folds_valid: List of validation index arrays, one per fold.
        valid: Validation feature matrix that is transformed with the final selector.

    Returns:
        List ``[data_CS, number of selected features, valid_CS]``.
    """
    n_features = data_X.shape[1]
    # about 20 candidate sizes; round(n_features / 20) is 0 for few features, which np.arange rejects
    step = max(1, round(n_features / 20))
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, class_weight='balanced')

    # evaluate best number of selected features based on cross-validation
    best_score = 0
    best_n = 0
    for n in np.arange(1, n_features, step):
        scores = list()
        for train_ids, valid_ids in zip(folds_train, folds_valid):
            cv_clf = deepcopy(clf)
            train_X, train_y = data_X[train_ids], data_y[train_ids]
            valid_X, valid_y = data_X[valid_ids], data_y[valid_ids]
            ch2 = SelectKBest(chi2, k=int(n))
            train_X_chi2 = ch2.fit_transform(train_X, train_y)
            valid_X_chi2 = ch2.transform(valid_X)
            cv_clf.fit(train_X_chi2, train_y)
            # NOTE(review): score() is the classifier's default metric, while evaluate_data uses the
            # weighted F1 score - confirm that the different metric is intended
            scores.append(cv_clf.score(valid_X_chi2, valid_y))
        result = statistics.mean(scores)
        if result > best_score:
            best_n = int(n)
            best_score = result

    # no candidate was evaluated or none scored above 0: keep all features instead of selecting none
    k_final = best_n if best_n > 0 else 'all'
    #select the 'best_n' features with the highest chi2-statistic and transform the datasets accordingly
    ch2 = SelectKBest(chi2, k=k_final).fit(data_X, data_y)
    data_CS = ch2.transform(data_X)
    valid_CS = ch2.transform(valid)
    return [data_CS, data_CS.shape[1], valid_CS]

In [None]:
# feature selection using recursive feature elimination with an underlying default random forest model (thesis section 3.5.2)

def RFeatElim(data_X, data_valid, plot = False):
    """Select features by recursive feature elimination with cross-validation (RFECV).

    Uses the notebook globals ``kf`` (cross-validation splitter) and ``dataset_y``.

    Args:
        data_X: Training feature matrix.
        data_valid: Validation feature matrix.
        plot: Unused; kept for backward compatibility.

    Returns:
        List ``[data_rfe, number of selected features, valid_rfe]``.
    """
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state = 42, class_weight='balanced') # underlying default random forest model
    scorer = make_scorer(f1_score, average="weighted")
    # A float step in (0, 1) is the fraction of features removed per iteration. The former
    # absolute float (n_features * 0.05) was read as a fraction for fewer than 20 features
    # and is rejected by the parameter validation of recent scikit-learn versions.
    rfecv = RFECV(estimator=clf, step=0.05, cv=kf, scoring=scorer).fit(data_X, dataset_y)
    data_rfe = rfecv.transform(data_X)
    valid_rfe = rfecv.transform(data_valid)
    return [data_rfe, rfecv.n_features_, valid_rfe]

In [None]:
# feature selection using Boruta with an underlying default random forest model (thesis section 3.5.3)

def Boruta(data_X, valid):
    """Select features with Boruta on top of a default random forest.

    Uses the notebook global ``dataset_y`` as target.

    Args:
        data_X: Sparse training feature matrix.
        valid: Sparse validation feature matrix.

    Returns:
        List ``[data_b, number of selected features, data_v]`` with dense arrays.
    """
    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state = 42, class_weight='balanced') # underlying default random forest model
    # densify only once and as plain ndarray: .todense() returns np.matrix, which the input
    # validation of recent scikit-learn versions rejects
    dense_X = data_X.toarray()
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42).fit(dense_X, dataset_y)
    data_b = feat_selector.transform(dense_X)
    data_v = feat_selector.transform(valid.toarray())
    return [data_b, data_b.shape[1], data_v]

In [None]:
# feature selection using random forest selection (thesis section 3.5.4)

def RandForS(data_X, valid):
    """Select features via SelectFromModel on a random forest fitted on ``data_X`` and the global ``dataset_y``.

    Returns:
        List ``[data_rfs, number of selected features, valid_rfs]``.
    """
    # underlying random forest model
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, class_weight='balanced')
    forest.fit(data_X, dataset_y)
    selector = SelectFromModel(forest, prefit=True)  # select best features of the already fitted model
    data_rfs = selector.transform(data_X)
    return [data_rfs, data_rfs.shape[1], selector.transform(valid)]

In [None]:
# function to evaluate the transformed datasets using a random forest model
def evaluate_data(data_X, data_valid, plot = False):
    """Train a random forest on ``data_X`` and score it on ``data_valid``.

    Uses the notebook globals ``dataset_y`` and ``validset_y`` as targets.

    Args:
        data_X: Transformed training features.
        data_valid: Transformed validation features.
        plot: If true, print the global name(s) of ``data_X`` and plot the confusion matrix.

    Returns:
        Tuple ``(score, cm)``: the weighted F1 score and the confusion matrix on the validation set.
    """
    clf = RandomForestClassifier(n_estimators = 100, n_jobs=-1, random_state = 42, class_weight='balanced')
    clf.fit(data_X, dataset_y)
    pred = clf.predict(data_valid)
    score = f1_score(validset_y, pred, average = 'weighted', labels=np.unique(validset_y))
    cm = confusion_matrix(validset_y, pred)
    if plot:
        names = namestr(data_X)
        print(names)
        # plot_cm also requires the document count and a title; passing only cm raised a TypeError
        plot_cm(cm, len(dataset_y), ', '.join(names))
    return score, cm

# function to plot the confusion matrix
def plot_cm(cm, index, alg):
    """Plot the confusion matrix ``cm`` as an annotated heatmap.

    Args:
        cm: 2-d confusion matrix (rows: true classes, columns: predicted classes).
        index: Number of documents, shown in the title.
        alg: Name of the evaluated method, shown in the title.
    """
    # derive the axis labels from the matrix itself; sizing them by the classes in the global
    # validset_y fails as soon as the matrix has a different number of classes
    n_rows, n_cols = np.shape(cm)
    df_cm = pd.DataFrame(cm, range(n_rows), range(n_cols))
    df_cm.index.name = 'True'
    df_cm.columns.name = 'Predicted'
    ax = plt.axes()
    sns.set(font_scale=1.4) # for label size
    sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, ax = ax, fmt='g') # font size
    ax.set_title(str(alg)+ ' (' + str(index)+ ' documents)')
    plt.show()

## 3.1.2 Execution of methods <a id="execution"></a>

In [None]:
# containers for the measurements (score, number of features, time, memory) and the
# accumulated 7x7 confusion matrix of every method over the different data sizes
start_mem = []

# feature extraction methods
bow_scores, bow_features, bow_time, bow_mem = [], [], [], []
bow_cm = np.zeros((7, 7))
tfidf1_scores, tfidf1_features, tfidf1_time, tfidf1_mem = [], [], [], []
tfidf1_cm = np.zeros((7, 7))
tfidf2_scores, tfidf2_features, tfidf2_time, tfidf2_mem = [], [], [], []
tfidf2_cm = np.zeros((7, 7))
d2v_scores, d2v_features, d2v_time, d2v_mem = [], [], [], []
d2v_cm = np.zeros((7, 7))

# feature projection methods
pca_scores, pca_features, pca_time, pca_mem = [], [], [], []
pca_cm = np.zeros((7, 7))
LDAp_scores, LDAp_features, LDAp_time, LDAp_mem = [], [], [], []
LDAp_cm = np.zeros((7, 7))

# feature selection methods
CS_scores, CS_features, CS_time, CS_mem = [], [], [], []
CS_cm = np.zeros((7, 7))
RFE_scores, RFE_features, RFE_time, RFE_mem = [], [], [], []
RFE_cm = np.zeros((7, 7))
boruta_scores, boruta_features, boruta_time, boruta_mem = [], [], [], []
boruta_cm = np.zeros((7, 7))
RFS_scores, RFS_features, RFS_time, RFS_mem = [], [], [], []
RFS_cm = np.zeros((7, 7))


# Benchmark loop: for every entry of `indexes` a stratified training/validation sample is drawn and each
# feature engineering method is timed (minutes), memory-profiled (%memit) and evaluated with evaluate_data.
# The measurements are appended to the per-method lists; the confusion matrices are summed up per method.
# number of documents:   [100, 200, 500, 1000, 2500, 5191, 10382, 15574, 20765, 25957] -> [100, 200, 500, 1000, 2500] + [round(int(p * len(data))) for p in [0.2*i for i in range(1,6)]]
# number of iterations:  [20   15   10    8     7     6      5     4      3       2]

indexes = [25957] * 2 # for example, evaluating the entire labeled corpus
for index_docs in tqdm_notebook(indexes, total=len(indexes)):
    print(index_docs, 'Dokumente:')
    dataset, validset = stratified_data(index_docs) # select stratified data and validation set
    dataset_y = labels[list(dataset.index.values)] # select respective data labels
    dataset = dataset.reset_index(drop=True)
    validset_y = labels[list(validset.index.values)] # select respective validation labels
    validset = validset.reset_index(drop=True)
    min_df_c = min_df(dataset, 0.05) # compute minimum absolute frequency that a term must occur to be included -> in 5% of the analysed corpus
    folds_train, folds_valid, kf = folds(dataset, dataset_y, 5) # retrieve and save the cross-validation folds to ensure consistency
    # `%memit -o` returns the measurement object; get_memory parses the first number of its string form
    mst = %memit -o
    mem_st = get_memory(str(mst)) #starting memory consumption
    start_mem.append(mem_st)
    print('Set-Up complete')
        
    ###############################################################################################################
    # Feature Extraction
    ## BOW
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mc = %memit -o bow_data, bow_ft, bow_valid, vect_ca = bow('TEXT_PROCESSED')
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    bow_m = get_memory(str(mc))
    bow_mem.append(bow_m)
    # elapsed wall-clock time in minutes
    bow_t = ((end-start).total_seconds())/60
    bow_time.append(bow_t)
    bow_score, bow_confmat = evaluate_data(bow_data, bow_valid)
    bow_scores.append(bow_score)
    bow_features.append(bow_ft)
    bow_cm += bow_confmat
    
    ## TF-IDF 1
    start = datetime.now(pytz.timezone('Europe/Berlin'))    
    mt1 = %memit -o tfidf1_data, tfidf1_ft, tfidf1_valid, vect_sa = tf_idf_1('TEXT_PROCESSED')
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    tfidf1_m = get_memory(str(mt1))
    tfidf1_mem.append(tfidf1_m)
    tfidf1_t = ((end-start).total_seconds())/60
    tfidf1_time.append(tfidf1_t)
    tfidf1_score, tfidf1_confmat = evaluate_data(tfidf1_data, tfidf1_valid)
    tfidf1_scores.append(tfidf1_score)
    tfidf1_features.append(tfidf1_ft)
    tfidf1_cm += tfidf1_confmat
    
    ### TF-IDF 2
    # the measured time includes the pivot computation, the memory measurement covers tf_idf_2 only
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    pivot_tfidf2 = tf_idf_2_pivot(dataset, 'TEXT_PROCESSED')
    mt2 = %memit -o tfidf2_data, tfidf2_ft, tfidf2_valid, vect_ga = tf_idf_2('TEXT_PROCESSED', pivot_tfidf2, folds_train, folds_valid)
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    tfidf2_m = get_memory(str(mt2))
    tfidf2_mem.append(tfidf2_m)
    tfidf2_t = ((end-start).total_seconds())/60
    tfidf2_time.append(tfidf2_t)
    tfidf2_score, tfidf2_confmat = evaluate_data(tfidf2_data, tfidf2_valid)
    tfidf2_scores.append(tfidf2_score)
    tfidf2_features.append(tfidf2_ft)
    tfidf2_cm += tfidf2_confmat

    ## Doc2Vec
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mda = %memit -o d2v_data, d2v_f, d2v_valid = d2v('TEXT_PROCESSED')
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    d2v_m = get_memory(str(mda))
    d2v_mem.append(d2v_m)
    d2v_t = ((end-start).total_seconds())/60
    d2v_time.append(d2v_t)
    d2v_score, d2v_confmat = evaluate_data(d2v_data, d2v_valid)
    d2v_scores.append(d2v_score)
    d2v_features.append(d2v_f)
    d2v_cm += d2v_confmat
    
    ###############################################################################################################
    # Feature projection (all following methods start from the TF-IDF 1 representation)
    ## PCA
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mp = %memit -o data_pca, pca_feature, valid_pca = PrincipalComponent(tfidf1_data, 0.95, tfidf1_valid)
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    pca_m = get_memory(str(mp))
    pca_mem.append(pca_m)
    pca_t = ((end-start).total_seconds())/60
    pca_time.append(pca_t)
    pca_s, pca_confmat = evaluate_data(data_pca, valid_pca)
    pca_scores.append(pca_s)
    pca_features.append(pca_feature)
    pca_cm += pca_confmat
     
    ## LDA (on PCA): the LDA is fitted on the PCA output, not on the TF-IDF matrix
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mlp = %memit -o data_LDAp, feature_LDAp, valid_LDAp = LinDisAn(data_pca, 0.95, valid_pca)
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    LDAp_m = get_memory(str(mlp))
    LDAp_mem.append(LDAp_m)
    LDAp_t = ((end-start).total_seconds())/60
    LDAp_time.append(LDAp_t)
    LDAp_s, LDAc_confmat = evaluate_data(data_LDAp, valid_LDAp)
    LDAp_scores.append(LDAp_s)
    LDAp_features.append(feature_LDAp)
    LDAp_cm += LDAc_confmat 
    print('Dimensionality Reduction complete')

    ###############################################################################################################
    # Feature selection
    ## Filter methods
    ### Chi-Square Test
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mcs = %memit -o data_CS, feature_CS, valid_CS = ChiSquare(tfidf1_data, dataset_y, folds_train, folds_valid, tfidf1_valid)
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    CS_m = get_memory(str(mcs))
    CS_mem.append(CS_m)
    CS_t = ((end-start).total_seconds())/60
    CS_time.append(CS_t)
    CS_s, CS_confmat = evaluate_data(data_CS, valid_CS)
    CS_scores.append(CS_s)
    CS_features.append(feature_CS)
    CS_cm += CS_confmat
      
    ## Wrapper methods
    ### Recursive Feature Elimination
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mrfe = %memit -o data_RFE, feature_RFE, valid_RFE = RFeatElim(tfidf1_data, tfidf1_valid)
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    RFE_m = get_memory(str(mrfe))
    RFE_mem.append(RFE_m)
    RFE_t = ((end-start).total_seconds())/60
    RFE_time.append(RFE_t)
    RFE_s, RFE_confmat = evaluate_data(data_RFE, valid_RFE)
    RFE_scores.append(RFE_s)
    RFE_features.append(feature_RFE)
    RFE_cm += RFE_confmat
    
    ### Boruta
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mb = %memit -o data_boruta, feature_boruta, valid_boruta = Boruta(tfidf1_data, tfidf1_valid)
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    boruta_m = get_memory(str(mb))
    boruta_mem.append(boruta_m)
    boruta_t = ((end-start).total_seconds())/60
    boruta_time.append(boruta_t)
    boruta_s, boruta_confmat = evaluate_data(data_boruta, valid_boruta)
    boruta_scores.append(boruta_s)
    boruta_features.append(feature_boruta)
    boruta_cm += boruta_confmat
    
    ## Embedded methods
    ### Random Forest Selection
    start = datetime.now(pytz.timezone('Europe/Berlin'))
    mrfs = %memit -o data_RFS, feature_RFS, valid_RFS = RandForS(tfidf1_data, tfidf1_valid)
    end = datetime.now(pytz.timezone('Europe/Berlin'))
    RFS_m = get_memory(str(mrfs))
    RFS_mem.append(RFS_m)
    RFS_t = ((end-start).total_seconds())/60
    RFS_time.append(RFS_t)
    RFS_s, RFS_confmat = evaluate_data(data_RFS, valid_RFS)
    RFS_scores.append(RFS_s)
    RFS_features.append(feature_RFS)
    RFS_cm += RFS_confmat
 
    print('Feature Selection complete')
 
    print("Time:", datetime.now(pytz.timezone('Europe/Berlin')).strftime("%H:%M:%S"))
    print('----------------------------------------------------------------------------------') 

## 3.1.3 Print results of analysis <a id="print"></a>

In [None]:
# print results for feature extraction methods
# (the i-th entries of the following lists belong to the same method)

t_scores = [bow_scores, tfidf1_scores, tfidf2_scores, d2v_scores]
t_features = [bow_features, tfidf1_features, tfidf2_features, d2v_features]
t_times = [bow_time, tfidf1_time, tfidf2_time, d2v_time]
t_memories = [bow_mem, tfidf1_mem, tfidf2_mem, d2v_mem]
confusion_matrixes = [bow_cm, tfidf1_cm, tfidf2_cm, d2v_cm]

print(indexes,'\n')
print('Starting memory:', start_mem,'\n')
for i in range(len(t_scores)):
    # namestr looks up the global variable name of the score list (e.g. 'bow_scores') to label the output
    print(namestr(t_scores[i])[0])
    print('Score:', t_scores[i])
    print('Features:', t_features[i])
    print('Time (min):', t_times[i])
    print('Memories (Mebibyte; MiB)):', t_memories[i])
    print('CMs :', confusion_matrixes[i])
    # the plot title uses the first entry of `indexes` as number of documents
    plot_cm(confusion_matrixes[i], indexes[0], namestr(t_scores[i])[0])
    print()
print('----------------------') 

In [None]:
# print results for the feature projection (PCA, LDA) and feature selection (chi-square, RFE, Boruta, RFS) methods
# (the i-th entries of the following lists belong to the same method)

t_scores_2 = [pca_scores, LDAp_scores, CS_scores, RFE_scores, boruta_scores, RFS_scores]
t_features_2 = [pca_features, LDAp_features, CS_features, RFE_features, boruta_features, RFS_features]
t_times_2 = [pca_time, LDAp_time, CS_time, RFE_time, boruta_time, RFS_time]
t_memories_2 = [pca_mem, LDAp_mem, CS_mem, RFE_mem, boruta_mem, RFS_mem]
confusion_matrixes_2 = [pca_cm, LDAp_cm, CS_cm, RFE_cm, boruta_cm, RFS_cm]
for i in range(len(t_scores_2)):
    # namestr looks up the global variable name of the score list (e.g. 'pca_scores') to label the output
    print(namestr(t_scores_2[i])[0])
    print('Score:', t_scores_2[i])
    print('Features:', t_features_2[i])
    print('Time (min):', t_times_2[i])
    print('Memories (Mebibyte; MiB)):', t_memories_2[i])
    print('CMs :', confusion_matrixes_2[i]) # confusion matrix summed over all iterations
    # the plot title uses the first entry of `indexes` as number of documents
    plot_cm(confusion_matrixes_2[i], indexes[0], namestr(t_scores_2[i])[0])
    print()

# 3.2 Ensemble feature selection technique <a id="ensemble"></a>

## 3.2.1 TF-IDF 1 text extraction <a id="extract"></a>

In [None]:
# Transform text into TF-IDF representation (the vectorizer is fitted on the labeled data only)
labeled_data = data['TEXT_PROCESSED'] #select text
unlabeled_data = data_unlabeled['TEXT_PROCESSED'] #select text
min_df_c = min_df(labeled_data, 0.05) #compute minimum frequency for term to be included
tfidf = TfidfVectorizer(analyzer= 'word', ngram_range = (1, 2), min_df = min_df_c, max_features = None, norm = 'l2', smooth_idf = True, sublinear_tf =True) # initiate uni- and bigram Tf-Idf vectorizer
tfidf_vectorizer_best = tfidf.fit(labeled_data) #fit vectorizer on labeled dataset
labeled_tfidf = tfidf_vectorizer_best.transform(labeled_data) #transform labeled dataset
unlabled_tfidf = tfidf_vectorizer_best.transform(unlabeled_data) #transform unlabeled dataset
# number of features; vocabulary_ exists in every scikit-learn version, whereas get_feature_names() was removed in 1.2
features = len(tfidf_vectorizer_best.vocabulary_)

## 3.2.2 Ensemble Feature Selection based on Boruta & Rankings  <a id="selection"></a>

In [None]:
# Boruta feature selection
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state = 42, class_weight='balanced') # underlying default random forest
# .toarray() yields a plain ndarray; the np.matrix returned by .todense() is rejected by recent scikit-learn versions
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42).fit(labeled_tfidf.toarray(), labels) # Boruta selection

In [None]:
# Chi-Square feature selection
# chi square statistics (and p-values) of all features
ch2, ch_p = chi2(labeled_tfidf, labels)
# negate the statistics so that the most important feature receives rank 1
rnk_cs = rankdata(np.negative(ch2)).astype(int)

In [None]:
# Recursive Feature Elimination (RFE)
# A float step in (0, 1) is the fraction of features removed per iteration. The former absolute
# float (number of features * 0.05) is rejected by the parameter validation of recent scikit-learn versions.
steps = 0.05
kf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42) # ensure stratified cross validation for RFE
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state = 42, class_weight='balanced') # base RFE on random forest model
rfecv = RFECV(estimator=clf, step=steps, cv=kf, scoring=make_scorer(f1_score, average="weighted")).fit(labeled_tfidf, labels) # fit RFE on labeled data
output_rfe = rfecv.ranking_ # rank features according to their importance assessed by RFE

In [None]:
# Random Forest Selection (RFS)
rfs = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, class_weight='balanced')
rfs.fit(labeled_tfidf, labels) # fit the random forest on the labeled data
imp_rfs = rfs.feature_importances_ # feature importances evaluated by the random forest
# negate the importances so that the most important feature receives rank 1
rnk_rfs = rankdata(np.negative(imp_rfs)).astype(int)

### Combination of feature selection methods

In [None]:
# Average the rankings of chi-square, RFE and RFS per feature
# (the Boruta result is not included, because its selected features form the basis of the final set)
all_np = np.vstack((rnk_cs, output_rfe, rnk_rfs)) # one row per ranking, one column per feature
all_np_avg = list(all_np.mean(axis=0)) # average rank of every feature

In [None]:
# build dense labeled and unlabeled dataframes (one integer-named column per feature) for later filtering
n_labeled_features = labeled_tfidf.shape[1]
n_unlabeled_features = unlabled_tfidf.shape[1]
df_data = pd.DataFrame(labeled_tfidf.toarray(), columns=range(n_labeled_features))
df_uldata = pd.DataFrame(unlabled_tfidf.toarray(), columns=range(n_unlabeled_features))

# check for shapes of datasets
for frame in (df_data, df_uldata):
    print(frame.shape)

In [None]:
# select features based on their ranks

bor_data = feat_selector.transform(labeled_tfidf) # labeled dataset containing only the Boruta-selected features
b = feat_selector.support_.copy() # boolean mask of selected features: only the confirmed ones are True
n_boruta = bor_data.shape[1]

# Add the n_boruta best-ranked (lowest average rank) features that Boruta did not select, which doubles
# the feature set. The former loop referenced the undefined name `bor_data_df` (NameError) and rebuilt
# a masked array in every iteration; a stable sort picks the same features (ties -> lowest index first).
avg_ranks = np.asarray(all_np_avg)
candidates = np.flatnonzero(~b) # indices of the features not selected by Boruta
best_candidates = candidates[np.argsort(avg_ranks[candidates], kind='stable')[:n_boruta]]
b[best_candidates] = True
a = np.ma.array(all_np_avg, mask=b) # average ranks with all selected features masked

# print controlling number of features 
print('Desired number of features (dobule boruta features):', bor_data.shape[1]*2)
print('Counter of final array with selected features:', dict(Counter(b)))
print('Number of features if filtered by the selected ones:', len(b[b]))

In [None]:
# keep only the selected feature columns (boolean mask b) and store the results as sparse CSC matrices
labeled_selected = df_data.loc[:, df_data.columns[b]]
unlabeled_selected = df_uldata.loc[:, df_uldata.columns[b]]
labeled_X_final = csc_matrix(labeled_selected)
unlabeled_X_final = csc_matrix(unlabeled_selected)

# check for correct transformations of datasets
for final_matrix in (labeled_X_final, unlabeled_X_final):
    print(final_matrix.shape)

## 3.2.3 Saving transformed datasets as pickles <a id="pickles"></a>

In [None]:
# persist the transformed labeled and unlabeled feature matrices
for final_X, pickle_path in (
    (labeled_X_final, 'Pickles/2_labeled_X_selected.pickle'),
    (unlabeled_X_final, 'Pickles/2_unlabeled_X_selected.pickle'),
):
    save_pickle(final_X, pickle_path)