In [1]:
import pandas as pd 
import numpy as np 
import sklearn.feature_extraction.text as ft 
from sklearn.preprocessing import minmax_scale
import nltk
import sys 
from tqdm.notebook import tqdm
from scipy.stats import entropy
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.sparse import csr_matrix, csc_matrix
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.base import clone
import sklearn 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

from timeit import default_timer as timer
from datetime import timedelta
import logging


In [2]:
# Put the parent directory first on the import path so the project-local `src` package resolves.
sys.path.insert(0, '../')
from src.preprocessing.ctfidf import CTFIDFVectorizer
import src.preprocessing.text_preprocessing as tp
# NOTE: the alias `filter` shadows Python's builtin `filter` for the rest of the notebook.
import src.preprocessing.feature_extraction.text.filtering as filter
import src.preprocessing.feature_extraction.text.wrapping as wrapping
# Reload edited modules automatically so changes under `src` are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Register `progress_apply` on pandas objects (used when normalizing the text column).
tqdm.pandas()

In [3]:
def record_results(model: sklearn.base.BaseEstimator, X_t, y_t, X_val, y_val, timing: dict) -> dict:
    """
    Evaluates the passed model on both the train and the test set and returns a dict
    with the evaluation results.

    Only the prediction on the test set is timed.
    Arguments:
        model - trained sklearn model to evaluate
        X_t - training features
        y_t - training labels
        X_val - testing features
        y_val - testing labels
        timing - dict with timings; 'model_inference_time' is added to it in place
    Returns:
        results - dict with all evaluation results: the timing dict, sample counts,
                  classification reports and row-normalized confusion matrices for
                  both sets, and the model type and parameters
    """

    predictions_train = model.predict(X_t)

    # Only inference on the held-out data is measured.
    start = timer()
    predictions_test = model.predict(X_val)
    end = timer()
    timing['model_inference_time'] = str(timedelta(seconds=end-start))
    logging.info('Model inference finished.')

    report_train = classification_report(y_t, predictions_train, output_dict=True)
    report_test = classification_report(y_val, predictions_test, output_dict=True)

    # normalize='true' scales every row (true class) to sum to 1.
    cm_train = confusion_matrix(y_t, predictions_train, normalize='true').tolist()
    cm_test = confusion_matrix(y_val, predictions_test, normalize='true').tolist()

    results = {}
    results['timing'] = timing  # same object as the argument, not a copy
    results['training_data_samples'] = X_t.shape[0]
    results['test_data_samples'] = X_val.shape[0]
    results['classification_report_train'] = report_train
    results['classification_report_test'] = report_test
    results['confusion_matrix_train'] = cm_train
    # Misspelled legacy key, kept so existing consumers of earlier result dicts keep working.
    results['confustion_matrix_train'] = cm_train
    results['confusion_matrix_test'] = cm_test
    results['model_type'] = type(model).__name__
    results['model_params'] = model.get_params()

    return results

def test_extractor(model: sklearn.base.BaseEstimator, extractor: filter.BaseTextFeatureExtractor, df: pd.DataFrame, split: float, n_words: int, random_state=None) -> dict:
    """
    Train the passed model on the features selected by the passed extractor and evaluate it.

    Steps: stratified train/test split -> binary bag-of-words -> extractor fit ->
    keep the n_words best features -> TF-IDF weighting -> model fit -> evaluation.
    The vectorizer, the extractor and the TF-IDF weights are fitted on the training part only.

    Arguments:
        model - sklearn model to train; it is fitted in place
        extractor - feature extractor providing fit(X, y) and filter_n_best(X, n, vocabulary)
        df - data frame with a 'Text' and a 'Label' column
        split - test_size passed to train_test_split
        n_words - number of features to keep
        random_state - seed for the train/test split; None (default) gives a different
                       split on every call, pass an int for reproducible results
    Returns:
        results - dict returned by record_results, extended with 'n_words' and
                  'selected_vocabulary'
    """
    timing = {}
    X_train, X_test, y_train, y_test = train_test_split(
        df['Text'],
        df['Label'],
        test_size=split,
        stratify=df['Label'],
        random_state=random_state,
    )

    # binary=True records term presence/absence instead of term counts.
    count_vectorizer = CountVectorizer(binary=True)
    X_train_vectorized = count_vectorizer.fit_transform(X_train)
    X_test_vectorized = count_vectorizer.transform(X_test)

    start = timer()
    extractor.fit(X_train_vectorized, y_train)
    end = timer()
    timing['extractor_fit'] = str(timedelta(seconds=end-start))
    logging.info('Fit extractor.')

    start = timer()
    vocabulary = count_vectorizer.get_feature_names_out()
    X_train_vectorized_filtered, vocabulary_filtered = extractor.filter_n_best(X_train_vectorized, n_words, vocabulary)
    # The filtered vocabulary of the test call is discarded; only the train one is reported.
    X_test_vectorized_filtered, _ = extractor.filter_n_best(X_test_vectorized, n_words, vocabulary)
    end = timer()
    timing['filtered_features'] = str(timedelta(seconds=end-start))

    # Re-weight the selected binary features with TF-IDF (this step is not timed).
    tfidf = TfidfTransformer()
    X_train_vectorized_filtered = tfidf.fit_transform(X_train_vectorized_filtered, y_train)
    X_test_vectorized_filtered = tfidf.transform(X_test_vectorized_filtered)

    start = timer()
    model.fit(X_train_vectorized_filtered, y_train)
    end = timer()
    timing['model_training_time'] = str(timedelta(seconds=end-start))
    logging.info('Model training finished.')

    results = record_results(model=model,
                             X_t=X_train_vectorized_filtered,
                             y_t=y_train,
                             X_val=X_test_vectorized_filtered,
                             y_val=y_test,
                             timing=timing)
    results['n_words'] = n_words
    results['selected_vocabulary'] = vocabulary_filtered.tolist()

    return results

In [4]:
# Read the semicolon-separated corpus, blank out missing cells and force every column to str.
df = (
    pd.read_csv('../data/brown_corpus/brown_corpus.csv', sep=';')
    .fillna('')
    .astype('str')
)

# df = df.loc[~df['Label'].isin(['humor', 'religion', 'reviews', 'science_fiction'])]
# Replace the label strings by their integer category codes.
df['Label'] = df['Label'].astype('category').cat.codes
# Normalize every document with the project's text preprocessing (shows a progress bar).
df['Text'] = df['Text'].progress_apply(tp.normalize_text)

  0%|          | 0/5958 [00:00<?, ?it/s]

In [196]:
# Term-strength feature extractor and the classifier meant for the (disabled) run below.
extractor = filter.TermStrengthFeatureExtractor()
# One-vs-rest wrapper around an RBF-kernel SVM with balanced class weights.
model = OneVsRestClassifier(
    estimator=SVC(kernel='rbf', gamma=0.1, class_weight='balanced'),
)
# results = test_extractor(model, extractor, df, 0.3, n_words=10000)

In [199]:

# Manual run of the feature-selection steps with the SHAP-based extractor.
timing = {}
# Stratify on the label, as test_extractor does, so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Label'],
    test_size=0.3,
    stratify=df['Label'],
)

# binary=True records term presence/absence instead of term counts.
count_vectorizer = CountVectorizer(binary=True)
count_vectorizer.fit(X_train)

X_train_vectorized = count_vectorizer.transform(X_train)
X_test_vectorized = count_vectorizer.transform(X_test)

# The SHAP extractor receives the vocabulary at construction time.
vocabulary = count_vectorizer.get_feature_names_out()
extractor = wrapping.ShapFeatureExtractor(vocabulary=vocabulary)

start = timer()
extractor.fit(X_train_vectorized, y_train)
end = timer()
timing['extractor_fit'] = str(timedelta(seconds=end-start))
logging.info('Fit extractor.')

# Disabled downstream steps (feature filtering, model training, evaluation).
# NOTE(review): `n_words` is not defined in the cells shown here; set it before re-enabling.

# start = timer()
# X_train_vectorized_filtered, vocabulary_filtered = extractor.filter_n_best(X_train_vectorized, n_words)
# X_test_vectorized_filtered, _ = extractor.filter_n_best(X_test_vectorized, n_words)
# end = timer()
# timing['filtered_features'] = str(timedelta(seconds=end-start))

# start = timer()
# model.fit(X_train_vectorized_filtered, y_train)
# end = timer()
# timing['model_training_time'] = str(timedelta(seconds=end-start))
# logging.info('Model training finished.')

# results = record_results(model=model,
#                             X_t=X_train_vectorized_filtered,
#                             y_t=y_train,
#                             X_val=X_test_vectorized_filtered,
#                             y_val=y_test,
#                             timing=timing)
# results['n_words'] = n_words
# results['selected_vocabulary'] = vocabulary_filtered.tolist()



In [201]:
# Show the dimensions of the fitted extractor's `feature_strength_metric` attribute.
# NOTE(review): the second dimension presumably equals the vocabulary size — confirm.
extractor.feature_strength_metric.shape

(1251, 30713)

In [207]:
# Keep a handle on the extractor's SHAP values for the aggregation in the following cells.
# NOTE(review): they are iterated with a loop variable named `cls`, so presumably one array per class — confirm.
shap_values = extractor.shap_values

In [214]:
# One row per entry of `shap_values`: the mean absolute value along axis 0.
t_l = [np.mean(np.abs(cls), axis=0) for cls in shap_values]
# Stack the per-entry rows into a single 2-D array.
arr = np.vstack(t_l)


In [216]:
# Element-wise maximum over the rows of `arr` (reduction along axis 0).
t = np.max(arr, axis=0)

In [219]:
# Smallest aggregated value in `t`; a result of 0.0 means at least one column of
# `arr` is zero in every row (the entries of `arr` are means of absolute values).
np.min(t)

0.0