In [5]:
import sys

import nltk
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as ft
from scipy.sparse import csr_matrix, csc_matrix
from scipy.sparse import issparse
from scipy.stats import entropy
from sklearn.base import clone
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import minmax_scale
from sklearn.svm import SVC
from tqdm.notebook import tqdm

In [2]:
# Put the parent directory first on the import path so the `src` package below resolves.
sys.path.insert(0, '../')
from src.preprocessing.ctfidf import CTFIDFVectorizer
import src.preprocessing.text_preprocessing as tp
# NOTE(review): the alias `filter` shadows Python's builtin `filter` for the rest of the
# notebook; later cells reference `filter.BaseTextFeatureExtractor`, so the name is kept.
import src.preprocessing.feature_extraction.text.filtering as filter
# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
# Registers `progress_apply` on pandas objects (used when normalizing the text column).
tqdm.pandas()

In [3]:
# Load the Enron spam/ham CSV and build the `Text` / `Label` columns used downstream.
N_SAMPLES = 1000  # rows kept so the notebook stays fast to re-run
SEED = 42         # fixes the subsample so "Restart & Run All" reproduces the same rows

df = pd.read_csv('../data/enron/enron_spam_data.csv', sep=',')
# Missing subjects/messages become empty strings so the concatenation below never hits NaN.
df = df.fillna('')
df = df.astype('str')
df = df.sample(n=N_SAMPLES, random_state=SEED)
# Vectorized concatenation: same strings as a row-wise apply, without one Python call per row.
df['Text'] = df['Subject'] + ', ' + df['Message']
# 0 for 'ham'; every other value (including '') is labelled 1.
df['Label'] = np.where(df['Spam/Ham'].values == 'ham', 0, 1)
df['Text'] = df['Text'].progress_apply(tp.normalize_text)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [4]:
# Document-level bag-of-words: one row per e-mail, one column per vocabulary term.
labels = df['Label']
count_vectorizer = ft.CountVectorizer().fit(df['Text'])
vocabulary = count_vectorizer.get_feature_names_out()
counts = count_vectorizer.transform(df['Text'])

In [8]:
class CTFIDFFeatureExtractor(filter.BaseTextFeatureExtractor):
    """
    Calculate feature strength according to ctfidf.

    The strength of a term is its largest class-based TF-IDF score over all
    classes found in the ``Label`` column of the frame passed to ``fit``.
    """
    def __init__(self):
        # Per-feature strength vector; stays None until fit() has been called.
        self.feature_strength_metric = None

    def fit(self, df):
        """
        Compute one strength value per vocabulary term.

        Parameters
        ----------
        df : pandas.DataFrame
            Must provide a ``Text`` column (strings) and a ``Label`` column.

        Returns
        -------
        self
        """
        # One "document" per class: all texts of that class joined together.
        text_per_class = df.groupby(['Label'], as_index=False).agg({'Text': ' '.join})
        count_vectorizer = ft.CountVectorizer().fit(text_per_class['Text'])
        count = count_vectorizer.transform(text_per_class['Text'])

        ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=df.shape[0]).toarray()

        # Rows of `ctfidf` are classes in groupby order. Take the maximum over that
        # axis instead of indexing rows by label value, which only worked when the
        # labels happened to be exactly 0..k-1.
        self.feature_strength_metric = np.asarray(ctfidf).max(axis=0)
        return self

    def transform(self, X):
        """
        Transforms input to corresponding feature importance values

        Every non-zero entry of ``X`` is replaced by the strength of its column;
        zero entries stay zero.

        Parameters
        ----------
        X : scipy sparse matrix or array-like, shape (n_samples, n_features)
            Assumes the columns follow the same term order as the vocabulary seen
            in ``fit`` -- TODO confirm against the caller.

        Returns
        -------
        scipy.sparse.csr_matrix when ``X`` is sparse, otherwise numpy.ndarray,
        holding float strengths.

        Raises
        ------
        ValueError
            If ``fit`` has not been called, or the column count of ``X`` does not
            match the number of fitted features.
        """
        if self.feature_strength_metric is None:
            raise ValueError("CTFIDFFeatureExtractor is not fitted yet; call fit() first.")
        strength = np.atleast_1d(np.asarray(self.feature_strength_metric))

        if not issparse(X):
            X = np.asarray(X)
        if X.shape[1] != strength.shape[0]:
            raise ValueError(
                "X has %d features, but the extractor was fitted with %d."
                % (X.shape[1], strength.shape[0])
            )

        if issparse(X):
            # np.copyto cannot write into a sparse matrix; rebuild the CSR structure
            # with the stored values swapped for the strength of their column.
            nonzero = X.tocsr(copy=True)
            nonzero.eliminate_zeros()  # explicit zeros must stay zero
            return csr_matrix(
                (strength[nonzero.indices], nonzero.indices, nonzero.indptr),
                shape=nonzero.shape,
            )

        # np.where builds a new float array; np.copyto returns None and refuses to
        # write floats into an integer count array.
        return np.where(X != 0, strength, 0.0)



In [9]:
# Fit the c-TF-IDF strength extractor on the sampled frame (reads its 'Text' and 'Label' columns).
extractor = CTFIDFFeatureExtractor()
extractor.fit(df)

In [17]:
# NOTE(review): filter_n_best comes from BaseTextFeatureExtractor (not visible here); presumably
# it keeps the 100 strongest features of `counts` and the matching vocabulary entries -- confirm.
X, voc = extractor.filter_n_best(counts, 100, vocabulary)