# In [None]:
from libtlda.scl import StructuralCorrespondenceClassifier
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_auc_score,matthews_corrcoef, precision_recall_curve, fbeta_score, make_scorer,mean_absolute_error,accuracy_score,jaccard_score
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split,StratifiedKFold,RandomizedSearchCV,GridSearchCV,cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize
import numpy as np
from sklearn import svm
import numpy as np
import scipy.stats as st
from scipy.sparse import linalg
from sklearn.linear_model import LogisticRegression
from os.path import basename
from sklearn.feature_selection import mutual_info_classif

class StructuralCorrespondenceClassifier(object):
    """
    Classifier based on structural correspondence learning (SCL).

    The class offers a way to augment source/target features with a
    low-dimensional projection learned from pivot-feature predictors (trained
    with a modified Huber loss), a randomized hyper-parameter search on the
    augmented source data, and a decision-threshold search on held-out target
    data.
    """

    def __init__(self, loss='logistic', l2=1.0, num_pivots=1,
                 num_components=1,MI_selection='freq',freq_threshold=100):
        """
        Configure the classifier and its hyper-parameter grids.

        Parameters
        ----------
        loss : str
            which scikit-learn estimator to use, options: 'logistic'
            (LogisticRegression), 'rf' (RandomForestClassifier), 'svm' (SVC
            with probability estimates) (def: 'logistic')
        l2 : float
            regularization added to the pivot-predictor covariance matrix
            (def: 1.0)
        num_pivots : int
            number of pivot features to use (def: 1). Feature augmentation
            needs at least 2, see `augment_features_exp`.
        num_components : int
            number of components to use after extracting pivot features
            (def: 1)
        MI_selection : str
            pivot selection strategy, options: 'freq' (most frequent
            features), 'mi' (mutual information in the source domain),
            'mimi' (mutual information in source plus target domain)
            (def: 'freq')
        freq_threshold : int
            minimum summed frequency a feature needs in both domains to be a
            pivot candidate for 'mi'/'mimi' selection (def: 100)

        Raises
        ------
        NotImplementedError
            if `loss` is not one of the supported options.
        """
        self.loss = loss
        self.l2 = l2
        self.num_pivots = num_pivots
        self.num_components = num_components
        self.MI_selection = MI_selection
        self.freq_threshold = freq_threshold
        self.source_dataset_dict = {}
        self.target_dataset_dict = {}

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'rf':
            # Random forest model
            self.clf = RandomForestClassifier()
        elif self.loss == 'svm':
            # Support vector machine with probability estimates
            self.clf = SVC(probability=True)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss not implemented yet.')

        # Whether model has been trained
        self.is_trained = False

        # Maintain pivot component matrix
        self.C = 0

        # Dimensionality of training data
        self.train_data_dim = ''

        # Parameter grids of algorithms
        self.svm_grid = {'C': [.01,.1,.4,.6,1,1.5],
              'kernel': ['rbf','poly','linear'],
              'gamma':  [.01, .1, 1],
              'degree': [1, 2, 3, 4, 5],
              'probability': [True],
              'class_weight':['balanced']
             }

        # 'auto' used to be listed next to 'sqrt' for max_features; for a
        # random forest classifier both meant sqrt(n_features) and 'auto' is
        # rejected by recent scikit-learn releases, so only 'sqrt' is kept.
        self.rf_grid={'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
                       'max_features': ['sqrt'],
                       'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
                       'min_samples_split': [2, 5, 10],
                       'min_samples_leaf': [1, 2, 4],
                       'bootstrap': [True, False],
                        'class_weight':['balanced']}

        # Grid for the logistic model, so that the default loss can be tuned
        # by fit_nieuw as well (C values mirror the SVM grid).
        self.lr_grid = {'C': [.01,.1,.4,.6,1,1.5],
                        'class_weight': ['balanced']}

    @property
    def is_trained(self):
        """Whether the classifier has been fitted (readable as an attribute)."""
        return getattr(self, '_is_trained', False)

    @is_trained.setter
    def is_trained(self, value):
        # A plain method of the same name would be shadowed by the attribute
        # assigned in __init__; a property keeps `obj.is_trained` working for
        # both reading and writing.
        self._is_trained = bool(value)

    def augment_features_exp(self, X,y, Z,y_z ,l2=0.0):
        """
        Find a set of pivot features, train predictors and extract bases.

        Parameters
        ----------
        X : DataFrame
            source data (N samples by DX features)
        y : array
            source labels, only used for 'mi'/'mimi' pivot selection
        Z : DataFrame
            target data (M samples by DZ features); columns may differ from X
        y_z : array
            target labels, only used for 'mimi' pivot selection
        l2 : float
            regularization added to the diagonal of the predictor covariance
            matrix (def: 0.0)

        Returns
        -------
        Xa : DataFrame
            source data with the projected components prepended
        Za : DataFrame
            target data with the projected components prepended
        C : array
            projection matrix (merged features by num_components)

        Raises
        ------
        ValueError
            if fewer than 2 pivots are requested, the selection strategy is
            unknown, or not enough pivot features can be selected.
        """
        if self.num_pivots < 2:
            # np.cov over a single pivot predictor has zero degrees of
            # freedom and yields an all-NaN covariance matrix.
            raise ValueError('num_pivots must be at least 2 to estimate the '
                             'covariance of the pivot predictors.')

        # create word frequency dictionaries
        self.source_dataset_dict = self.create_dataset_dict(X)
        self.target_dataset_dict = self.create_dataset_dict(Z)

        # merge the datasets; features missing in one domain become 0.
        # The merged columns are X's columns followed by Z-only columns.
        merged = pd.concat([X, Z]).fillna(0)
        columns_conc = merged.columns
        XZ = merged.to_numpy()

        # number of features in the merged dataset
        DXZ = XZ.shape[1]

        # select pivots on Mutual Information (MI) in source domain
        if self.MI_selection == 'mi':
            ix = self.select_on_MI(X, y, self.num_pivots)

        # select pivots on Mutual Information (MI) in source domain and target domain
        elif self.MI_selection == 'mimi':
            ix = self.select_on_MIMI(X, y, Z, y_z, self.num_pivots)

        # select pivots on frequency
        elif self.MI_selection == 'freq':
            # Sort indices based on frequency of features (assumes BoW encoding)
            ix = np.argsort(np.sum(XZ, axis=0))
            # Keep most frequent features
            ix = ix[::-1][:self.num_pivots]

        else:
            raise ValueError("MI_selection must be one of 'freq', 'mi' or "
                             "'mimi'.")

        ix = np.asarray(ix)
        if len(ix) < self.num_pivots:
            raise ValueError('Fewer pivot features available than '
                             'num_pivots.')

        # Slice out pivot features and relabel them as present(=1)/absent(=0)
        # NOTE(review): the cited Huber loss is defined for labels in
        # {-1, +1}; with 0/1 labels absent pivots only add a constant to the
        # loss - confirm this is intended.
        pivot = (XZ[:, ix] > 0).astype('float')

        # Solve prediction tasks with a Huber loss function
        P = np.zeros((DXZ, self.num_pivots))

        # Loop over pivot features
        for l in range(self.num_pivots):

            # Make pivot predictor with a Huber loss function. The start
            # vector is 1-D (length DXZ, the merged feature count) because
            # recent SciPy releases reject multi-dimensional x0.
            results = minimize(self.Huber_loss, np.random.randn(DXZ),
                               args=(XZ, pivot[:, l]), jac=self.Huber_grad,
                               method='BFGS',
                               options={'gtol': 1e-6, 'disp': True})

            # Store optimal parameters
            P[:, l] = results.x

        # Compute covariance matrix of predictors and add regularization to
        # ensure positive-definiteness
        SP = np.atleast_2d(np.cov(P)) + l2*np.eye(DXZ)

        # Eigenvalue decomposition of pivot predictor matrix
        V, C = np.linalg.eigh(SP)

        # Reduce number of components. eigh sorts eigenvalues in ascending
        # order, so the leading components are the LAST columns.
        C = C[:, ::-1][:, :self.num_components]

        # Dimensionality correction: pick, for every column of each domain,
        # the row of C that belongs to that feature in the merged dataset.
        x_rows = columns_conc.get_indexer(X.columns)
        z_rows = columns_conc.get_indexer(Z.columns)

        # Augment features
        X_values = X.to_numpy()
        Z_values = Z.to_numpy()
        Xa = pd.DataFrame(data=np.concatenate(
            (np.dot(X_values, C[x_rows, :]), X_values), axis=1))
        Za = pd.DataFrame(data=np.concatenate(
            (np.dot(Z_values, C[z_rows, :]), Z_values), axis=1))

        return Xa, Za, C

    def Huber_loss(self, theta, X, y, l2=0.0):
        """
        Huber loss function.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        Parameters
        ----------
        theta : array
            classifier parameters (D features; flattened to 1-D)
        X : array
            data (N samples by D features)
        y : array
            label vector (N samples)
        l2 : float
            l2-regularization parameter (def= 0.0)

        Returns
        -------
        float
            Objective function value.
        """
        theta = np.ravel(theta)

        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Squared hinge where the margin is >= -1, linear part elsewhere
        quadratic_part = np.sum(np.clip(1 - Xyt[ix], 0, None)**2, axis=0)
        linear_part = np.sum(-4*Xyt[~ix], axis=0)
        return quadratic_part + linear_part + l2*np.sum(theta**2, axis=0)

    def Huber_grad(self, theta, X, y, l2=0.0):
        """
        Huber gradient computation.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        Parameters
        ----------
        theta : array
            classifier parameters (D features; flattened to 1-D)
        X : array
            data (N samples by D features)
        y : array
            label vector (N samples)
        l2 : float
            l2-regularization parameter (def= 0.0)

        Returns
        -------
        array
            Gradient with respect to classifier parameters (D features)
        """
        theta = np.ravel(theta)

        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Gradient of the squared-hinge part plus gradient of the linear part
        hinge = np.clip(1 - Xyt[ix], 0, None)
        return (-2*np.dot(Xy[ix, :].T, hinge)
                + np.sum(-4*Xy[~ix, :], axis=0) + 2*l2*theta)

    def fit_nieuw(self, X, y, Z,y_t):
        """
        Fit/train a structural correspondence classifier.

        Parameters
        ----------
        X : DataFrame
            source data (N samples by D features)
        y : array
            source labels (N samples)
        Z : DataFrame
            target data (M samples by D features)
        y_t : array
            target labels (M samples)

        Returns
        -------
        target_augmented : DataFrame
            augmented target data that was not held out for thresholding
        random_search : object
            fitted RandomizedSearchCV object with optimal hyperparameters
        t : float
            decision threshold maximising F2 on the held-out 15% of the
            target data
        y_train : array
            target labels belonging to `target_augmented`
        precision : array
            precision values of the precision-recall curve on the held-out
            target data
        recall : array
            recall values of the precision-recall curve on the held-out
            target data

        Raises
        ------
        NotImplementedError
            if no parameter grid exists for `self.loss`.
        """
        # Data shapes
        N, DX = X.shape

        # Initialize parameter grid based on algorithm
        grids = {'logistic': self.lr_grid,
                 'rf': self.rf_grid,
                 'svm': self.svm_grid}
        if self.loss not in grids:
            raise NotImplementedError('Loss not implemented yet.')
        grid = grids[self.loss]

        # Augment features
        X, target_augmented, self.C = self.augment_features_exp(X,y, Z,y_t, l2=self.l2)

        # Create scoring function to evaluate performance on F2-score
        ftwo_scorer = make_scorer(fbeta_score, beta=2)

        # Initialize random search object with f2 scoring
        random_search = RandomizedSearchCV(estimator=self.clf,
                                   param_distributions=grid,
                                   n_iter=12,
                                   scoring=ftwo_scorer,
                                   cv=3,
                                   verbose=1,
                                   refit=True)

        # split train/test to enable thresholding on target domain
        target_augmented, target_augmented_test, y_train, y_test=self.split_train_test(target_augmented,y_t)
        sample_weight = compute_sample_weight(class_weight='balanced', y=y)

        # Run random search algorithm
        random_search.fit(X,y,sample_weight=sample_weight)

        # Keep the refitted best model so predict/predict_proba use a fitted
        # estimator (the search only fits clones of self.clf).
        self.clf = random_search.best_estimator_

        # Store training data dimensionality
        # NOTE(review): the classifier is trained on augmented SOURCE data;
        # target data with a different number of columns than the source
        # cannot be scored by it - confirm both domains share their columns.
        self.train_data_dim = DX + self.num_components

        # Mark classifier as trained (only after fitting succeeded)
        self.is_trained = True

        # Find optimal decision threshold
        t,precision,recall=self.thresholden(random_search,y_test,target_augmented_test)

        return target_augmented,random_search,t,y_train,precision,recall

    def split_train_test(self,X_train,y_train):
        """
        Split the data in a train set and a stratified 15% test set.

        Returns
        -------
        X_train, X_test, y_train, y_test
        """
        X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.15,
                                                            stratify=y_train)
        return X_train, X_test, y_train, y_test

    def adjusted_classes(self,y_scores, t):
        """
        Turn scores into class predictions: 1 if score >= t, otherwise 0.
        """
        return [1 if y >= t else 0 for y in y_scores]

    def _model_input(self, Z):
        """
        Return Z in the layout the classifier expects.

        Data that already has the training dimensionality is passed through.
        Raw data whose width matches the projection matrix is augmented with
        the projected components. Anything else raises ValueError once the
        classifier is trained.
        """
        # Data shape
        M, D = Z.shape

        if self.train_data_dim == D:
            return Z

        can_project = (np.ndim(self.C) == 2 and self.C.shape[0] == D
                       and D + self.C.shape[1] == self.train_data_dim)
        if self.is_trained and not can_project:
            raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Prepend the projected components to the raw features
        return np.concatenate((np.dot(Z, self.C), Z), axis=1)

    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features), either already augmented
            or raw with as many features as the projection matrix has rows

        Returns
        -------
        preds : array
            label predictions (M samples)

        Raises
        ------
        ValueError
            if the classifier is trained and Z has an unexpected number of
            features.
        """
        # Call scikit's predict function
        preds = self.clf.predict(self._model_input(Z))

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def predict_proba(self, Z):
        """
        Make probability predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features), either already augmented
            or raw with as many features as the projection matrix has rows

        Returns
        -------
        probas : array
            class probabilities as returned by the underlying classifier

        Raises
        ------
        ValueError
            if the classifier is trained and Z has an unexpected number of
            features.
        """
        # Call scikit's predict_proba function
        return self.clf.predict_proba(self._model_input(Z))

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def create_dataset_dict(self,dataset):
        """
        Create a dictionary with the summed frequency of every feature.

        Parameters
        ----------
        dataset : DataFrame
            data set (M samples by D features)

        Returns
        -------
        dict
            column name -> sum of that column
        """
        return {feature: dataset[feature].sum()
                for feature in dataset.columns}

    @staticmethod
    def _best_f2_threshold(p, r, thresholds):
        """
        Return the threshold with the highest F2 score.

        `p` and `r` have one more entry than `thresholds` (the final
        precision/recall point has no threshold), so that entry is ignored.
        Points where precision and recall are both zero score 0.
        """
        thresholds = np.asarray(thresholds)
        p = np.asarray(p, dtype=float)[:len(thresholds)]
        r = np.asarray(r, dtype=float)[:len(thresholds)]
        denominator = 4*p + r
        f2 = np.zeros_like(denominator)
        np.divide(5*p*r, denominator, out=f2, where=denominator > 0)
        return thresholds[int(np.argmax(f2))]

    def thresholden(self,random_search,y_val,X_val):
        """
        Find the optimal decision threshold t for the classifier.

        Parameters
        ----------
        random_search : object
            fitted search object exposing predict_proba
        y_val : array
            validation labels
        X_val : array
            validation data

        Returns
        -------
        t : float
            threshold with the highest F2 score on the validation data
        p : array
            precision values of the precision-recall curve
        r : array
            recall values of the precision-recall curve
        """
        # generate probabilities
        y_probas = random_search.predict_proba(X_val)

        # generate scores based on different thresholds
        p, r, thresholds = precision_recall_curve(y_val, y_probas[:,1])

        # find maximum f2 score generated by all possible decision thresholds
        t = self._best_f2_threshold(p, r, thresholds)
        return t,p,r

    def _pivot_candidates(self, X, num_pivots):
        """
        List the features whose summed frequency exceeds the threshold in
        both the source and the target frequency dictionaries.
        """
        candidates = [feature for feature in X.columns
                      if self.source_dataset_dict[feature] > self.freq_threshold
                      and feature in self.target_dataset_dict
                      and self.target_dataset_dict[feature] > self.freq_threshold]
        if len(candidates) < num_pivots:
            raise ValueError('Fewer pivot candidates above freq_threshold '
                             'than num_pivots.')
        return candidates

    @staticmethod
    def _top_candidate_columns(X, candidates, scores, num_pivots):
        """
        Return the column positions in X of the best-scoring candidates.

        The scores are ranked within the candidate list, so the ranks are
        mapped back to positions in X (which are also the positions in the
        merged source/target dataset, whose columns start with X's columns).
        """
        best = np.argsort(scores)[-num_pivots:]
        return X.columns.get_indexer(candidates)[best]

    def select_on_MI(self,X,y,num_pivots):
        """
        Select the features with the highest mutual information (MI) in the
        source domain among those whose frequency is above the threshold.

        Parameters
        ----------
        X : DataFrame
            source data (N samples by D features)
        y : array
            source labels (N samples)
        num_pivots : int
            number of desired pivot features

        Returns
        -------
        array
            column positions (in X) of the selected pivot features
        """
        # select possible pivot feature candidates on frequency
        features_freq = self._pivot_candidates(X, num_pivots)

        # score the candidates on their mutual information
        MI_scores = mutual_info_classif(X[features_freq], y,
                                        discrete_features=True)

        return self._top_candidate_columns(X, features_freq, MI_scores,
                                           num_pivots)

    def select_on_MIMI(self,X,y,Z,y_t,num_pivots):
        """
        Select the features with the highest combined mutual information
        (MI) in source and target domain among those whose frequency is
        above the threshold.

        Parameters
        ----------
        X : DataFrame
            source data (N samples by D features)
        y : array
            source labels (N samples)
        Z : DataFrame
            target data (M samples by D features)
        y_t : array
            target labels (M samples)
        num_pivots : int
            number of desired pivot features

        Returns
        -------
        array
            column positions (in X) of the selected pivot features
        """
        # select possible pivot feature candidates on frequency
        features_freq = self._pivot_candidates(X, num_pivots)

        # hold out a stratified part of the target data for MI selection
        Z_rest, Z_MI, Z_y_rest, Z_y_MI = self.split_train_test(Z, y_t)

        # MI scores in target domain
        MI_scores_Z = mutual_info_classif(Z_MI[features_freq], Z_y_MI,
                                          discrete_features=True)

        # MI scores in source domain
        MI_scores_X = mutual_info_classif(X[features_freq], y,
                                          discrete_features=True)

        # Combine MI scores
        MI_scores_comb = MI_scores_Z + MI_scores_X

        return self._top_candidate_columns(X, features_freq, MI_scores_comb,
                                           num_pivots)
        
    

def undersample(features_train,labels_train):
    """
    Randomly under-sample the training data with a RandomUnderSampler
    configured with sampling_strategy=1.

    Parameters
    ----------
    features_train : array
        source data (N samples by D features)
    labels_train : array
        source labels (N samples by 1)

    Returns
    -------
    tuple
        the resampled features and the resampled labels
    """
    sampler = RandomUnderSampler(sampling_strategy=1)
    resampled_features, resampled_labels = sampler.fit_resample(
        features_train, labels_train)
    return resampled_features, resampled_labels



def oversample(features_train,labels_train):
    """
    Randomly over-sample the training data with a RandomOverSampler
    configured with sampling_strategy=0.5.

    Parameters
    ----------
    features_train : array
        source data (N samples by D features)
    labels_train : array
        source labels (N samples by 1)

    Returns
    -------
    tuple
        the resampled features and the resampled labels
    """
    sampler = RandomOverSampler(sampling_strategy=0.5)
    resampled_features, resampled_labels = sampler.fit_resample(
        features_train, labels_train)
    return resampled_features, resampled_labels

def create_balanced_sets(dfsource,source_labels,dftarget,target_labels):
    """
    Create balanced datasets for both domains by passing each domain
    through `undersample`.

    Parameters
    ----------
    dfsource : array
        source data (N samples by D features)
    source_labels : array
        source labels (N samples by 1)
    dftarget : array
        target data (M samples by D features)
    target_labels : array
        target labels (M samples by 1)

    Returns
    -------
    tuple
        resampled source data, source labels, target data and target labels
    """
    dfsource, source_labels = undersample(dfsource, source_labels)
    dftarget, target_labels = undersample(dftarget, target_labels)
    return dfsource, source_labels, dftarget, target_labels

def init_lists():
    """
    Initialize the empty scoring lists.

    Returns
    -------
    tuple
        eight independent empty lists
    """
    return tuple([] for _ in range(8))

def update_lists(roc_auc_list,roc_auc,f2_score_before_thresholding_list,f2_score_before_thresholding,fpr_list,fpr,fnr_list,fnr,f2_score_list_after,f2_at,fpr_list_after,fpr_at,fnr_list_after,fnr_at,acc_list,acc):
    """
    Append every individual metric to its scoring list.

    The arguments come in (list, value) pairs; each value is appended in
    place to the list that precedes it.

    Returns
    -------
    tuple
        the eight (updated) lists, in the order they were passed in
    """
    pairs = (
        (roc_auc_list, roc_auc),
        (f2_score_before_thresholding_list, f2_score_before_thresholding),
        (fpr_list, fpr),
        (fnr_list, fnr),
        (f2_score_list_after, f2_at),
        (fpr_list_after, fpr_at),
        (fnr_list_after, fnr_at),
        (acc_list, acc),
    )
    for scores, value in pairs:
        scores.append(value)
    return tuple(scores for scores, _ in pairs)

def calc_and_print_scores(y_test,y_pred):
    """
    Calculate the F2 score, false negative rate and false positive rate
    from the predictions and the ground truth.

    Parameters
    ----------
    y_test : array
        ground-truth labels (N samples by 1)
    y_pred : array
        predictions (N samples by 1)

    Returns
    -------
    f2 : float
        F-beta score with beta=2
    fnr : float
        false negatives divided by (false negatives + true positives)
    fpr : float
        false positives divided by (false positives + true negatives)
    """
    f2 = fbeta_score(y_test, y_pred, beta=2)

    # NOTE(review): unpacking four values assumes a 2x2 confusion matrix,
    # i.e. both classes occur in y_test/y_pred - confirm for small folds.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fnr = fn/(fn+tp)
    fpr = fp/(fp+tn)

    return f2, fnr, fpr

def perform_countvect(X_train,X_test,ngram_range,analyzer,num_features):
    """
    Turn raw text into binary n-gram occurrence features with a
    CountVectorizer fitted on the training texts.

    Parameters
    ----------
    X_train : array
        training texts (N samples); cast to str before vectorizing
    X_test : array
        test texts (M samples); cast to str before vectorizing
    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for
        different word n-grams or char n-grams to be extracted.
    analyzer : string {'word', 'char', 'char_wb'}
        Whether the feature should be made of word n-gram or character
        n-grams.
    num_features : int
        maximum number of features kept by the vectorizer

    Returns
    -------
    features_train : DataFrame
        vectorized training texts, one column per vocabulary term
    features_test : DataFrame
        vectorized test texts with the same columns
    """
    X_train, X_test = X_train.astype(str), X_test.astype(str)

    countvect = CountVectorizer(encoding='utf-8',
                                ngram_range=ngram_range,
                                analyzer=analyzer,
                                stop_words=None,
                                lowercase=False,
                                max_df=1.,
                                min_df=10,
                                max_features=num_features,
                                binary=True)
    train_matrix = countvect.fit_transform(X_train).toarray()

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(countvect, 'get_feature_names_out'):
        vocabulary = countvect.get_feature_names_out()
    else:
        vocabulary = countvect.get_feature_names()

    features_train = pd.DataFrame(train_matrix, columns=vocabulary)
    features_test = pd.DataFrame(countvect.transform(X_test).toarray(),
                                 columns=vocabulary)
    return features_train, features_test

def extract_ngram(text_representation):
    """
    Return the n-gram range tuple that belongs to a text representation.

    Parameters
    ----------
    text_representation : string {'UNIGRAM', 'BIGRAM', 'TRIGRAM'}
        name of the text representation

    Returns
    -------
    tuple
        (min_n, max_n): (1, 1), (1, 2) or (1, 3)

    Raises
    ------
    ValueError
        if the text representation is not one of the supported names.
    """
    ngram_ranges = {
        'UNIGRAM': (1, 1),
        'BIGRAM': (1, 2),
        'TRIGRAM': (1, 3),
    }
    if text_representation not in ngram_ranges:
        raise ValueError("text_representation must be 'UNIGRAM', 'BIGRAM' "
                         "or 'TRIGRAM'.")
    return ngram_ranges[text_representation]

