In [45]:
from sklearn.model_selection import train_test_split

class ProxyAlphaDistance(object):
    """
    Helpers that together produce the loss used for a proxy-alpha distance
    between two domains.

    A linear Support Vector Classifier is trained to tell the two domains
    apart; the average Huber-style loss of its predicted probabilities on a
    held-out split is the quantity exposed by ``loss_per_instance``.

    Typical usage: construct with the two dataframes, call ``fit()``, then
    call ``loss_per_instance()``.
    """

    def __init__(self, df1, df2):
        """
        Sub-sample both domains to the same size and set up the classifier.

        Parameters
        ----------
        df1 : dataframe object
             dataframe (BoW encoding assumed) of the first domain
             (N samples by D features)
        df2 : dataframe object
             dataframe (BoW encoding assumed) of the second domain
             (M samples by E features)

        Returns
        -------
        None
        """
        # Draw min(N, M) rows from each domain so the two classes are balanced.
        b_s = min(len(df1), len(df2))
        self.df1 = df1.sample(n=b_s)
        self.df2 = df2.sample(n=b_s)
        self.y_pred = []
        self.y_probas = []
        # probability=True is required for predict_proba() in fit().
        self.estimator = svm.SVC(C=3000, probability=True, verbose=2, kernel='linear')
        self.df_merged = pd.DataFrame()

    def merge_and_split(self):
        """
        Merge the two domains into one dataset and split it into mixed
        train and test sets.

        A 'Source domain' column (0 for the first domain, 1 for the second)
        is added to both sampled dataframes and used as the label. Because
        the frames are concatenated with ``join='inner'``, only the columns
        present in both domains are kept as features.

        Returns
        -------
        X_train : dataframe
                features of the training set
        X_test : dataframe
                features of the test set (15% of the merged rows)
        y_train : series
                training labels
        y_test : series
                test labels
        """
        # Add the 'Source domain' column to the datasets as their label.
        self.df1['Source domain'] = 0
        self.df2['Source domain'] = 1

        # Merge the datasets, keeping only the shared columns.
        self.df_merged = (
            pd.concat([self.df1, self.df2], join='inner', sort=False)
            .fillna(0)
            .reset_index(drop=True)
        )

        # Split into shuffled train and test data.
        features = self.df_merged.drop('Source domain', axis=1)
        labels = self.df_merged['Source domain']
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.15, shuffle=True
        )

        return X_train, X_test, y_train, y_test

    def huber_loss(self, p, y):
        """
        Compute the Huber-style loss between a prediction and its label.

        With ``z = p * y`` the loss is ``0`` for ``z >= 1``,
        ``(1 - z) ** 2`` for ``-1 <= z < 1`` and ``-4 * z`` otherwise.

        Parameters
        ----------
        p : float
            prediction score (``loss_per_instance`` passes a probability)
        y : int
            ground-truth label, expected to be -1 or 1

        Returns
        -------
        huber loss : float
        """
        # NOTE(review): when p is a probability in [0, 1], z stays in [-1, 1],
        # so the linear branch below is never reached - confirm this is intended.
        z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return (1 - z) * (1 - z)
        else:
            return -4.0 * z

    def fit(self):
        """
        Split the data into train and test sets and fit the SVC estimator.

        Afterwards the predicted labels and class probabilities for the test
        set are stored in ``self.y_pred`` and ``self.y_probas``.

        Returns
        -------
        None
        """
        self.X_train, self.X_test, self.y_train, self.y_test = self.merge_and_split()
        self.estimator.fit(self.X_train, self.y_train)
        # Predict with the estimator that was just fitted.
        self.y_probas = self.estimator.predict_proba(self.X_test)
        self.y_pred = self.estimator.predict(self.X_test)

    def loss_per_instance(self):
        """
        Compute the average loss over all test-set predictions.

        Must be called after ``fit()``, which populates ``self.y_test`` and
        ``self.y_probas``.

        Returns
        -------
        Huber loss per instance : float

        Raises
        ------
        ValueError
            if the test set is empty, so no average can be computed
        """
        n_instances = len(self.y_test)
        if n_instances == 0:
            raise ValueError("cannot average the loss over an empty test set")

        # Column 1 holds the predicted probability of label 1 (second domain).
        positive_probas = self.y_probas[:, 1]

        # Labels are mapped to -1 / 1 for compatibility with huber_loss.
        total_loss = sum(
            self.huber_loss(proba, 1 if label != 0 else -1)
            for proba, label in zip(positive_probas, self.y_test)
        )

        return total_loss / n_instances
       