In [1]:
# !nvidia-smi

In [1]:
import sys
sys.path.append("../") # go to parent dir

In [2]:
import logging
import pandas as pd
import pickle
import os
import torch
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

from typing import List, Tuple, Dict

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BartForConditionalGeneration, BartTokenizer

In [3]:
%load_ext autoreload
%autoreload 2
from utils import *

In [4]:
GPU = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = GPU

if torch.cuda.is_available():    
    print(torch.cuda.current_device())
    print(torch.cuda.device)
    torch.cuda.device = GPU
    print(torch.cuda.device)
    device = 'cuda'
else:
    device = 'cpu'
print(f'device: {device}')

0
<class 'torch.cuda.device'>
1
device: cuda


In [6]:
model_path = "../resources/models/muss_en_mined_hf"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path, output_attentions=True)
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

model = model.eval()
model = model.to(device)

In [7]:
sentences = read_lines('../resources/data/examples.en')

def construct_dataset(sentences: List[str], ctrl_token: str = 'len_ratio', step_size: float = 0.25):

    inputs = []
    labels = []
    for sent in sentences:
        # possible combinations = num_trials^4
        
        for a in np.arange(0.25, 1.0, step_size):
            a = round(a, 2) # target values = 0.0, 0.05, 0.1, 0.15, ...

            if ctrl_token == 'len_ratio':
                params = {
                    'len_ratio': a,
                    'lev_sim': 1.0,
                    'word_rank': 1.0,
                    'tree_depth': 1.0,
                }
            elif ctrl_token == 'lev_sim':
                params = {
                    'len_ratio': 1.0,
                    'lev_sim': a,
                    'word_rank': 1.0,
                    'tree_depth': 1.0,
                }
            elif ctrl_token == 'word_rank':
                params = {
                    'len_ratio': 1.0,
                    'lev_sim': 1.0,
                    'word_rank': a,
                    'tree_depth': 1.0,
                }
            elif ctrl_token == 'tree_depth':
                params = {
                    'len_ratio': 1.0,
                    'lev_sim': 1.0,
                    'word_rank': 1.0,
                    'tree_depth': a,
                }
            
            inputs.append(construct_input_for_access(sent, params))
            labels.append(a)      

    assert len(inputs) == len(labels)
    
    print(f'Constructed {len(inputs)} inputs')
    
    return inputs, labels

        
def get_hidden_states(sentences, model, tokenizer, batch_size: int = 12):
    
    for indx in range(0, len(sentences), batch_size):
        batch_sentences = sentences[indx:min(indx + batch_size, len(sentences))]
        print(len(batch_sentences))
        batch = tokenizer(batch_sentences, padding='max_length', return_tensors="pt").to(model.device)
        encoder_outputs = model.get_encoder()(batch['input_ids'], return_dict=True, output_hidden_states=True, output_attentions=True)
        yield encoder_outputs.last_hidden_state

def average_states(batch_states: torch.Tensor) -> torch.Tensor:
    '''
    creates a feature matrix with one row per sentence by averaging the tokens per sentence.
    states: batch_size * max_seq_len (1024) * hidden_dim (1024)

    Returns:
        X           the averaged feature matrix
    '''
    print(batch_states.shape)
    print(batch_states[0].shape)
    
    X = batch_states.mean(1)
    print(X.shape)

    return X.detach().cpu().numpy()


# def save_to_file(encoded)


In [8]:
raw_inputs, raw_labels = construct_dataset(sentences)
# print(raw_inputs, raw_labels)

Constructed 18 inputs


In [23]:
x_train = []
for hidden_states in get_hidden_states(raw_inputs, model, tokenizer, 1):
    x = average_states(hidden_states)
    x_train.append(x)

x_train = np.stack(x_train)
#     print(type(x))

1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])
1
torch.Size([1, 1024, 1024])
torch.Size([1024, 1024])
torch.Size([1, 1024])

array([[[ 0.18944564, -0.11129951, -0.10309957, ..., -0.04129167,
          0.080787  , -0.17564222]],

       [[ 0.18934718, -0.11219722, -0.1028005 , ..., -0.0404486 ,
          0.0813223 , -0.17640522]],

       [[ 0.18907292, -0.11207882, -0.10220982, ..., -0.04000543,
          0.08089276, -0.17623329]],

       ...,

       [[ 0.1883909 , -0.11119529, -0.09323843, ..., -0.05045968,
          0.1017652 , -0.16385254]],

       [[ 0.1885542 , -0.11209796, -0.09294678, ..., -0.04987385,
          0.1023041 , -0.16487736]],

       [[ 0.18803838, -0.11190365, -0.0923159 , ..., -0.04913919,
          0.10175003, -0.16446602]]], dtype=float32)

In [14]:
labeller = LabelEncoder()
y = labeller.fit_transform(raw_labels)

#view transformed values
print(y)

[0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2]


In [16]:
from sklearn.utils import shuffle
x_train, y = shuffle(x_train, y, random_state=42)

In [22]:
x_train

[array([[ 0.18944564, -0.11129951, -0.10309957, ..., -0.04129167,
          0.080787  , -0.17564222]], dtype=float32),
 array([[ 0.18934718, -0.11219722, -0.1028005 , ..., -0.0404486 ,
          0.0813223 , -0.17640522]], dtype=float32),
 array([[ 0.19518192, -0.11354618, -0.09207138, ..., -0.04744828,
          0.08278838, -0.17461583]], dtype=float32),
 array([[ 0.19659495, -0.11638469, -0.10364174, ..., -0.04273542,
          0.09063621, -0.17947604]], dtype=float32),
 array([[ 0.19707559, -0.11565908, -0.1046043 , ..., -0.0439654 ,
          0.09055544, -0.17893913]], dtype=float32),
 array([[ 0.20172504, -0.11770041, -0.09758388, ..., -0.05647797,
          0.09202515, -0.17125711]], dtype=float32),
 array([[ 0.1885542 , -0.11209796, -0.09294678, ..., -0.04987385,
          0.1023041 , -0.16487736]], dtype=float32),
 array([[ 0.1883909 , -0.11119529, -0.09323843, ..., -0.05045968,
          0.1017652 , -0.16385254]], dtype=float32),
 array([[ 0.18799928, -0.11443926, -0.10565773, 

In [21]:
def train_linear_model(
    X_train: np.ndarray,
    y_train: np.ndarray,
    probe_token: str,
    out_dir: str = None,
    ):
    '''
    Fits classifiers for a probing task experiment.
    Args:
        X_train     training features
        y_train     training labels
        probe_token     name of the ctrl token used for naming files
        out_dir     output directory
    '''
    linear_clf_params = {'C': 0.0001,
                         'max_iter': 100,
#                          'solver': 'liblinear',
                         'solver': 'lbfgs',
#                          multi_class='multinomial', solver=''
                         'tol': 1e-4,
                         'verbose':100,
                         'multi_class': 'multinomial',
                         }
    
    clf = LogisticRegression(**linear_clf_params)

    clf_name = str(clf).split('(')[0]
    print(f'fitting {clf_name} classifier on {probe_token} token...')
    
    clf.fit(X_train, y_train)
    if out_dir is not None:
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        out_path = Path(out_dir) / f'clf_{clf_name}_{probe_token}.pkl'
        with open(str(out_path), 'wb') as outfile:
            pickle.dump(clf, outfile)

    return clf

clf = train_linear_model(x_train, y, 'len_ratio')


fitting LogisticRegression classifier on len_ratio token...


ValueError: Found array with dim 3. LogisticRegression expected <= 2.

In [None]:
# def train_mlp_model(
#     X_train: np.ndarray,
#     y_train: np.ndarray,
#     probe_token: str,
#     out_dir: str
#     ):
    
#     mlp_clf_params = {'activation': 'relu',
#                   'alpha': 0.0001,
#                   'beta_1': 0.9,
#                   'epsilon': 10e-8,
#                   'hidden_layer_sizes': (100,),
#                   'learning_rate_init': 0.0001,
#                   'solver': 'adam',
#                   'early_stopping':True,
#                   'n_iter_no_change':10,
#                   'validation_fraction':0.02,
#                   'verbose':True
#                   }

#     mlp_clf = MLPClassifier(**mlp_clf_params)


In [70]:
X = X.detach().numpy()

In [54]:
# b = torch.tensor([[1.0, 4.0], [2.0, 4.0]])
# print(b.shape)
# print(b.mean(0))
# print(b)
# torch.reshape(b, (-1,))

torch.Size([2, 1, 3])
tensor([[1.5000, 4.0000, 5.0000]])
tensor([[[1., 4., 5.]],

        [[2., 4., 5.]]])


tensor([1., 4., 5., 2., 4., 5.])

In [None]:
def create_dataset(source_matrix: torch.Tensor, bt_matrix: torch.Tensor) -> Tuple[pd.DataFrame, pd.Series]:
    '''
    creates an X matrix and a y vector for sklearn training from two tensors.
    source tensor receives label 0, bt tensor receives label 1.
    Args:
        source_matrix   a torch.Tensor, the feature matrix for genuine text
        bt_matrix       a torch.Tensor, the feature matrix for bt
    '''
    # creating DataFrames from the tensors and adding labels
    source_df = pd.DataFrame(source_matrix.cpu().numpy())
    source_df['label'] = [0] * len(source_df)
    bt_df = pd.DataFrame(bt_matrix.cpu().numpy())
    bt_df['label'] = [1] * len(bt_df)
    
    # combining and shuffling data
    combined = source_df.append(bt_df, ignore_index=True)
    combined = combined.sample(frac=1)

    X = combined.drop(columns=['label'])
    y = combined['label'].copy()

    return X, y

# def train_models(X_train: pd.DataFrame,
#                  y_train: pd.DataFrame,
#                  bt_name: str,
#                  experiment: str,
#                  out_dir: str,
#                  logger: logging.Logger):
#     '''
#     Fits classifiers for a probing task experiment.
#     Args:
#         X_train     training features
#         y_train     training labels
#         bt_name     name of the bt dataset used for naming files
#         experiment  the name of the experiment
#         out_dir     output directory
#         logger      a logging.Logger instance
#     '''
#     linear_clf_params = {'C': 0.0001,
#                          'max_iter': 100,
#                          'solver': 'liblinear',
#                          'tol': 1e-4,
#                          'verbose':100
#                          }
#     mlp_clf_params = {'activation': 'relu',
#                       'alpha': 0.0001,
#                       'beta_1': 0.9,
#                       'epsilon': 10e-8,
#                       'hidden_layer_sizes': (100,),
#                       'learning_rate_init': 0.0001,
#                       'solver': 'adam',
#                       'early_stopping':True,
#                       'n_iter_no_change':10,
#                       'validation_fraction':0.02,
#                       'verbose':True
#                       }
#     linear_clf = LogisticRegression(**linear_clf_params)
#     mlp_clf = MLPClassifier(**mlp_clf_params)

#     for clf in [linear_clf, mlp_clf]:
#         clf_name = str(clf).split('(')[0]
#         logger.info(f'fitting {clf_name} classifier on {bt_name} with {experiment}...')
#         clf.fit(X_train, y_train)
#         with open(os.path.join(out_dir, f'clf_{experiment}_{bt_name}_{clf_name}_fitted.pkl'), 'wb') as outfile:
#             pickle.dump(clf, outfile)

In [None]:
def main(args: argparse.Namespace):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
    )
    logger = logging.getLogger('train_models.py')

    out_dir = os.path.abspath(args.out_dir)

    max_len = get_max_seq_len([args.genuine, args.bt], logger)

    # training for padding experiment
    padded_genuine_train = pad_states(os.path.join(args.genuine), max_len, logger)
    padded_bt_train = pad_states(os.path.join(args.bt), max_len, logger)

    padded_X_train, padded_y_train = create_dataset(padded_genuine_train, padded_bt_train)

    train_models(padded_X_train,
                 padded_y_train,
                 args.bt_name,
                 'padding',
                 out_dir,
                 logger)

    # training for averaging experiment
    averaged_genuine_train = average_states(os.path.join(args.genuine), logger)
    averaged_bt_train = average_states(os.path.join(args.bt), logger)

    averaged_X_train, averaged_y_train = create_dataset(averaged_genuine_train, averaged_bt_train)

    train_models(averaged_X_train,
                 averaged_y_train,
                 args.bt_name,
                 'averaging',
                 out_dir,
                 logger)


