In [3]:
import importlib
import random
from sklearn.metrics import precision_recall_fscore_support
from authorship_attribution import assert_opt_in
from model.pair_impostors import PairImpostors, minmax, optimize_sigma, cosine, _impostors_task_sav
from utils.result_manager import SAVResult, check_if_already_performed, count_results
from feature_extraction.author_vectorizer1 import FeatureExtractor
from model.pair_classification import PairAAClassifier, DistanceSAVClassifier, PairSAVClassifier
from utils.evaluation import *
import os
from utils.common import get_verification_coordinates, get_parallel_slices, random_sample, prepare_learner

import argparse
from tqdm import tqdm
import pickle
from sklearn.base import clone
from joblib import Parallel, delayed
import itertools
import pathlib
from tqdm import tqdm

In [4]:
def process_data(train, test, rawfreq=False):
    """Turn two collections of text-pair records into feature matrices.

    Each record is read through the keys ``'text1'``, ``'text2'`` and
    ``'score'``. The two texts of a record are joined with a single space to
    form one document, and the score becomes that document's label.

    Args:
        train: Iterable of records used to fit the ``FeatureExtractor``.
        test: Iterable of records that are only transformed.
        rawfreq: Forwarded to ``FeatureExtractor`` as ``use_raw_frequencies``.

    Returns:
        A 4-tuple ``(Xtr, ytr, Xte, yte)``: ``Xtr`` / ``Xte`` are whatever
        ``fit_transform`` / ``transform`` return, and ``ytr`` / ``yte`` are
        plain lists of the ``'score'`` values in iteration order.
    """

    def collect(rows, desc):
        # One pass over ``rows`` (wrapped in a tqdm bar) so iterables that can
        # only be consumed once still work.
        documents, labels = [], []
        for record in tqdm(rows, desc=desc):
            documents.append("{} {}".format(record['text1'], record['text2']))
            labels.append(record['score'])
        return documents, labels

    # Training split
    print("Processing training data...")
    train_docs, train_labels = collect(train, "Training data")

    # Testing split
    print("Processing testing data...")
    test_docs, test_labels = collect(test, "Testing data")

    # Every feature family is switched on; only the frequency mode is tunable.
    print(f'using raw_freq = {rawfreq}')
    feature_switches = dict.fromkeys(
        (
            'function_words',
            'word_lengths',
            'sentence_lengths',
            'punctuation',
            'post_ngrams',
            'word_ngrams',
            'char_ngrams',
        ),
        True,
    )
    extractor = FeatureExtractor(
        'english',
        cleaning=False,
        use_raw_frequencies=rawfreq,
        **feature_switches,
    )

    # Fit on the training documents only; the test documents are transformed
    # with the already-fitted extractor and no labels.
    print("Fitting vectorizer on training data...")
    train_features = extractor.fit_transform(train_docs, train_labels)

    print("Transforming testing data...")
    test_features = extractor.transform(test_docs, None)

    return train_features, train_labels, test_features, test_labels


In [5]:
# Load the pickled training split.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
with pathlib.Path('/workspace/train.pkl').open('rb') as f:
    train_df = pickle.load(f)
# The split bound to `test_df` is read from the validation pickle (val.pkl).
with pathlib.Path('/workspace/val.pkl').open('rb') as f:
    test_df = pickle.load(f)

In [11]:
# Bind the loaded splits to the short names used by the processing cell.
train, test = train_df, test_df

In [None]:
# Vectorise both splits; rawfreq=False is passed through to the extractor's
# use_raw_frequencies switch.
Xtr, ytr, Xte, yte = process_data(train=train, test=test, rawfreq=False)


Processing training data...


Training data: 100%|██████████| 325288/325288 [00:21<00:00, 15484.53it/s]


Processing testing data...


Testing data: 100%|██████████| 30781/30781 [00:01<00:00, 18872.43it/s]


using raw_freq = False
Fitting vectorizer on training data...


POST job 0: 100%|██████████| 1275/1275 [00:38<00:00, 33.36it/s]]
POST job 11: 100%|██████████| 1275/1275 [00:39<00:00, 31.91it/s]
POST job 22: 100%|██████████| 1275/1275 [00:41<00:00, 30.76it/s]
POST job 20: 100%|██████████| 1275/1275 [00:41<00:00, 30.60it/s]
POST job 18: 100%|██████████| 1275/1275 [00:41<00:00, 30.40it/s]
POST job 13: 100%|██████████| 1275/1275 [00:42<00:00, 30.26it/s]
POST job 16: 100%|██████████| 1275/1275 [00:42<00:00, 30.32it/s]
POST job 15: 100%|██████████| 1275/1275 [00:41<00:00, 30.71it/s]
POST job 24: 100%|██████████| 1275/1275 [00:41<00:00, 30.37it/s]
POST job 3: 100%|██████████| 1275/1275 [00:42<00:00, 30.25it/s]
POST job 8: 100%|██████████| 1275/1275 [00:41<00:00, 30.69it/s]]
POST job 2: 100%|██████████| 1275/1275 [00:42<00:00, 30.17it/s]]
POST job 26: 100%|██████████| 1275/1275 [00:42<00:00, 30.15it/s]
POST job 6: 100%|██████████| 1275/1275 [00:42<00:00, 29.87it/s]]
POST job 1: 100%|██████████| 1275/1275 [00:43<00:00, 29.62it/s]]
POST job 23: 100%|████████