In [7]:
import importlib
import random
from sklearn.metrics import precision_recall_fscore_support
from authorship_attribution import assert_opt_in
from model.pair_impostors import PairImpostors, minmax, optimize_sigma, cosine, _impostors_task_sav
from utils.result_manager import SAVResult, check_if_already_performed, count_results
from feature_extraction.author_vectorizer import FeatureExtractor
from model.pair_classification import PairAAClassifier, DistanceSAVClassifier, PairSAVClassifier
from utils.evaluation import *
import os
from utils.common import get_verification_coordinates, get_parallel_slices, random_sample, prepare_learner

import argparse
from tqdm import tqdm
import pickle
from sklearn.base import clone
from joblib import Parallel, delayed
import itertools
import pathlib

In [17]:
from tqdm import tqdm

In [20]:
def process_data(train, test, rawfreq=False):
    """Turn text-pair rows into vectorized inputs plus their score labels.

    Every row of ``train`` and ``test`` is indexed with the keys ``'text1'``,
    ``'text2'`` and ``'score'``. The two texts are joined with a single space
    into one document and the score is kept as that document's label.

    Args:
        train: Iterable of rows; the feature extractor is fitted on these.
        test: Iterable of rows; these are only transformed.
        rawfreq: Forwarded to ``FeatureExtractor`` as ``use_raw_frequencies``.

    Returns:
        ``(Xtr, ytr, Xte, yte)``: ``Xtr``/``Xte`` are whatever
        ``FeatureExtractor.fit_transform``/``transform`` return, and
        ``ytr``/``yte`` are plain lists of the rows' ``'score'`` values.
    """

    def _texts_and_scores(rows, desc):
        # One pass over the rows, so the progress bar is driven exactly once.
        joined = [
            (f"{row['text1']} {row['text2']}", row['score'])
            for row in tqdm(rows, desc=desc)
        ]
        return [text for text, _ in joined], [score for _, score in joined]

    print("Processing training data...")
    train_texts, train_scores = _texts_and_scores(train, "Training data")

    print("Processing testing data...")
    test_texts, test_scores = _texts_and_scores(test, "Testing data")

    # Feature extraction: every feature group offered below is switched on.
    print(f'using raw_freq = {rawfreq}')
    feature_groups = (
        'function_words',
        'word_lengths',
        'sentence_lengths',
        'punctuation',
        'post_ngrams',
        'word_ngrams',
        'char_ngrams',
    )
    extractor = FeatureExtractor(
        'english',
        cleaning=False,
        use_raw_frequencies=rawfreq,
        **dict.fromkeys(feature_groups, True),
    )

    # Fit on the training documents only; the test documents are transformed
    # with the already-fitted extractor (no labels are passed for them).
    train_features = extractor.fit_transform(train_texts, train_scores)
    test_features = extractor.transform(test_texts, None)

    return train_features, train_scores, test_features, test_scores


In [8]:
# Earlier list-comprehension version of process_data, left commented out for reference.
# def process_data(train, test, rawfreq=False):
#     Xtr = [f"{row['text1']} {row['text2']}" for row in train]
#     ytr = [row['score'] for row in train]
#     Xte = [f"{row['text1']} {row['text2']}" for row in test]
#     yte = [row['score'] for row in test]
    
#     # Feature extraction
#     print(f'using raw_freq = {rawfreq}')
#     vectorizer = FeatureExtractor('english', cleaning=False, use_raw_frequencies=rawfreq,
#                                   function_words=True,
#                                   word_lengths=True,
#                                   sentence_lengths=True,
#                                   punctuation=True,
#                                   post_ngrams=True,
#                                   word_ngrams=True,
#                                   char_ngrams=True)
#     Xtr = vectorizer.fit_transform(Xtr, ytr)
#     Xte = vectorizer.transform(Xte, None)
    
#     return Xtr, ytr, Xte, yte

In [9]:
# Load the pickled training set from a fixed absolute path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
# NOTE(review): the later process_data call reads `train`, not `train_df`;
# no cell visible here derives `train` from this object — confirm where it is defined.
with open('/workspace/train.pkl', 'rb') as f:
    train_df = pickle.load(f)

In [32]:
# Use the 'train' entry of `test_df` as the test set.
# NOTE(review): `test_df` is not defined in any cell visible here (only
# `train_df` is loaded), so this raises NameError on a fresh kernel unless it
# is defined elsewhere — confirm. Taking the 'train' key for test data also
# looks suspicious; verify it is intended.
test = test_df['train']

In [None]:
# Process the data: vectorize the train/test text pairs and collect their scores.
# rawfreq=False is forwarded to FeatureExtractor's use_raw_frequencies flag.
# NOTE(review): `train` is not assigned in any cell visible here — confirm it
# exists before this cell runs on a fresh kernel.
Xtr, ytr, Xte, yte = process_data(train, test, rawfreq=False)


Processing training data...


Training data: 100%|██████████| 325288/325288 [00:20<00:00, 15844.05it/s]


Processing testing data...


Testing data: 100%|██████████| 30781/30781 [00:01<00:00, 18616.64it/s]

using raw_freq = False



POST job 0: 100%|██████████| 1275/1275 [00:37<00:00, 34.13it/s]]
POST job 15: 100%|██████████| 1275/1275 [00:42<00:00, 30.23it/s]
POST job 2: 100%|██████████| 1275/1275 [00:43<00:00, 29.57it/s]]
POST job 25: 100%|██████████| 1275/1275 [00:42<00:00, 29.70it/s]
POST job 3: 100%|██████████| 1275/1275 [00:43<00:00, 29.44it/s]]
POST job 11: 100%|██████████| 1275/1275 [00:43<00:00, 29.12it/s]
POST job 7: 100%|██████████| 1275/1275 [00:44<00:00, 28.69it/s]]
POST job 8: 100%|██████████| 1275/1275 [00:44<00:00, 28.83it/s]]
POST job 6: 100%|██████████| 1275/1275 [00:45<00:00, 28.12it/s]]
POST job 23: 100%|██████████| 1275/1275 [00:45<00:00, 28.28it/s]
POST job 12: 100%|██████████| 1275/1275 [00:44<00:00, 28.35it/s]
POST job 10: 100%|██████████| 1275/1275 [00:45<00:00, 28.14it/s]
POST job 4: 100%|██████████| 1275/1275 [00:45<00:00, 27.79it/s]]
POST job 16: 100%|██████████| 1275/1275 [00:45<00:00, 28.04it/s]
POST job 1: 100%|██████████| 1275/1275 [00:45<00:00, 28.00it/s]]
POST job 17: 100%|██████

In [None]:
# Candidate estimators, keyed by the short name used to pick one below.
# NOTE(review): LogisticRegression, SVC and SGDClassifier are not imported by
# name in the visible import cell — presumably they arrive through
# `from utils.evaluation import *`; confirm.
learners = dict(
    LR=LogisticRegression(max_iter=1000),
    SVM=SVC(probability=True),
    SGD=SGDClassifier(max_iter=1000),
)

# Selected estimator: one of 'LR', 'SVM' or 'SGD'.
learner_name = 'LR'
base_learner = learners[learner_name]

# Fit an unfitted copy so the template kept in `learners` stays untouched.
print(f'Training {learner_name} classifier...')
base_cls = clone(base_learner)
base_cls.fit(Xtr, ytr)

In [None]:
# Verifying the performance
def verification(cls, Xte, yte):
    """Score a fitted classifier on held-out data and print a one-line summary.

    Args:
        cls: Fitted estimator exposing ``predict(Xte)``.
        Xte: Test inputs, handed to ``cls.predict`` unchanged.
        yte: Sized collection of true labels, one per prediction.

    Returns:
        ``(acc, p, r, f1)``: accuracy as a percentage, followed by the
        precision, recall and F1 reported by
        ``precision_recall_fscore_support`` with ``average='binary'`` and
        ``pos_label=1``.

    Raises:
        ValueError: If ``yte`` is empty. Accuracy is undefined in that case;
            previously this surfaced as a bare ZeroDivisionError.
    """
    total = len(yte)
    if total == 0:
        # Fail early with an actionable message instead of dividing by zero below.
        raise ValueError('verification() needs at least one test label; got an empty yte')

    pred = cls.predict(Xte)
    # Generator instead of a temporary list: same count, no extra allocation.
    correct = sum(guess == truth for guess, truth in zip(pred, yte))

    acc = correct * 100. / total
    p, r, f1, _ = precision_recall_fscore_support(yte, pred, average='binary', pos_label=1)
    print(f'acc={acc:.3f}% p={p:.3f} r={r:.3f} f1={f1:.3f}')

    return acc, p, r, f1

In [None]:
# Report accuracy / precision / recall / F1 of the trained classifier on the test data.
print('Verifying the performance...')
# Last expression of the cell, so the returned (acc, p, r, f1) tuple is also shown as output.
verification(base_cls, Xte, yte)