In [None]:
!pip install textcomplexity

In [None]:
!pip install --upgrade scikit-learn


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
  

In [None]:
import pickle
import numpy as np
from tqdm.auto import trange, tqdm
from features import get_transformer, merge_entries
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from utills import chunker
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

In [None]:
!pip install plotly

In [None]:
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [None]:
def fit_transformers(data_dict, data_fraction=0.01):
    docs_1 = []
    docs_2 = []
    #train dset
    num_entries = len(data_dict['text1'])
    
    for i in tqdm(range(num_entries), desc="Reading dataset"):
        if np.random.rand() < data_fraction:
            docs_1.append(data_dict['text1'][i])
            docs_2.append(data_dict['text2'][i])
           
    transformer = get_transformer()
    scaler = StandardScaler()
    secondary_scaler = StandardScaler()

    X = transformer.fit_transform(docs_1 + docs_2).todense()
    X = np.asarray(X)
    X = scaler.fit_transform(X)
    X1 = X[:len(docs_1)]
    X2 = X[len(docs_1):]
    secondary_scaler.fit(np.abs(X1 - X2))
    
    return transformer, scaler, secondary_scaler, X.shape[1]



In [None]:


def vectorize(XX, Y, ordered_idxs, transformer, scaler, secondary_scaler, data_dict, vector_Sz):
    batch_size = 10000
    docs1 = []
    docs2 = []
    idxs = []
    labels = []
    i = 0

    num_entries = len(data_dict['text1'])
    
    for idx in tqdm(range(num_entries), total=vector_Sz, desc="Vectorizing dataset"):
        docs1.append(data_dict['text1'][idx])
        docs2.append(data_dict['text2'][idx])
        labels.append(data_dict['score'][idx])
        idxs.append(ordered_idxs[i])
        i += 1
        
        if len(labels) >= batch_size:
            x1 = transformer.transform(docs1).todense()
            x2 = transformer.transform(docs2).todense()
            x1 = np.asarray(x1)
            x2 = np.asarray(x2)
            x1 = scaler.transform(x1)
            x2 = scaler.transform(x2)
            XX[idxs, :] = secondary_scaler.transform(np.abs(x1 - x2))
            Y[idxs] = labels

            docs1 = []
            docs2 = []
            idxs = []
            labels = []

        if len(labels) > 0:
            x1 = transformer.transform(docs1).todense()
            x2 = transformer.transform(docs2).todense()
            x1 = np.asarray(x1)
            x2 = np.asarray(x2)
            x1 = scaler.transform(x1)
            x2 = scaler.transform(x2)
            XX[idxs, :] = secondary_scaler.transform(np.abs(x1 - x2))
            Y[idxs] = labels
            XX[idxs, :] = secondary_scaler.transform(np.abs(x1-x2))
            Y[idxs] = labels
        XX.flush()
        Y.flush()

In [None]:
from huggingface_hub import HfApi, hf_hub_download
import pickle
import pandas as pd

# Initialize the HfApi
api = HfApi()
file_path = hf_hub_download(repo_id="swan07/process_chunks", filename="processed_eval.pkl", repo_type="dataset")
with open(file_path, "rb") as f:
    test = pickle.load(f)

In [None]:
with open('testdset.pkl', 'wb') as f:
    pickle.dump(test, f)

In [None]:

with open('testdset.pkl', "rb") as f:
    test = pickle.load(f)



In [None]:

with open('dset.pkl', "rb") as f:
    train = pickle.load(f)



In [169]:

with open('/workspace/testdsets.pickle', 'rb') as f:
    loaded_test_datasets = pickle.load(f)

In [170]:
type(loaded_test_datasets)

dict

In [None]:
test_dict = test

In [None]:
train_dict = train.to_dict()

KeyError: 'text1'

In [None]:
train_sz = len(train_dict['text1'])
test_sz = len(test_dict['text1'])

print('Train Sz:', train_sz, flush=True)
print('Test Sz:', test_sz, flush=True)

In [None]:
print('Fitting transformer...', flush=True)
transformer, scaler, secondary_scaler, feature_sz = fit_transformers(train_dict, data_fraction=0.05)


In [None]:
feature_sz

In [None]:
print('Vectorizing train set...', flush=True)
XX_train = np.memmap('vectorized_XX_train.npy', dtype='float32', mode='w+', shape=(train_sz, feature_sz))
Y_train = np.memmap('Y_train.npy', dtype='int32', mode='w+', shape=(train_sz))
train_idxs = np.array(range(train_sz))
np.random.shuffle(train_idxs)


In [None]:
trainaaa = [XX_train, 
    Y_train, 
    train_idxs, 
    transformer, 
    scaler, 
    secondary_scaler, 
    train_dict,
    train_sz]

In [None]:
from huggingface_hub import HfApi, login
api = HfApi()

In [None]:
login()

In [None]:
with open('trainaaa.pkl', 'wb') as f:
    pickle.dump(trainaaa, f)

In [None]:
api.upload_file(
    path_or_fileobj="trainaaa.pkl",
    path_in_repo="trainaaa.pkl",
    repo_id="swan07/process_chunks",
    repo_type="dataset",
)

In [None]:
vectorize(
    XX_train, 
    Y_train, 
    train_idxs, 
    transformer, 
    scaler, 
    secondary_scaler, 
    train_dict,
    train_sz
)

In [None]:
print('Vectorizing test set...', flush=True)
XX_test = np.memmap('vectorized_XX_test.npy', dtype='float32', mode='w+', shape=(test_sz, feature_sz))
Y_test = np.memmap('Y_test.npy', dtype='int32', mode='w+', shape=(test_sz))
test_idxs = np.array(range(test_sz))
np.random.shuffle(test_idxs)


In [150]:
import cupy as cp
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import numpy as np

def process_batch_gpu(batch_data, transformer, scaler, secondary_scaler):
    docs1, docs2, labels, idxs = batch_data

    print(f"Processing batch with {len(docs1)} documents")

    # Check for validity of the batch
    for doc1, doc2 in zip(docs1, docs2):
        if not doc1.get('tokens') or not doc2.get('tokens'):
            print("Skipping batch due to empty tokens.")
            return [], [], []
        if len(doc1['tokens']) != len(doc1.get('pos_tags', [])) or len(doc2['tokens']) != len(doc2.get('pos_tags', [])):
            print("Skipping batch due to mismatched lengths of tokens and pos_tags.")
            return [], [], []

    x1 = transformer.transform(docs1).todense()
    x2 = transformer.transform(docs2).todense()

    x1 = cp.asarray(x1)
    x2 = cp.asarray(x2)

    x1 = scaler.transform(cp.asnumpy(x1))
    x2 = scaler.transform(cp.asnumpy(x2))

    return idxs, secondary_scaler.transform(np.abs(x1 - x2)), labels

def vectorize_gpu(XX, Y, ordered_idxs, transformer, scaler, secondary_scaler, data_dict, batch_size=50000):
    docs1 = []
    docs2 = []
    idxs = []
    labels = []
    i = 0

    num_entries = len(data_dict['text1'])

    progress_bar = tqdm(total=num_entries, desc="Vectorizing dataset", ncols=100)

    for idx in range(num_entries):
        docs1.append(data_dict['text1'][idx])
        docs2.append(data_dict['text2'][idx])
        labels.append(data_dict['score'][idx])
        idxs.append(ordered_idxs[i])
        i += 1

        if len(labels) >= batch_size:
            batch_data = (docs1, docs2, labels, idxs)
            idxs, transformed_data, batch_labels = process_batch_gpu(batch_data, transformer, scaler, secondary_scaler)
            if idxs:  # Only update if the batch was processed successfully
                XX[idxs, :] = transformed_data
                Y[idxs] = batch_labels

            docs1 = []
            docs2 = []
            idxs = []
            labels = []

            progress_bar.update(len(idxs))  # Manually update progress

    # Handle remaining data
    if len(labels) > 0:
        batch_data = (docs1, docs2, labels, idxs)
        idxs, transformed_data, batch_labels = process_batch_gpu(batch_data, transformer, scaler, secondary_scaler)
        if idxs:  # Only update if the batch was processed successfully
            XX[idxs, :] = transformed_data
            Y[idxs] = batch_labels
            progress_bar.update(len(idxs))  # Manually update progress

    progress_bar.close()
    XX.flush()
    Y.flush()
    
batch_size = 5000  # Increased batch size

In [None]:

vectorize_gpu(
    XX_test, 
    Y_test, 
    test_idxs, 
    transformer, 
    scaler, 
    secondary_scaler, 
    test_dict,
    batch_size
)

In [140]:
print(3)

3


In [151]:

vectorize_gpu(
    XX_train, 
    Y_train, 
    train_idxs, 
    transformer, 
    scaler, 
    secondary_scaler, 
    train_dict,
    batch_size
)














Vectorizing dataset:   0%|                                                | 0/97584 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents


Vectorizing dataset:   0%|                                              | 0/30780 [1:46:29<?, ?it/s]
Vectorizing dataset:   0%|                                              | 0/30780 [1:44:18<?, ?it/s]
Vectorizing dataset:   0%|                                              | 0/30780 [1:42:59<?, ?it/s]
Vectorizing dataset:   0%|                                              | 0/30780 [1:42:42<?, ?it/s]
Vectorizing dataset:   0%|                                              | 0/30780 [1:42:32<?, ?it/s]
Vectorizing dataset:   0%|                                              | 0/30780 [1:41:55<?, ?it/s]
Vectorizing dataset:   0%|                                              | 0/30780 [1:42:25<?, ?it/s]
Vectorizing dataset:   0%|                                                | 0/97584 [58:27<?, ?it/s]
Vectorizing dataset:   0%|                                                | 0/30780 [30:33<?, ?it/s]
Vectorizing dataset:   0%|                                                | 0/97584 [28:43<

Processing batch with 5000 documents
Skipping batch due to mismatched lengths of tokens and pos_tags.
Processing batch with 5000 documents
Skipping batch due to empty tokens.
Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [02:01<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents
Skipping batch due to empty tokens.
Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [03:04<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents
Skipping batch due to empty tokens.
Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [03:57<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [06:13<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [07:22<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [08:58<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [10:09<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [11:19<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [14:56<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [27:15<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [40:38<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                                | 0/97584 [53:21<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                              | 0/97584 [1:06:13<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 5000 documents















Vectorizing dataset:   0%|                                              | 0/97584 [1:19:18<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing batch with 2584 documents















Vectorizing dataset:   3%|▊                               | 2584/97584 [1:25:56<52:39:51,  2.00s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


In [None]:

# vectorize(
#     XX_test, 
#     Y_test, 
#     test_idxs, 
#     transformer, 
#     scaler, 
#     secondary_scaler, 
#     test_dict,
#     test_sz
# )

In [153]:
print(1)

1


In [155]:
print('Tuning parameters...', flush=True)


param_dist = {'alpha': loguniform(1e-4, 1e0)}
batch_size=100
clf = SGDClassifier(loss='log_loss', alpha=0.01)
n_iter_search = 2
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, verbose=2)
for idxs in chunker(range(train_sz), batch_size):
        random_search.fit(XX_train[idxs, :], Y_train[idxs])
        break

print('Best params:', random_search.best_params_)


Tuning parameters...
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END ........................alpha=0.0002069836309284027; total time=   0.0s
[CV] END ........................alpha=0.0002069836309284027; total time=   0.0s
[CV] END ........................alpha=0.0002069836309284027; total time=   0.1s
[CV] END ........................alpha=0.0002069836309284027; total time=   0.0s
[CV] END ........................alpha=0.0002069836309284027; total time=   0.0s
[CV] END ...........................alpha=0.3257111378906439; total time=   0.1s
[CV] END ...........................alpha=0.3257111378906439; total time=   0.0s
[CV] END ...........................alpha=0.3257111378906439; total time=   0.0s
[CV] END ...........................alpha=0.3257111378906439; total time=   0.1s
[CV] END ...........................alpha=0.3257111378906439; total time=   0.1s
Best params: {'alpha': 0.0002069836309284027}


In [164]:
print('Training classifier...', flush=True)
clf = SGDClassifier(loss='log_loss', alpha=random_search.best_params_['alpha'])
batch_size=50000
num_epochs = 50
aucs = []
for i in trange(num_epochs):
    print('Epoch - ', i)
    print('-' * 30)
    for idxs in chunker(range(train_sz), batch_size):
        clf.partial_fit(XX_train[idxs, :], Y_train[idxs], classes=[0, 1])

    probs = clf.predict_proba(XX_test)[:, 1]
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    print('AUC: ', roc_auc)
    with open('/workspace/featurevector/temp_data/experiment_data.p', 'wb') as f:
        pickle.dump((
            aucs,
            clf,
            roc_auc,
            transformer, 
            scaler,
            secondary_scaler,
            feature_sz,
            train_sz,
            train_idxs,
            test_sz,
            test_idxs
        ), f)

Training classifier...


  0%|          | 0/50 [00:00<?, ?it/s]

Epoch -  0
------------------------------
AUC:  0.570031163380811
Epoch -  1
------------------------------
AUC:  0.6144407247670979
Epoch -  2
------------------------------
AUC:  0.600848206765084
Epoch -  3
------------------------------
AUC:  0.6101171930580097
Epoch -  4
------------------------------
AUC:  0.6255071343073737
Epoch -  5
------------------------------
AUC:  0.6322024796441608
Epoch -  6
------------------------------
AUC:  0.6444499542956785
Epoch -  7
------------------------------
AUC:  0.6548354116661073
Epoch -  8
------------------------------
AUC:  0.6569468246579565
Epoch -  9
------------------------------
AUC:  0.627805852686422
Epoch -  10
------------------------------
AUC:  0.6428407019677135
Epoch -  11
------------------------------
AUC:  0.6209748003470572
Epoch -  12
------------------------------
AUC:  0.6515784034942167
Epoch -  13
------------------------------
AUC:  0.65471170600616
Epoch -  14
------------------------------
AUC:  0.645110672011

In [None]:
go.Figure(go.Scatter(
    x=np.arange(len(aucs)),
    y=aucs
))

In [166]:
with open('large_model.p', 'wb') as f:
    pickle.dump((clf, transformer, scaler, secondary_scaler), f)

In [167]:
from pan20_verif_evaluator import evaluate_all
results = evaluate_all(Y_test, probs)
print(results)

{'auc': 0.646, 'c@1': 0.599, 'f_05_u': 0.606, 'F1': 0.653, 'brier': 0.627, 'overall': 0.626}


In [193]:
del loaded_test_datasets['pan14']


In [188]:
for name, dataset in loaded_test_datasets.items():
    print(f"Evaluating {name}")


Evaluating darkreddit
Evaluating imdb
Evaluating pan11
Evaluating pan13
Evaluating pan14
Evaluating pan15
Evaluating pan20
Evaluating reuters
Evaluating victorian


In [None]:
from features import prepare_entry
def vectorize_and_evaluate(dataset, transformer, scaler, secondary_scaler, clf):
    probs = []
    Y_test = []

    for example in tqdm(dataset, desc="Processing entries"):
        text1 = example['text1']
        text2 = example['text2']
        score = example['same']

        

        preprocessed_doc1 = prepare_entry(text1, mode='fast', tokenizer='casual')
        preprocessed_doc2 = prepare_entry(text2, mode='fast', tokenizer='casual')

        if not preprocessed_doc1 or not preprocessed_doc2:
            print(f"Warning: parsing empty text for example with score {score}")
            continue

        if len(preprocessed_doc1['tokens']) == 0 or len(preprocessed_doc2['tokens']) == 0:
            print(f"Warning: parsing empty text for example with score {score}")
            continue

        try: 
            X1 = np.asarray(transformer.transform([preprocessed_doc1]).todense())
            X2 = np.asarray(transformer.transform([preprocessed_doc2]).todense())
        except:
            print(preprocessed_doc1)
            print(preprocessed_doc2)
            continue
        
        
        X1 = scaler.transform(X1)
        X2 = scaler.transform(X2)
        
        X = secondary_scaler.transform(np.abs(X1 - X2))
        
        prob = clf.predict_proba(X)[0, 1]
        probs.append(prob)
        Y_test.append(score)

    return Y_test, probs


# Evaluate all datasets
results_dict = {}
for name, dataset in loaded_test_datasets.items():
    print(f"Evaluating {name}")
    Y_test, probs = vectorize_and_evaluate(dataset, transformer, scaler, secondary_scaler, clf)
    results = evaluate_all(Y_test, probs)
    results_dict[name] = results
    print(results)

# Print all results
for dataset_name, results in results_dict.items():
    print(f"Results for {dataset_name}:")
    print(results)


Evaluating pan15


Processing entries: 100%|██████████| 200/200 [00:22<00:00,  8.71it/s]


{'auc': 0.629, 'c@1': 0.565, 'f_05_u': 0.596, 'F1': 0.636, 'brier': 0.601, 'overall': 0.605}
Evaluating pan20


Processing entries:  89%|████████▊ | 12148/13704 [1:39:28<11:18,  2.29it/s]  

In [None]:
print(3)