In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import TweetTokenizer
from features import get_transformer, prepare_entry
from tqdm import  tqdm
import json
import re
import os
import string
import argparse
import sys


In [2]:
# Path of the pickled model bundle. A later cell opens this file in binary mode and
# unpickles a 4-tuple from it: (clf, transformer, scaler, secondary_scaler).
MODEL_FILE = 'large_model.p'


In [3]:
def process_batch(transformer, scaler, secondary_scaler, clf, ids, preprocessed_docs1, preprocessed_docs2, output_file):
    """Score a batch of document pairs and append one JSON line per pair to a file.

    Both document lists are vectorised with ``transformer`` and scaled with
    ``scaler``; the element-wise absolute difference of the two matrices is
    scaled with ``secondary_scaler`` and passed to ``clf.predict_proba``.
    Column 1 of the returned probabilities is written as the pair's ``value``.

    Args:
        transformer: Object whose ``transform(docs)`` returns something with a
            ``todense()`` method.
        scaler: Object with ``transform``, applied to each document matrix.
        secondary_scaler: Object with ``transform``, applied to the absolute
            difference matrix.
        clf: Object with ``predict_proba`` returning a 2-D array.
        ids: Sequence of pair identifiers, one per row.
        preprocessed_docs1: First document of every pair.
        preprocessed_docs2: Second document of every pair.
        output_file: Path of the JSONL file; opened in append mode.

    Progress messages are written to stderr.
    """
    print('Extracting features:', len(ids), file=sys.stderr)
    if len(ids) == 0:
        # Nothing to score: skip the model calls instead of feeding them empty input.
        return

    # todense() yields an np.matrix; convert to a plain ndarray before handing it to
    # the scalers, which newer scikit-learn releases require.
    X1 = scaler.transform(np.asarray(transformer.transform(preprocessed_docs1).todense()))
    X2 = scaler.transform(np.asarray(transformer.transform(preprocessed_docs2).todense()))
    X = secondary_scaler.transform(np.abs(X1 - X2))

    print('Predicting...', file=sys.stderr)
    probs = clf.predict_proba(X)[:, 1]

    print('Writing to', output_file, file=sys.stderr)
    with open(output_file, 'a') as f:
        for i, pair_id in enumerate(ids):
            # float(): numpy scalars other than float64 are not JSON serialisable.
            json.dump({'id': pair_id, 'value': float(probs[i])}, f)
            f.write('\n')

In [23]:
def process_single_entry(transformer, scaler, secondary_scaler, clf, idx, preprocessed_doc1, preprocessed_doc2, f_output_file):
    """Score one document pair and write the result as a JSON line.

    Each document is vectorised with ``transformer`` and scaled with ``scaler``;
    the absolute difference of the two vectors is scaled with
    ``secondary_scaler`` and passed to ``clf.predict_proba``. The probability in
    row 0, column 1 is written as ``value``.

    Any exception raised while predicting is printed and the value falls back
    to 0.5, so one bad entry does not abort the whole run.

    Args:
        transformer: Object whose ``transform([doc])`` returns something with a
            ``todense()`` method.
        scaler: Object with ``transform``, applied to each document vector.
        secondary_scaler: Object with ``transform``, applied to the difference.
        clf: Object with ``predict_proba`` returning a 2-D array.
        idx: Identifier written under the ``id`` key.
        preprocessed_doc1: First document of the pair.
        preprocessed_doc2: Second document of the pair.
        f_output_file: Open, writable text file object.
    """
    try:
        X1 = scaler.transform(np.asarray(transformer.transform([preprocessed_doc1]).todense()))
        X2 = scaler.transform(np.asarray(transformer.transform([preprocessed_doc2]).todense()))

        # Absolute difference of the scaled vectors, then secondary scaling
        X = secondary_scaler.transform(np.abs(X1 - X2))

        # float(): json.dump below runs outside this try block, and numpy scalars
        # other than float64 are not JSON serialisable.
        prob = float(clf.predict_proba(X)[0, 1])
    except Exception as e:
        print('Exception predicting:', e)
        prob = 0.5

    print(prob)
    json.dump({'id': idx, 'value': prob}, f_output_file)
    f_output_file.write('\n')

In [25]:
# parser = argparse.ArgumentParser(description='Prediction Script: PAN 2021')
# parser.add_argument('-i', type=str,
#                     help='Evaluation dir')
# parser.add_argument('-o', type=str, 
#                     help='Output dir')
# args = parser.parse_args()

# # validate:
# if not args.i:
#     raise ValueError('Eval dir path is required')
# if not args.o:
#     raise ValueError('Output dir path is required')
    
    
# input_file = os.path.join(args.i, 'pairs.jsonl')
# output_file = os.path.join(args.o, 'answers.jsonl')

# Input pairs (one JSON object per line) and the JSONL file the answers are written to.
input_file = 'input.jsonl'
output_file = 'answers2.jsonl'
print("Writing answers to:", output_file, file=sys.stdout, flush=True)

# NOTE(security): pickle.load can execute arbitrary code from the file it reads.
# Only load a MODEL_FILE that comes from a trusted source.
with open(MODEL_FILE, 'rb') as f:
    clf, transformer, scaler, secondary_scaler = pickle.load(f)

# Read the input as UTF-8 explicitly so decoding does not depend on the platform default.
with open(input_file, 'r', encoding='utf-8') as f, open(output_file, 'w') as f_output_file:
    for i, l in enumerate(tqdm(f)):
        if i % 100 == 0:
            print(i, flush=True)
        if not l.strip():
            # Blank lines carry no entry and would make json.loads raise.
            continue
        d = json.loads(l)
        idx = d['id']
        preprocessed_doc1 = prepare_entry(d['text1'], mode='accurate', tokenizer='casual')
        preprocessed_doc2 = prepare_entry(d['text2'], mode='accurate', tokenizer='casual')
        process_single_entry(transformer, scaler, secondary_scaler, clf, idx, preprocessed_doc1, preprocessed_doc2, f_output_file)

print("Execution complete", file=sys.stderr)
            

Writing answers to: answers2.jsonl


0it [00:00, ?it/s]

0


5it [00:00, 54.54it/s]

1.0
0.9999999999999989
1.0
3.9468433133058235e-17
1.0



Execution complete


In [32]:
import numpy as np
import json
from tqdm import tqdm

# Define your custom transformers and get_transformer function here
# ...

def process_single_entry(transformer, scaler, secondary_scaler, clf, idx, preprocessed_doc1, preprocessed_doc2, f_output_file):
    """Score one document pair and write the result, with an explanation, as a JSON line.

    Each document is vectorised with ``transformer`` and scaled with ``scaler``;
    the absolute difference of the two vectors is scaled with
    ``secondary_scaler`` and passed to ``clf.predict_proba``. The probability in
    row 0, column 1 is written as ``value``. ``explanation`` is the flattened
    ``clf.coef_`` as a list.

    If predicting raises, the exception is printed, ``value`` falls back to 0.5
    and ``explanation`` is ``None``. If only the explanation cannot be built
    (for example ``clf`` has no ``coef_``), the predicted value is kept and
    ``explanation`` is ``None``.

    Args:
        transformer: Object whose ``transform([doc])`` returns something with a
            ``todense()`` method.
        scaler: Object with ``transform``, applied to each document vector.
        secondary_scaler: Object with ``transform``, applied to the difference.
        clf: Object with ``predict_proba``; ``coef_`` is read for the explanation.
        idx: Identifier written under the ``id`` key.
        preprocessed_doc1: First document of the pair.
        preprocessed_doc2: Second document of the pair.
        f_output_file: Open, writable text file object.
    """
    explanation = None
    try:
        X1 = scaler.transform(np.asarray(transformer.transform([preprocessed_doc1]).todense()))
        X2 = scaler.transform(np.asarray(transformer.transform([preprocessed_doc2]).todense()))
        X = secondary_scaler.transform(np.abs(X1 - X2))

        # float(): numpy scalars other than float64 are not JSON serialisable.
        prob = float(clf.predict_proba(X)[0, 1])
    except Exception as e:
        print('Exception predicting:', e)
        prob = 0.5
    else:
        # Built in its own try block so that a failure here cannot overwrite a
        # probability that was predicted successfully.
        try:
            explanation = clf.coef_.flatten().tolist()
        except Exception as e:
            print('Exception building explanation:', e)

    json.dump({'id': idx, 'value': prob, 'explanation': explanation}, f_output_file)
    f_output_file.write('\n')

def process_file(file_path, transformer, scaler, secondary_scaler, clf, output_file_path):
    """Score every pair in a JSONL file and write one JSON result line per pair.

    Each non-blank input line is parsed as a JSON object with the keys ``id``,
    ``text1`` and ``text2``. Both texts are preprocessed with ``prepare_entry``
    and the pair is handed to ``process_single_entry``, which writes the result.

    Args:
        file_path: Path of the input JSONL file.
        transformer: Passed through to ``process_single_entry``.
        scaler: Passed through to ``process_single_entry``.
        secondary_scaler: Passed through to ``process_single_entry``.
        clf: Passed through to ``process_single_entry``.
        output_file_path: Path of the output file; truncated if it exists.
    """
    # Read the input as UTF-8 explicitly so decoding does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f, open(output_file_path, 'w') as f_output_file:
        for i, l in enumerate(tqdm(f, desc="Processing entries")):
            if i % 100 == 0:
                print(i, flush=True)

            if not l.strip():
                # Blank lines carry no entry and would make json.loads raise.
                continue

            d = json.loads(l)
            idx = d['id']
            preprocessed_doc1 = prepare_entry(d['text1'], mode='accurate', tokenizer='casual')
            preprocessed_doc2 = prepare_entry(d['text2'], mode='accurate', tokenizer='casual')
            process_single_entry(transformer, scaler, secondary_scaler, clf, idx, preprocessed_doc1, preprocessed_doc2, f_output_file)

# Example usage: score every pair in 'input.jsonl' and write the results to 'answers1.jsonl'.
file_path = 'input.jsonl'
output_file_path = 'answers1.jsonl'

# `transformer`, `scaler`, `secondary_scaler` and `clf` are not defined in this cell;
# they must already exist in the kernel (an earlier cell unpickles them from MODEL_FILE).
process_file(file_path, transformer, scaler, secondary_scaler, clf, output_file_path)


Processing entries: 0it [00:00, ?it/s]

0


Processing entries: 5it [00:00, 15.82it/s]


In [35]:
# import numpy as np
# from sklearn.pipeline import FeatureUnion

# def get_all_feature_names(feature_union):
#     feature_names = []
#     for name, transformer in feature_union.transformer_list:
#         if hasattr(transformer, 'get_feature_names_out'):
#             feature_names.extend(transformer.get_feature_names_out())
#         else:
#             feature_names.append(name)
#     return np.array(feature_names)

# # Assuming your FeatureUnion is called `transformer`
# all_feature_names = get_all_feature_names(transformer)

# # Calculate feature importances and map them to feature names
# feature_importances = clf.coef_.flatten()
# importance_dict = dict(zip(all_feature_names, feature_importances))

# # Printing feature importances
# for feature, importance in importance_dict.items():
#     print(f"Feature: {feature}, Importance: {importance}")


In [36]:
# import numpy as np
# from sklearn.pipeline import FeatureUnion

# def get_all_feature_names(feature_union):
#     feature_names = []
#     for name, transformer in feature_union.transformer_list:
#         if hasattr(transformer, 'get_feature_names_out'):
#             feature_names.extend(transformer.get_feature_names_out())
#         else:
#             feature_names.append(name)
#     return np.array(feature_names)

# # Assuming your FeatureUnion is called `transformer`
# all_feature_names = get_all_feature_names(transformer)

# # Calculate feature importances and map them to feature names
# feature_importances = clf.coef_.flatten()
# importance_dict = dict(zip(all_feature_names, feature_importances))

# # Sorting the features by their importance
# sorted_features = sorted(importance_dict.items(), key=lambda item: abs(item[1]), reverse=True)

# # Getting the top 10 most important features
# top_10_features = sorted_features[:10]

# # Printing the top 10 most important features
# for feature, importance in top_10_features:
#     print(f"Feature: {feature}, Importance: {importance}")


Feature: specifying, Importance: 5.5193610191345215
Feature: .., Importance: -2.052586317062378
Feature: entropy, Importance: 1.7252956628799438
Feature: viz, Importance: -1.7221406698226929
Feature: whereafter, Importance: -1.5986037254333496
Feature: : NN, Importance: -1.3982906341552734
Feature: just, Importance: -1.3504716157913208
Feature:  i', Importance: -1.3381776809692383
Feature: nv, Importance: -1.2608269453048706
Feature: o,, Importance: 1.256258487701416


In [37]:
import numpy as np
from sklearn.pipeline import FeatureUnion

def get_all_feature_names(feature_union):
    """Collect the feature names of every sub-transformer into one flat array.

    Iterates ``feature_union.transformer_list`` in order. A sub-transformer that
    has ``get_feature_names_out`` contributes all names it reports; one without
    it contributes its own step name as a single placeholder. Entries set to the
    string ``'drop'`` are skipped, since a dropped step produces no columns and
    a name for it would shift every later name by one.

    Args:
        feature_union: Object with a ``transformer_list`` attribute holding
            ``(name, transformer)`` pairs.

    Returns:
        ``np.ndarray`` of the collected names.
    """
    feature_names = []
    for name, transformer in feature_union.transformer_list:
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            feature_names.extend(transformer.get_feature_names_out())
        else:
            # NOTE(review): a single placeholder is only aligned with the feature
            # matrix if this step emits exactly one column - confirm for each step.
            feature_names.append(name)
    return np.array(feature_names)

# Assuming your FeatureUnion is called `transformer`
all_feature_names = get_all_feature_names(transformer)

# The classifier's coefficients, flattened to one value per position
feature_importances = clf.coef_.flatten()

# zip() stops at the shorter sequence, so a length mismatch would otherwise go unnoticed.
if len(all_feature_names) != len(feature_importances):
    print(
        f"Warning: {len(all_feature_names)} feature names but "
        f"{len(feature_importances)} coefficients; names may be misaligned",
        file=sys.stderr,
    )

# Pair names with coefficients by position. The ranking below works on this list rather
# than on a dict, because different sub-transformers can report the same feature name
# and a dict would keep only one coefficient per name.
feature_importance_pairs = list(zip(all_feature_names, feature_importances))
importance_dict = dict(feature_importance_pairs)  # name -> coefficient (last one wins on duplicates)

# Sorting the features by their absolute importance
sorted_features = sorted(feature_importance_pairs, key=lambda item: abs(item[1]), reverse=True)

# Getting the top 10 most important features
top_10_features = sorted_features[:10]

print("Top 10 Features (Absolute Importance):")
for feature, importance in top_10_features:
    print(f"Feature: {feature}, Importance: {importance}")

# Separating positive and negative importances (both stay ordered by absolute value)
positive_importances = [(feature, importance) for feature, importance in sorted_features if importance > 0]
negative_importances = [(feature, importance) for feature, importance in sorted_features if importance < 0]

print("\nTop 5 Positive Features:")
for feature, importance in positive_importances[:5]:
    print(f"Feature: {feature}, Importance: {importance}")

print("\nTop 5 Negative Features:")
for feature, importance in negative_importances[:5]:
    print(f"Feature: {feature}, Importance: {importance}")


Top 10 Features (Absolute Importance):
Feature: specifying, Importance: 5.5193610191345215
Feature: .., Importance: -2.052586317062378
Feature: entropy, Importance: 1.7252956628799438
Feature: viz, Importance: -1.7221406698226929
Feature: whereafter, Importance: -1.5986037254333496
Feature: : NN, Importance: -1.3982906341552734
Feature: just, Importance: -1.3504716157913208
Feature:  i', Importance: -1.3381776809692383
Feature: nv, Importance: -1.2608269453048706
Feature: o,, Importance: 1.256258487701416

Top 5 Positive Features:
Feature: specifying, Importance: 5.5193610191345215
Feature: entropy, Importance: 1.7252956628799438
Feature: o,, Importance: 1.256258487701416
Feature: squ, Importance: 1.1695750951766968
Feature: e., Importance: 1.1690033674240112

Top 5 Negative Features:
Feature: .., Importance: -2.052586317062378
Feature: viz, Importance: -1.7221406698226929
Feature: whereafter, Importance: -1.5986037254333496
Feature: : NN, Importance: -1.3982906341552734
Feature: just,

In [40]:
import json
import numpy as np
from tqdm import tqdm

def get_top_features(differences, coef, feature_names, top_n=10):
    """Return the ``top_n`` features with the largest ``|difference * coefficient|``.

    Args:
        differences: 1-D array-like of per-feature differences.
        coef: 1-D array-like of per-feature coefficients, same length.
        feature_names: Sequence indexable by position, giving each feature's name.
        top_n: Maximum number of features to return. Zero or a negative value
            yields an empty list.

    Returns:
        List of ``(feature_name, importance)`` tuples ordered from the largest
        importance to the smallest; importances are native Python floats.
    """
    # Importance of each feature: magnitude of difference times coefficient
    importances = np.abs(np.asarray(differences) * np.asarray(coef))

    # Guard needed because the slice [-0:] below would select every element, not none.
    if top_n <= 0 or importances.size == 0:
        return []

    top_indices = np.argsort(importances)[-top_n:][::-1]
    # float() converts numpy scalars to native floats so the result is JSON serialisable
    return [(feature_names[i], float(importances[i])) for i in top_indices]

def process_single_entry(transformer, scaler, secondary_scaler, clf, idx, preprocessed_doc1, preprocessed_doc2, f_output_file, all_feature_names):
    """Score one document pair and write the result, with its top features, as a JSON line.

    Each document is vectorised with ``transformer`` and scaled with ``scaler``;
    the absolute difference of the two vectors is scaled with
    ``secondary_scaler`` and passed to ``clf.predict_proba``. The probability in
    row 0, column 1 is written as ``value``. ``top_features`` holds up to ten
    ``{'feature', 'importance'}`` entries obtained from ``get_top_features``.

    If predicting raises, the exception is printed, ``value`` falls back to 0.5
    and ``top_features`` is empty. If only the feature extraction fails, the
    predicted value is kept and ``top_features`` is empty.

    Args:
        transformer: Object whose ``transform([doc])`` returns something with a
            ``todense()`` method.
        scaler: Object with ``transform``, applied to each document vector.
        secondary_scaler: Object with ``transform``, applied to the difference.
        clf: Object with ``predict_proba``; ``coef_`` is read for the importances.
        idx: Identifier written under the ``id`` key.
        preprocessed_doc1: First document of the pair.
        preprocessed_doc2: Second document of the pair.
        f_output_file: Open, writable text file object.
        all_feature_names: Feature names indexable by feature position.
    """
    top_features = []
    try:
        X1 = scaler.transform(np.asarray(transformer.transform([preprocessed_doc1]).todense()))
        X2 = scaler.transform(np.asarray(transformer.transform([preprocessed_doc2]).todense()))
        differences = np.abs(X1 - X2)
        X = secondary_scaler.transform(differences)
        prob = float(clf.predict_proba(X)[0, 1])  # Convert to native float
    except Exception as e:
        print('Exception predicting:', e)
        prob = 0.5
    else:
        # Extracted in its own try block so that a failure here cannot overwrite a
        # probability that was predicted successfully.
        try:
            # NOTE(review): importances use the differences *before* secondary scaling,
            # while clf is fed the scaled values - confirm this is intended.
            top_features = get_top_features(differences.flatten(), clf.coef_.flatten(), all_feature_names, top_n=10)
        except Exception as e:
            print('Exception extracting top features:', e)
            top_features = []

    d = {
        'id': idx,
        'value': prob,
        'top_features': [{'feature': feature, 'importance': importance} for feature, importance in top_features]
    }

    json.dump(d, f_output_file)
    f_output_file.write('\n')

def main(input_file, output_file, transformer, scaler, secondary_scaler, clf):
    """Score every pair in a JSONL file and write results with top features.

    Feature names are collected once from ``transformer`` via
    ``get_all_feature_names``. Each non-blank input line is parsed as a JSON
    object with the keys ``id``, ``text1`` and ``text2``; both texts are
    preprocessed with ``prepare_entry`` and the pair is handed to
    ``process_single_entry``, which writes the result line.

    Args:
        input_file: Path of the input JSONL file.
        output_file: Path of the output file; truncated if it exists.
        transformer: Feature transformer; also the source of the feature names.
        scaler: Passed through to ``process_single_entry``.
        secondary_scaler: Passed through to ``process_single_entry``.
        clf: Passed through to ``process_single_entry``.
    """
    all_feature_names = get_all_feature_names(transformer)

    # Read the input as UTF-8 explicitly so decoding does not depend on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f_input_file, open(output_file, 'w') as f_output_file:
        for line in tqdm(f_input_file, desc="Processing entries"):
            if not line.strip():
                # Blank lines carry no entry and would make json.loads raise.
                continue
            d = json.loads(line)
            idx = d['id']
            preprocessed_doc1 = prepare_entry(d['text1'], mode='accurate', tokenizer='casual')
            preprocessed_doc2 = prepare_entry(d['text2'], mode='accurate', tokenizer='casual')
            process_single_entry(transformer, scaler, secondary_scaler, clf, idx, preprocessed_doc1, preprocessed_doc2, f_output_file, all_feature_names)

def get_all_feature_names(feature_union):
    """Collect the feature names of every sub-transformer into one flat array.

    Iterates ``feature_union.transformer_list`` in order. A sub-transformer that
    has ``get_feature_names_out`` contributes all names it reports; one without
    it contributes its own step name as a single placeholder. Entries set to the
    string ``'drop'`` are skipped, since a dropped step produces no columns and
    a name for it would shift every later name by one.

    Args:
        feature_union: Object with a ``transformer_list`` attribute holding
            ``(name, transformer)`` pairs.

    Returns:
        ``np.ndarray`` of the collected names.
    """
    feature_names = []
    for name, transformer in feature_union.transformer_list:
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            feature_names.extend(transformer.get_feature_names_out())
        else:
            # NOTE(review): a single placeholder is only aligned with the feature
            # matrix if this step emits exactly one column - confirm for each step.
            feature_names.append(name)
    return np.array(feature_names)

# Example usage: score every pair in 'input.jsonl' and write the results to 'output.jsonl'.
# Note that this rebinds the module-level names `input_file` and `output_file`.
input_file = 'input.jsonl'
output_file = 'output.jsonl'
# `transformer`, `scaler`, `secondary_scaler` and `clf` are not defined in this cell;
# they must already exist in the kernel (an earlier cell unpickles them from MODEL_FILE).
main(input_file, output_file, transformer, scaler, secondary_scaler, clf)


Processing entries: 5it [00:00, 45.86it/s]
