In [1]:
import transformers
import torch 
import gc 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def chunk_text(text, tokenizer, max_length=500):
    """
    This function chunks the input text into smaller pieces that fit the model's token limit.
    """
    # Tokenize the entire text and get input IDs
    inputs = tokenizer.encode(text, add_special_tokens=False)  # Do not add [CLS] and [SEP] tokens yet
    
    # Split the tokens into chunks of size max_length - 2 to account for [CLS] and [SEP]
    chunks = [inputs[i:i + (max_length - 2)] for i in range(0, len(inputs), max_length - 2)]
    
    # Reconstruct the chunks into proper inputs by adding special tokens ([CLS], [SEP])
    chunked_texts = [tokenizer.decode([tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id], 
                                      skip_special_tokens=False) for chunk in chunks]
    
    return chunked_texts


def aggregate_results(results):
    # Initialize a dictionary to store the sum of scores for each label
    total_scores = {'LEFT': 0, 'CENTER': 0, 'RIGHT': 0}
    
    # Sum the scores for each label across all chunks
    for result in results:
        for bias in result:
            
            if bias['label'] == 'LEFT':
                total_scores['LEFT'] += bias['score']
            elif bias['label'] == 'RIGHT':
                total_scores['RIGHT'] += bias['score']
            else: 
                total_scores['CENTER'] += bias['score']
                
    # Calculate the average score for each label
    for label in total_scores:
        total_scores[label] /= len(results)
    
    return total_scores


In [3]:
model_id = 'bucketresearch/politicalBiasBERT'
access_token = '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'

In [4]:
gc.collect()
torch.cuda.empty_cache()

In [5]:
# Load the model and tokenizer with the access token
model = transformers.TFAutoModelForSequenceClassification.from_pretrained(
    model_id,
    token=access_token
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=access_token,
    truncation=True
)

# Create the pipeline with the specified model and tokenizer
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1
)

2024-10-31 17:51:32.354753: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-31 17:51:32.370685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-31 17:51:32.390054: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-31 17:51:32.396021: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-31 17:51:32.410068: I tensorflow/core/platform/cpu_feature_guar

In [6]:
with open('output.txt','r') as f:
  original_content = f.read()


In [7]:
with open('mistral_output.txt', 'r') as file:
    modified_content = file.read()

In [8]:
original_chunks = chunk_text(original_content, tokenizer)
modified_chunks = chunk_text(modified_content, tokenizer)

In [9]:
print(original_chunks[0])
print(original_chunks[1])
print(len(original_chunks))

[CLS] the house leadership has started debating thursday on a border security bill to address the immigration crisis, but conservatives, led by texas sen. ted cruz, say they may refuse to sign - on without language that would put an end to what they call the ` ` obama amnesty ' ' law. cruz convened a meetin g wednesday night with 11 rank - and - file house members to try to persuade them to vote against house speaker john boehner ' s version of the bill on the basis that the measure does not defund president barack obama ' s deferred action for child arrivals [ daca ] executive order. the current leadership b ill leaves daca intact but simply prohibits any official policies to expand its scope, a position conservatives say is not acceptable. speaker john bo ehner gave indications he might give cruz a concession by adding a vote on a second bill on thursday. ` ` i have been speaking with members in both ho uses who have an interest in my views, ' ' cruz told reuters late on wednesday. t

In [10]:
# Batch processing for original_content
original_results = pipeline(original_chunks, batch_size=4)

# Batch processing for modified_content
modified_results = pipeline(modified_chunks, batch_size=4)

TypeError: _batch_encode_plus() got an unexpected keyword argument 'temperature'

In [10]:
print(original_results)

[[{'label': 'LEFT', 'score': 0.09173677116632462}, {'label': 'CENTER', 'score': 0.1815633773803711}, {'label': 'RIGHT', 'score': 0.7266998291015625}], [{'label': 'LEFT', 'score': 0.08783844113349915}, {'label': 'CENTER', 'score': 0.7113876342773438}, {'label': 'RIGHT', 'score': 0.20077389478683472}]]


In [12]:
print(modified_results)

[[{'label': 'LEFT', 'score': 0.897503137588501}, {'label': 'CENTER', 'score': 0.06844838708639145}, {'label': 'RIGHT', 'score': 0.03404853492975235}], [{'label': 'LEFT', 'score': 0.6523699760437012}, {'label': 'CENTER', 'score': 0.24558360874652863}, {'label': 'RIGHT', 'score': 0.10204644501209259}]]


In [17]:
original_results = aggregate_results(original_results)
modified_resutls = aggregate_results(modified_results)

print('origianl text: ', original_results)
print('modified_content:', modified_resutls)

origianl text:  {'LEFT': 0.08978760614991188, 'CENTER': 0.4464755058288574, 'RIGHT': 0.4637368619441986}
modified_content: {'LEFT': 0.7749365568161011, 'CENTER': 0.15701599791646004, 'RIGHT': 0.06804748997092247}
