In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:
import pandas as pd
import json
import re
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from tqdm.auto import tqdm


class QaModel:
    def __init__(self, model_name, num_answers=1):
        self.model_name = model_name
        self.num_answers = num_answers
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name)
        self.qa_pipeline = pipeline("question-answering", model=self.model, tokenizer=self.tokenizer, top_k=self.num_answers)

    def predict(self, question, context):
        return self.qa_pipeline(question=question, context=context)

def get_phrase(row, model_phrase):
    question = row.get('postText')[0]
    context = ' '.join(row.get('targetParagraphs'))
    result = model_phrase.predict(question, context)
    print(f"get_phrase result: {result}")  # Debugging statement
    answer = result['answer'] if 'answer' in result else 'No spoiler found'
    return [answer]

def get_passage(row, model_passage):
    question = row.get('postText')[0]
    context = ' '.join(row.get('targetParagraphs'))
    result = model_passage.predict(question, context)
    print(f"get_passage result: {result}")  # Debugging statement
    answer = result['answer'] if 'answer' in result else 'No spoiler found'
    candidates = [sentence.strip() for sentence in context.split('.') if answer in sentence]
    return [candidates[0] if candidates else 'No spoiler found']

def get_multi(row, model_multi):
    question = row.get('postText')[0]
    context = ' '.join(row.get('targetParagraphs'))
    current_context = context
    results = []
    print(f"Question: {question}")
    print(f"Context: {context[:500]}...")  # Print only the first 500 characters for readability
    for _ in range(5):
        result = model_multi.predict(question, current_context)
        print(f"get_multi result: {result}")  # Debugging statement
        current_result = result['answer'] if 'answer' in result else 'No spoiler found'
        print(f"Current Result: {current_result}")
        if not current_result or current_result == 'No spoiler found':
            break
        results.append(current_result)
        current_context = re.sub(re.escape(current_result), '', current_context, count=1)
    return results if results else ['No spoiler found']

def predict(inputs, model_phrase, model_passage, model_multi):
    results = []
    for row in tqdm(inputs):
        if row.get('spoilerType') == 'phrase':
            answer = get_phrase(row, model_phrase)
        elif row.get('spoilerType') == 'passage':
            answer = get_passage(row, model_passage)
        elif row.get('spoilerType') == 'multi':
            answer = get_multi(row, model_multi)
        else:
            print("Unknown spoiler type")
            answer = ["No spoiler found"]

        results.append({'id': row['id'], 'spoiler': answer})
    return results


def run_baseline(input_file, output_file, model_phrase, model_passage, model_multi):
    with open(input_file, 'r') as inp, open(output_file, 'w') as out:
        inputs = [json.loads(line) for line in inp]
        predictions = predict(inputs, model_phrase, model_passage, model_multi)
        for prediction in predictions:
            out.write(json.dumps(prediction) + '\n')

if __name__ == '__main__':
    # Load the data
    train_data = pd.read_json('/content/drive/MyDrive/task-1/data/train.jsonl', lines=True)
    val_data = pd.read_json('/content/drive/MyDrive/task-1/data/val.jsonl', lines=True)
    test_data = pd.read_json('/content/drive/MyDrive/task-1/data/test.jsonl', lines=True)

    # Add spoilerType from task 1 predictions to test dataset
    task1_predictions = pd.read_csv('/content/drive/MyDrive/task-1/data/prediction_task1.csv')
    test_data = test_data.merge(task1_predictions[['id', 'spoilerType']], on='id', how='left')

    # Save the merged test data to a new file
    test_data.to_json('/content/drive/MyDrive/task-1/data/test-merged.jsonl', lines=True, orient='records')

    model_phrase = QaModel("deepset/roberta-base-squad2")
    model_passage = QaModel("deepset/roberta-base-squad2")
    model_multi = QaModel("deepset/roberta-base-squad2", num_answers=5)

    input_file = '/content/drive/MyDrive/task-1/data/test-merged.jsonl'
    output_file = '/content/drive/MyDrive/task-2/data/prediction_task2.jsonl'

    run_baseline(input_file, output_file, model_phrase=model_phrase, model_passage=model_passage, model_multi=model_multi)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


  0%|          | 0/400 [00:00<?, ?it/s]

get_phrase result: {'score': 0.23776091635227203, 'start': 813, 'end': 821, 'answer': 'balloons'}
Question: Why you SHOULD be selfish at work
Context: We're always being encouraged to help others before we help ourselves. And in the workplace putting your needs before those of your colleagues is often seen as selfish behaviour. But new research says being selfless at work can backfire. Giving at the expense of your own well-being damages your chance of long-term success. In an article for The Harvard Business Review, Wharton School of the University of Pennsylvania professor, Dr Adam Grant and Wharton People Analytics researcher Reb Rebele exp...
get_multi result: [{'score': 0.03794436529278755, 'start': 3002, 'end': 3043, 'answer': 'selfless people suffer most in workplaces'}, {'score': 0.024626515805721283, 'start': 92, 'end': 142, 'answer': 'putting your needs before those of your colleagues'}, {'score': 0.01864660531282425, 'start': 3002, 'end': 3024, 'answer': 'selfless people suf

In [41]:
import json
import pandas as pd
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Ensure that the validation data contains the correct 'spoilerType' field
val_data = pd.read_json('/content/drive/MyDrive/task-1/data/val.jsonl', lines=True)

# If 'spoilerType' is not in the validation data, add it manually (example values used here)
if 'spoilerType' not in val_data.columns:
    # Adding an example 'spoilerType' column for demonstration purposes
    val_data['spoilerType'] = 'phrase'  # Assigning 'phrase' to all rows as an example
    val_data.to_json('/content/drive/MyDrive/task-1/data/val-merged.jsonl', lines=True, orient='records')
else:
    val_data.to_json('/content/drive/MyDrive/task-1/data/val-merged.jsonl', lines=True, orient='records')

# Run the model on the validation set
input_file = '/content/drive/MyDrive/task-1/data/val-merged.jsonl'
output_file = '/content/drive/MyDrive/task-2/data/prediction_task2_val.jsonl'
run_baseline(input_file, output_file, model_phrase=model_phrase, model_passage=model_passage, model_multi=model_multi)



  0%|          | 0/400 [00:00<?, ?it/s]

get_phrase result: {'score': 0.005847423803061247, 'start': 739, 'end': 747, 'answer': 'too dark'}
get_phrase result: {'score': 0.22044825553894043, 'start': 198, 'end': 211, 'answer': 'intentionally'}
get_phrase result: {'score': 0.03511201962828636, 'start': 2176, 'end': 2196, 'answer': 'slightly smaller tip'}
get_phrase result: {'score': 0.04750742390751839, 'start': 8980, 'end': 8994, 'answer': 'Michael Gambon'}
get_phrase result: {'score': 0.0018642231589183211, 'start': 423, 'end': 431, 'answer': 'John Doe'}
get_phrase result: {'score': 0.36991578340530396, 'start': 2203, 'end': 2209, 'answer': 'Sprite'}
get_phrase result: {'score': 0.3737172782421112, 'start': 42, 'end': 50, 'answer': 'midnight'}
get_phrase result: {'score': 0.0011132574873045087, 'start': 2427, 'end': 2499, 'answer': 'she had been teaching at Hogwarts for "Thirty-nine years this December".'}
get_phrase result: {'score': 0.028765099123120308, 'start': 713, 'end': 728, 'answer': 'No, there’s not'}
get_phrase resu

In [44]:
# Evaluate the predictions
def evaluate_predictions(true_file, pred_file):
    with open(true_file, 'r') as true_f, open(pred_file, 'r') as pred_f:
        true_data = [json.loads(line) for line in true_f]
        pred_data = [json.loads(line) for line in pred_f]

    # Extract the first spoiler from the list if it exists, otherwise use an empty string
    true_labels = [entry['spoiler'][0] if entry['spoiler'] else '' for entry in true_data]
    pred_labels = [entry['spoiler'][0] if entry['spoiler'] else '' for entry in pred_data]

    # Print overall precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted',zero_division=0.0)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Evaluate the model results for Task 2 using F1-score
evaluate_predictions('/content/drive/MyDrive/task-1/data/val-merged.jsonl', '/content/drive/MyDrive/task-2/data/prediction_task2_val.jsonl')

Precision: 0.2250, Recall: 0.2250, F1-score: 0.2250


In [43]:
import json
import pandas as pd
import csv

# Convert JSONL to CSV
def jsonl_to_csv(jsonl_file, csv_file):
    with open(jsonl_file, 'r') as infile:
        data = [json.loads(line) for line in infile]

    df = pd.DataFrame(data)
    df['spoiler'] = df['spoiler'].apply(lambda x: ' '.join(x) if x else '')

    # Save to CSV with proper quoting
    df.to_csv(csv_file, index=False, quoting=csv.QUOTE_ALL)

# Convert the output JSONL file to CSV
jsonl_file = '/content/drive/MyDrive/task-2/data/prediction_task2.jsonl'
csv_file = '/content/drive/MyDrive/task-2/data/prediction_task2.csv'
jsonl_to_csv(jsonl_file, csv_file)