In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import json
import re
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from tqdm.auto import tqdm


class QaModel:
    def __init__(self, model_name, num_answers=1):
        self.model_name = model_name
        self.num_answers = num_answers
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name)
        self.qa_pipeline = pipeline("question-answering", model=self.model, tokenizer=self.tokenizer, top_k=self.num_answers)

    def predict(self, question, context):
        return self.qa_pipeline(question=question, context=context)

def get_phrase(row, model_phrase):
    question = row.get('postText')[0]
    context = ' '.join(row.get('targetParagraphs'))
    result = model_phrase.predict(question, context)
    print(f"get_phrase result: {result}")  # Debugging statement
    answer = result['answer'] if 'answer' in result else ''
    return [answer]

def get_passage(row, model_passage):
    question = row.get('postText')[0]
    context = ' '.join(row.get('targetParagraphs'))
    result = model_passage.predict(question, context)
    print(f"get_passage result: {result}")  # Debugging statement
    answer = result['answer'] if 'answer' in result else ''
    candidates = [sentence.strip() for sentence in context.split('.') if answer in sentence]
    return [candidates[0] if candidates else '']

def get_multi(row, model_multi):
    question = row.get('postText')[0]
    context = ' '.join(row.get('targetParagraphs'))
    current_context = context
    results = []
    for _ in range(5):
        result = model_multi.predict(question, current_context)
        print(f"get_multi result: {result}")  # Debugging statement
        current_result = result['answer'] if 'answer' in result else ''
        if not current_result:
            break
        results.append(current_result)
        current_context = re.sub(re.escape(current_result), '', current_context, count=1)
    return results

def predict(inputs, model_phrase, model_passage, model_multi):
    results = []
    for row in tqdm(inputs):
        if row.get('spoilerType') == 'phrase':
            answer = get_phrase(row, model_phrase)
        elif row.get('spoilerType') == 'passage':
            answer = get_passage(row, model_passage)
        elif row.get('spoilerType') == 'multi':
            answer = get_multi(row, model_multi)
        else:
            print("Unknown spoiler type")
            answer = [""]

        results.append({'id': row['id'], 'spoiler': answer})
    return results

def run_baseline(input_file, output_file, model_phrase, model_passage, model_multi):
    with open(input_file, 'r') as inp, open(output_file, 'w') as out:
        inputs = [json.loads(line) for line in inp]
        predictions = predict(inputs, model_phrase, model_passage, model_multi)
        for prediction in predictions:
            out.write(json.dumps(prediction) + '\n')

if __name__ == '__main__':
    # Load the data
    train_data = pd.read_json('/content/drive/MyDrive/task-1/data/train.jsonl', lines=True)
    val_data = pd.read_json('/content/drive/MyDrive/task-1/data/val.jsonl', lines=True)
    test_data = pd.read_json('/content/drive/MyDrive/task-1/data/test.jsonl', lines=True)

    # Add spoilerType from task 1 predictions to validation and test datasets
    task1_predictions = pd.read_csv('/content/drive/MyDrive/task-1/data/prediction_task1.csv')
    val_data = val_data.merge(task1_predictions[['id', 'spoilerType']], on='id', how='left')
    test_data = test_data.merge(task1_predictions[['id', 'spoilerType']], on='id', how='left')

    # Save the merged test data to a new file
    test_data.to_json('/content/drive/MyDrive/task-1/data/test-merged.jsonl', lines=True, orient='records')

    model_phrase = QaModel("deepset/roberta-base-squad2")
    model_passage = QaModel("deepset/roberta-base-squad2")
    model_multi = QaModel("deepset/roberta-base-squad2", num_answers=5)

    input_file = '/content/drive/MyDrive/task-1/data/test-merged.jsonl'
    output_file = '/content/drive/MyDrive/task-2/data/prediction_task2.jsonl'

    run_baseline(input_file, output_file, model_phrase=model_phrase, model_passage=model_passage, model_multi=model_multi)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  0%|          | 0/400 [00:00<?, ?it/s]

get_phrase result: {'score': 0.237760528922081, 'start': 813, 'end': 821, 'answer': 'balloons'}
get_multi result: [{'score': 0.03794412687420845, 'start': 3002, 'end': 3043, 'answer': 'selfless people suffer most in workplaces'}, {'score': 0.024626437574625015, 'start': 92, 'end': 142, 'answer': 'putting your needs before those of your colleagues'}, {'score': 0.01864650286734104, 'start': 3002, 'end': 3024, 'answer': 'selfless people suffer'}, {'score': 0.017416542395949364, 'start': 283, 'end': 323, 'answer': 'damages your chance of long-term success'}, {'score': 0.017056172713637352, 'start': 3002, 'end': 3081, 'answer': 'selfless people suffer most in workplaces and find themselves drowning in admin'}]
get_passage result: {'score': 0.017889557406306267, 'start': 0, 'end': 65, 'answer': 'Meditating inside a beautiful stock-photo room filled with plants'}
get_phrase result: {'score': 0.273841917514801, 'start': 277, 'end': 285, 'answer': 'Braconid'}
get_passage result: {'score': 0.001

In [1]:


# Convert JSONL to CSV
def jsonl_to_csv(jsonl_file, csv_file):
    with open(jsonl_file, 'r') as infile:
        data = [json.loads(line) for line in infile]

    df = pd.DataFrame(data)
    df['spoiler'] = df['spoiler'].apply(lambda x: ' '.join(x) if x else '')

    # Save to CSV with proper quoting
    df.to_csv(csv_file, index=False, quoting=csv.QUOTE_ALL)

# Convert the output JSONL file to CSV
jsonl_file = '/content/drive/MyDrive/task-2/data/prediction_task2.jsonl'
csv_file = '/content/drive/MyDrive/task-2/data/prediction_task2.csv'
jsonl_to_csv(jsonl_file, csv_file)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/task-2/data/prediction_task2.jsonl'