### Filtering for relevant news articles

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import os

# Load the tokenizer and model from Hugging Face.
# NOTE(review): the checkpoint name suggests a multilingual NLI model - confirm on the model card.
model_name = "MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Wrap both in a zero-shot-classification pipeline. `classifier` is the
# module-level callable that classify_text() invokes.
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)


In [27]:
def classify_text(text, candidate_labels):
    """Score ``text`` against ``candidate_labels`` with the zero-shot pipeline.

    Args:
        text: The text to classify.
        candidate_labels: The labels to score the text against.

    Returns:
        dict: Every label reported by the pipeline mapped to its score.
    """
    # The pipeline is called in single-label mode (multi_label=False).
    prediction = classifier(text, candidate_labels, multi_label=False)
    # The pipeline reports labels and scores as two parallel lists; pair them up.
    return dict(zip(prediction['labels'], prediction['scores']))


def filter_files(file_paths, relevant_label="relevant football article including at least one national team",
                 threshold=0.5):
    """Return the paths whose content is classified as relevant.

    Each file is read as UTF-8 text and scored by ``classify_text`` against
    two labels: ``relevant_label`` and ``"irrelevant content"``. A file is
    kept when the score of ``relevant_label`` is strictly greater than
    ``threshold``.

    Args:
        file_paths: Iterable of paths to the text files to check.
        relevant_label: The label describing relevant content. It is used both
            as a candidate label and as the key looked up in the scores.
        threshold: Minimum (exclusive) score for a file to count as relevant.

    Returns:
        list: The subset of ``file_paths`` judged relevant, in input order.
        Files that are empty, cannot be read, or cannot be classified are
        reported on stdout and left out.
    """
    print("Filtering for relevant football arcticles...")
    relevant_files = []
    # Derive the candidates from `relevant_label`: with a hard-coded list, a
    # caller-supplied label would never appear in the scores and every file
    # would fail with a KeyError.
    candidate_labels = [relevant_label, "irrelevant content"]

    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            if not text.strip():
                # An empty file carries no content to classify.
                print(f"Skipping empty file {file_path!r}")
                continue

            results = classify_text(text, candidate_labels)
            print(results)
            if results[relevant_label] > threshold:
                relevant_files.append(file_path)
        except Exception as e:
            # Best effort: one unreadable or unclassifiable file must not abort
            # the whole run, but the report has to name the file and the cause.
            print(f"An error of type {type(e).__name__} occurred while filtering {file_path!r}: {e}")
    return relevant_files


In [28]:
def get_all_file_paths(directory):
    """Recursively collect the path of every file beneath ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        list: One path per file, each built by joining the containing
        directory reported by ``os.walk`` with the file name.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]


In [29]:
# Set the directory path to where your files are stored.
# This is a relative path, so it depends on the notebook's working directory.
directory_path = '../../data/raw'
all_file_paths = get_all_file_paths(directory_path)

# Filter the files, assuming you want to check all files in the directory.
# filter_files() prints one score dict per classified file, so this cell's
# output grows with the number of files.
filtered_files = filter_files(all_file_paths)
print("Filtered files:", filtered_files)


Filtering for relevant football arcticles...
{'relevant football article including at least one national team': 0.5949247479438782, 'irrelevant content': 0.4050752520561218}
{'relevant football article including at least one national team': 0.6402378082275391, 'irrelevant content': 0.3597622513771057}
{'relevant football article including at least one national team': 0.5474096536636353, 'irrelevant content': 0.45259034633636475}
{'relevant football article including at least one national team': 0.6190314292907715, 'irrelevant content': 0.3809686303138733}
{'relevant football article including at least one national team': 0.6223347187042236, 'irrelevant content': 0.37766531109809875}
{'relevant football article including at least one national team': 0.6178963780403137, 'irrelevant content': 0.38210368156433105}
{'relevant football article including at least one national team': 0.584761381149292, 'irrelevant content': 0.415238618850708}
{'relevant football article including at least one 

In [30]:
# Define the file path where you want to save the list
file_path = "../../data/filtered_file_paths.txt"

# Open the file in write mode. The encoding is pinned to UTF-8, matching how
# the articles are read, so paths containing non-ASCII characters are written
# the same way on every platform instead of depending on the locale default.
with open(file_path, "w", encoding="utf-8") as file:
    # Write one path per line
    file.writelines(f"{item}\n" for item in filtered_files)