In [None]:
import datasets

# Explicit schema: every column is declared as a plain string and the schema
# is handed to `load_dataset` through its `features` argument.
features = datasets.Features({
  column: datasets.Value("string")
  for column in (
    "website",
    "source_url",
    "scraped_on",
    "title",
    "content",
    "published_date",
  )
})

# Load the "train" split of the raw dataset using the schema above.
ds = datasets.load_dataset(
  "eranmazur/raw-crossfit",
  split="train",
  features=features,
)

In [None]:
# Row count of `ds`, the dataset as loaded above.
ds.num_rows

In [None]:
# Load the term vocabulary: one term per line, trimmed and lower-cased.
# Blank lines are skipped on purpose: an empty string is a substring of every
# text, so keeping it would make every document count as a match.
with open("terms.txt", encoding="utf-8") as file:
  crossfit_terms = [
    term for term in (line.strip().lower() for line in file) if term
  ]

def recognize_crossfit_terms(text):
    """Return the vocabulary terms that occur in ``text``.

    Matching is a plain substring test against the lower-cased text, so a
    term also matches when it appears inside a longer word. Terms are
    compared as stored in ``crossfit_terms`` (assumed to be lower-case
    already -- confirm against wherever the list is built).

    Args:
        text: The document text to scan. ``None`` or an empty string yields
            an empty result instead of raising.

    Returns:
        A list of the distinct matching terms, in the order they appear in
        ``crossfit_terms``.
    """
    if not text:
        return []

    # Lower-case once instead of once per term.
    lowered = text.lower()

    # Empty terms are ignored: "" is contained in every string and would turn
    # every document into a match.
    matches = [term for term in crossfit_terms if term and term in lowered]

    # dict.fromkeys drops duplicate terms while keeping a deterministic order.
    return list(dict.fromkeys(matches))

In [None]:
# Keep only the examples whose content matches at least one known term.
ds_with_terms = ds.filter(
  lambda example: bool(recognize_crossfit_terms(example["content"]))
)

In [None]:
# Row count after keeping only the examples that matched at least one term.
ds_with_terms.num_rows

In [None]:
from transformers import LlamaTokenizerFast

# Tokenizer used here only to measure how much text `ds` contains.
tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")

# Sum, over every row of `ds`, the length of the encoded "content" field.
total_tokens = sum(
  len(tokenizer.encode(sample["content"])) for sample in ds
)

total_tokens

# Text Replacements

In [None]:
import re

# Inline Markdown link: group 1 is the bracketed label, group 2 the
# parenthesised target.
markdown_embedded_link_pattern = re.compile(r'\[([^\[\]]+)\]\(([^()]+)\)')

def replace_link_pattern(markdown_text):
    """Collapse every inline Markdown link ``[label](target)`` to its label.

    Text that does not match the link pattern is returned untouched.
    """
    return markdown_embedded_link_pattern.sub(r'\1', markdown_text)

# Anchor tag whose first attribute is a double-quoted href; group 2 is the
# text between the opening and closing tags (single line only, since `.`
# does not cross newlines here).
href_pattern = re.compile(r'<a\s+href="([^"]*)"\s*[^>]*>(.*?)<\/a>')

def replace_href_with_text(html_text):
    """Replace each matching ``<a href="...">text</a>`` with just ``text``."""
    return href_pattern.sub(lambda anchor: anchor.group(2), html_text)

# Matches from "Post" or "Compare" up to and including the first period,
# newline or comma, or up to the end of the string. The keywords are not
# anchored to word boundaries, so e.g. "Posterior" starts a match too.
removal_pattern = re.compile(r'(Post.*?(?:\.|\n|\,|$)|Compare.*?(?:\.|\n|\,|$))')

def remove_patterns(input_string):
    """Delete every span matched by ``removal_pattern`` from the input."""
    return removal_pattern.sub('', input_string)

def remove_lines_with_keywords(input_string):
    """Blank out every line that contains one of the boilerplate keywords.

    Lines are delimited by ``'\\n'``. A line containing any keyword
    (case-sensitive substring match) is replaced by an empty line; the line
    break itself is kept, so the number of lines does not change.
    """
    keywords = ['comments', 'Share', 'Compare', 'Post']

    def has_keyword(line):
        return any(keyword in line for keyword in keywords)

    kept_lines = [
        '' if has_keyword(line) else line
        for line in input_string.split('\n')
    ]
    return '\n'.join(kept_lines)

def remove_lines_with_more_than_7_words(input_string, max_words=7):
    """Drop every line that has more than ``max_words`` words.

    Lines are delimited by ``'\\n'`` and words are counted by splitting each
    line on whitespace. Unlike blanking, dropped lines are removed entirely,
    so the result can have fewer lines than the input.

    Args:
        input_string: The text to filter.
        max_words: Largest word count a line may have and still be kept.
            Defaults to 7, the threshold the function name refers to.

    Returns:
        The kept lines joined back together with ``'\\n'``.
    """
    return '\n'.join(
        line
        for line in input_string.split('\n')
        if len(line.split()) <= max_words
    )

In [None]:
def content_replacements(example):
  """Run the text-cleaning steps over ``example["content"]``, in order.

  The example is updated in place (its "content" entry is overwritten after
  each step) and the same object is returned.
  """
  # Order matters: links are unwrapped before keyword and length filters run.
  cleaning_steps = (
    replace_link_pattern,                 # [label](target) -> label
    replace_href_with_text,               # <a href="...">text</a> -> text
    remove_patterns,                      # "Post..." / "Compare..." spans
    remove_lines_with_keywords,           # lines mentioning boilerplate keywords
    remove_lines_with_more_than_7_words,  # lines with more than 7 words
  )

  for step in cleaning_steps:
    example["content"] = step(example["content"])

  # Trim leading and trailing whitespace left behind by the removals.
  example["content"] = example["content"].strip()

  return example

# Apply the cleaning pipeline to every example that passed the term filter.
# NOTE(review): this rebinds `ds`, which earlier cells read as the dataset
# returned by `load_dataset`; re-running those cells after this one will
# operate on the cleaned data instead.
ds = ds_with_terms.map(content_replacements)