In [None]:
import random
import json

# Pool of player names substituted into the {player} placeholder.
players = [
    "Harry Kane",
    "Mason Mount",
    "João Félix",
    "Kylian Mbappé",
    "Jude Bellingham",
    "Declan Rice",
    "Erling Haaland",
    "Vinícius Júnior",
    "Kevin De Bruyne",
    "Riyad Mahrez",
]

# Pool of club names used for both the {to_club} and {from_club} placeholders.
clubs = [
    "Manchester United",
    "Chelsea",
    "Barcelona",
    "PSG",
    "Borussia Dortmund",
    "Real Madrid",
    "Liverpool",
    "Tottenham",
    "Bayern Munich",
    "Atletico Madrid",
]

# Pool of fee strings substituted into the {fee} placeholder.
transfer_fees = [
    "£60 million",
    "€100 million",
    "€45 million",
    "£105 million",
    "€103 million",
    "£50 million",
    "€75 million",
    "£85 million",
    "€120 million",
    "£95 million",
]

# Pool of deal kinds substituted into the {transfer_type} placeholder.
transfer_types = [
    "loan",
    "permanent transfer",
    "free transfer",
]

# Each entry pairs a sentence template with the entity labels it contains.
# The labels are listed in the order their placeholders appear in the
# template, because the annotation step searches the sentence left to right.
templates = [
    (
        "{player} has signed for {to_club} from {from_club} for {fee}.",
        ["PLAYER_NAME", "TO_CLUB", "FROM_CLUB", "TRANSFER_FEE"],
    ),
    (
        "{to_club} complete the transfer of {player} from {from_club} at a fee of {fee}.",
        ["TO_CLUB", "PLAYER_NAME", "FROM_CLUB", "TRANSFER_FEE"],
    ),
    (
        "{player} moves to {to_club} on a {transfer_type} from {from_club}.",
        ["PLAYER_NAME", "TO_CLUB", "TRANSFER_TYPE", "FROM_CLUB"],
    ),
    (
        "{player} joins {to_club} from {from_club}.",
        ["PLAYER_NAME", "TO_CLUB", "FROM_CLUB"],
    ),
    (
        "{to_club} announce the signing of {player} on a {transfer_type}.",
        ["TO_CLUB", "PLAYER_NAME", "TRANSFER_TYPE"],
    ),
    (
        "{player} transfers to {to_club} coming from {from_club} with a fee of {fee}.",
        ["PLAYER_NAME", "TO_CLUB", "FROM_CLUB", "TRANSFER_FEE"],
    ),
    # NOTE(review): "loaned" is fixed in the text while {transfer_type} is
    # drawn at random, so this can yield e.g. "loaned ... on a permanent
    # transfer" — confirm whether that wording is acceptable for training.
    (
        "{player} has been loaned to {to_club} from {from_club} on a {transfer_type}.",
        ["PLAYER_NAME", "TO_CLUB", "FROM_CLUB", "TRANSFER_TYPE"],
    ),
    (
        "The transfer deal of {player} from {from_club} to {to_club} costs {fee}.",
        ["PLAYER_NAME", "FROM_CLUB", "TO_CLUB", "TRANSFER_FEE"],
    ),
]

def create_annotation(sentence, entity_text, label, start_search=0):
    """
    Build a labelled character span for entity_text within sentence.

    The lookup begins at index start_search, so repeated calls can walk
    through a sentence from left to right.

    Returns a pair: the (start, end, label) span with an exclusive end
    offset, and that end offset on its own for use as the next
    start_search.

    Raises ValueError when entity_text does not occur at or after
    start_search.
    """
    span_start = sentence.find(entity_text, start_search)
    # str.find signals "no match" with a negative index.
    if span_start < 0:
        raise ValueError(f"Entity '{entity_text}' not found in sentence.")
    span_end = span_start + len(entity_text)
    annotation = (span_start, span_end, label)
    return annotation, span_end

def generate_sentence_and_annotation():
    """
    Produce one random transfer sentence together with its entity spans.

    A template is drawn from `templates`, and a player, two distinct clubs,
    a fee and a transfer type are drawn from their pools. The template is
    filled in, then each label listed for the template is located in the
    finished sentence, scanning left to right.

    Returns a pair: the sentence string and a dict {"entities": [...]}
    whose list holds (start, end, label) tuples in template order.
    """
    sentence_template, label_order = random.choice(templates)

    # Draw order is kept stable so seeded runs stay reproducible.
    chosen_player = random.choice(players)
    destination = random.choice(clubs)
    # The selling club must differ from the buying club.
    origin_candidates = [name for name in clubs if name != destination]
    origin = random.choice(origin_candidates)
    chosen_fee = random.choice(transfer_fees)
    chosen_type = random.choice(transfer_types)

    sentence = sentence_template.format(
        player=chosen_player,
        to_club=destination,
        from_club=origin,
        fee=chosen_fee,
        transfer_type=chosen_type,
    )

    # Surface text that each entity label refers to in this sentence.
    text_for_label = {
        "PLAYER_NAME": chosen_player,
        "TO_CLUB": destination,
        "FROM_CLUB": origin,
        "TRANSFER_FEE": chosen_fee,
        "TRANSFER_TYPE": chosen_type,
    }

    spans = []
    cursor = 0
    for label in label_order:
        # Each search resumes where the previous entity ended.
        span, cursor = create_annotation(
            sentence, text_for_label[label], label, cursor
        )
        spans.append(span)

    return sentence, {"entities": spans}

# Generate up to TARGET_SAMPLES unique samples, giving up after MAX_ATTEMPTS
# draws so the loop always terminates even if duplicates dominate.
TARGET_SAMPLES = 500
MAX_ATTEMPTS = 2000

train_data_set = set()  # (sentence, entities) keys already accepted
train_data = []         # accepted samples, kept in generation order
attempts = 0
while len(train_data_set) < TARGET_SAMPLES and attempts < MAX_ATTEMPTS:
    sent, ann = generate_sentence_and_annotation()
    attempts += 1

    # Lists are unhashable, so the entity list is frozen into a tuple.
    key = (sent, tuple(ann["entities"]))
    if key in train_data_set:
        continue
    train_data_set.add(key)
    # Appending here (rather than iterating the set afterwards) keeps the
    # output order tied to generation order instead of hash order.
    train_data.append({"text": sent, "entities": ann["entities"]})

print(f"Generated {len(train_data_set)} unique samples after {attempts} attempts.")

# Save to JSON file; ensure_ascii=False keeps accented names and currency
# symbols readable instead of writing \uXXXX escapes.
with open("train_data.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=2, ensure_ascii=False)

# Report the real number of saved samples rather than a hard-coded count.
print(f"Saved {len(train_data)} generated sentences with annotations to 'train_data.json'")

Generated 500 unique samples after 511 attempts.
Saved 20 generated sentences with annotations to 'train_data.json'
