In [None]:
!pip install transformers datasets accelerate bitsandbytes peft torch pandas scikit-learn huggingface_hub

In [None]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from huggingface_hub import login
from peft import get_peft_model, LoraConfig, TaskType
import torch
from google.colab import files
import io

In [None]:
#mount Google Drive, then load the two match CSV files stored on it
from google.colab import drive

drive.mount('/content/drive')

history_path = "/content/drive/My Drive/history.csv"
future_path = "/content/drive/My Drive/upcoming.csv"

#rows that pandas cannot parse are reported as warnings instead of aborting the load
history_df = pd.read_csv(
    history_path,
    encoding="utf-8",
    on_bad_lines='warn',
)
future_df = pd.read_csv(
    future_path,
    encoding="utf-8",
    on_bad_lines='warn',
)

#preview the first rows of each frame
print("History Data:", history_df.head(), sep="\n")
print("\nFuture Matches Data:", future_df.head(), sep="\n")


In [None]:
#process historical match data
def process_history_data(row):
    """Turn one historical match row into a labelled training example.

    Args:
        row: Mapping-like match record (a pandas Series produced by
            ``DataFrame.iterrows()`` or a plain dict). ``home_name``,
            ``away_name``, ``o_pre_1``, ``o_pre_0`` and ``o_pre_2`` are read
            unconditionally. ``ft_score`` (a ``"home-away"`` string) and
            ``result`` are optional sources for the label.

    Returns:
        dict: ``{"input": <text>, "label": <int>}``. The label is 1 for a
        home win, 0 for a draw, 2 for an away win, and -1 when no outcome is
        available or the outcome could not be parsed.
    """
    def has_value(key):
        #a field counts only when it exists AND holds a non-missing value;
        #an empty CSV cell is read as NaN and must not block the fallback
        return key in row and not pd.isna(row[key])

    try:
        if has_value('ft_score'):
            scores = row['ft_score'].split('-')
            home_score = int(scores[0].strip())
            away_score = int(scores[1].strip())

            #determine result: 1 for home win, 0 for draw, 2 for away win
            if home_score > away_score:
                result = 1  #home win
            elif home_score == away_score:
                result = 0  #draw
            else:
                result = 2  #away win
        elif has_value('result'):
            #if result is directly available
            result = int(row['result'])
        else:
            #default to placeholder if no result info is available
            result = -1
            print(f"Warning: No result information for match {row['home_name']} vs {row['away_name']}")
    except Exception as e:
        #best effort: a malformed score must not abort processing of the whole file
        print(f"Error processing match {row['home_name']} vs {row['away_name']}: {e}")
        result = -1

    input_text = (
        f"{row['home_name']} vs {row['away_name']} - "
        f"(Odds: 1={row['o_pre_1']}, 0={row['o_pre_0']}, 2={row['o_pre_2']})"
    )
    return {"input": input_text, "label": result}


#one labelled example per row of the historical frame, in row order
history_dataset = list(
    map(process_history_data, (match for _, match in history_df.iterrows()))
)
print(f"Processed {len(history_dataset)} historical matches")

In [None]:
#process future match data
def process_future_data(row):
    """Describe an upcoming match as model input text.

    Every non-missing column whose name starts with ``o_`` is treated as an
    odds value. The description lists, in this order and only when complete:
    the 1/0/2 match odds, the GG yes/no odds, the double-chance odds, and any
    ``o_total_*`` goal odds.

    Returns a dict with the text under "input" and -1 under "label", since
    the result of a future match is unknown.
    """
    #keep only the odds columns that actually hold a value
    available = {
        name: row[name]
        for name in row.index
        if name.startswith('o_') and not pd.isna(row[name])
    }

    def has_all(*names):
        #True when every requested odds column is available
        return all(name in available for name in names)

    segments = []

    #basic match odds
    if has_all('o_ft_1', 'o_ft_0', 'o_ft_2'):
        segments.append(
            f"1={available['o_ft_1']}, 0={available['o_ft_0']}, 2={available['o_ft_2']}"
        )

    #GG odds
    if has_all('o_kg_v', 'o_kg_y'):
        segments.append(f"GG Yes={available['o_kg_v']}, GG No={available['o_kg_y']}")

    #double chance odds
    if has_all('o_double_1_0', 'o_double_1_2', 'o_double_0_2'):
        segments.append(
            f"DC 1-0={available['o_double_1_0']}, "
            f"1-2={available['o_double_1_2']}, "
            f"0-2={available['o_double_0_2']}"
        )

    #total goals odds, labelled by the column name without its prefix
    goal_items = [
        f"{name.replace('o_total_', '')}={value}"
        for name, value in available.items()
        if name.startswith('o_total_')
    ]
    if goal_items:
        segments.append("Goals: " + ", ".join(goal_items))

    odds_text = '; '.join(segments)
    description = f"{row['home_name']} vs {row['away_name']} - (Odds: {odds_text})"

    #-1 marks the result as unknown
    return {"input": description, "label": -1}


#one unlabelled example per row of the upcoming-matches frame, in row order
future_dataset = list(
    map(process_future_data, (match for _, match in future_df.iterrows()))
)
print(f"Processed {len(future_dataset)} future matches")

In [None]:
#create datasets
#hold out 20% of the historical examples for validation (fixed seed for a repeatable split)
train_data, val_data = train_test_split(
    history_dataset,
    test_size=0.2,
    random_state=42,
)

dataset = DatasetDict()
dataset["train"] = Dataset.from_list(train_data)
dataset["validation"] = Dataset.from_list(val_data)
#future matches stay in their own split
dataset["future"] = Dataset.from_list(future_dataset)

print(
    f"Train set: {len(dataset['train'])} examples",
    f"Validation set: {len(dataset['validation'])} examples",
    f"Future set: {len(dataset['future'])} examples",
    sep="\n",
)

In [None]:
#hugging Face login and model setup
import os

from huggingface_hub import login

#sign in hugging face
#the access token is read from the HF_TOKEN environment variable so that no
#credential is ever stored in the notebook; when the variable is unset or
#empty, login() receives None and falls back to its interactive prompt
login(token=os.environ.get("HF_TOKEN") or None)

model_name = "meta-llama/Meta-Llama-3-8B"

print(f"Using model: {model_name}")

In [None]:
#setup model and tokenizer
def setup_model_and_tokenizer(model_name):
    """Load `model_name` 4-bit quantized, attach a LoRA adapter, load its tokenizer.

    The LoRA target modules are chosen from the model name: one list for names
    containing "llama", one for names containing "phi", and a
    query_key_value/dense list for everything else.

    Returns:
        tuple: (PEFT-wrapped model, tokenizer). The tokenizer's pad token is
        set to its EOS token when it has none.
    """
    print(f"Loading model: {model_name}...")

    #NF4 4-bit weights with double quantization; float16 compute for Colab compatibility
    quant_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_settings,
        device_map="auto",
    )

    #module names differ per architecture; the first family found in the name wins
    lowered_name = model_name.lower()
    family_targets = (
        ("llama", ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
        ("phi", ["q_proj", "v_proj", "k_proj", "o_proj", "fc1", "fc2"]),
    )
    fallback_targets = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
    adapter_targets = next(
        (modules for family, modules in family_targets if family in lowered_name),
        fallback_targets,
    )

    adapter_settings = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,               #LoRA rank
        lora_alpha=32,     #LoRA alpha
        lora_dropout=0.1,  #dropout rate
        target_modules=adapter_targets,
    )

    #wrap the quantized model with the LoRA adapter
    peft_model = get_peft_model(base_model, adapter_settings)

    #reuse EOS as the padding token when the tokenizer defines none
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    if loaded_tokenizer.pad_token is None:
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

    print("Model and tokenizer loaded successfully!")
    return peft_model, loaded_tokenizer

#build the LoRA-wrapped model and its tokenizer for the configured model name
model, tokenizer = setup_model_and_tokenizer(model_name=model_name)

In [None]:
#preprocess data for the model
def preprocess_data(examples, tokenizer):
    """Tokenize a batch of match descriptions for causal-LM fine-tuning.

    Each description is wrapped in the instruction prompt. A causal LM is
    trained to predict the tokens of the text itself, so a known outcome
    (label 0, 1 or 2) is appended to the text as ``"\\nAnswer: <label>"``;
    a single class id per example cannot be consumed by the language-modelling
    loss. Rows whose label is -1 (unknown result) keep the bare prompt, so the
    same function also produces inference prompts.

    Args:
        examples: Batch mapping with an "input" list of strings and,
            optionally, a "label" list of ints (-1 = unknown result).
        tokenizer: Callable tokenizer that returns "input_ids" and
            "attention_mask" lists when called with ``return_tensors=None``.

    Returns:
        The tokenizer output. When "label" is present a "labels" entry is
        added: a per-token copy of ``input_ids`` in which padding positions
        are replaced by -100 so the loss ignores them.
    """
    instruction = (
        "Analyze this football match and predict the best betting option "
        "(1 for home win, 0 for draw, 2 for away win): "
    )

    descriptions = examples["input"]
    has_labels = "label" in examples
    outcomes = examples["label"] if has_labels else [-1] * len(descriptions)

    #format inputs with instruction prompt, plus the answer when it is known
    formatted_inputs = []
    for description, outcome in zip(descriptions, outcomes):
        text = f"{instruction}{description}"
        if outcome != -1:
            text += f"\nAnswer: {int(outcome)}"
        formatted_inputs.append(text)

    #tokenize inputs
    model_inputs = tokenizer(
        formatted_inputs,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors=None  #plain lists, as expected by datasets.map
    )

    #token-level labels: identical to the input ids except on padding,
    #which is detected through the attention mask (pad may share an id with EOS)
    if has_labels:
        model_inputs["labels"] = [
            [token if attended else -100 for token, attended in zip(ids, mask)]
            for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
        ]

    return model_inputs

#process datasets
print("Processing datasets...")

def _tokenize_batch(examples):
    #bind the notebook-level tokenizer so datasets.map can call with one argument
    return preprocess_data(examples, tokenizer)

processed_dataset = {}
for split in ["train", "validation"]:
    split_data = dataset[split]
    #drop the raw text columns so only the tokenized fields remain
    processed_dataset[split] = split_data.map(
        _tokenize_batch,
        batched=True,
        remove_columns=split_data.column_names,
    )
print("Datasets processed successfully!")

In [None]:
#training setup and execution
def train_model(processed_dataset, model, tokenizer):
    """Fine-tune `model` on the tokenized train split and save the result.

    Args:
        processed_dataset: Mapping with tokenized "train" and "validation"
            datasets.
        model: The model to train (here the LoRA-wrapped causal LM).
        tokenizer: Tokenizer used by the data collator and saved with the model.

    Returns:
        The `Trainer` after training; the final model is saved to
        ``./betting-model-final``.
    """
    import inspect

    from transformers import DataCollatorForLanguageModeling

    #print some information about the processed datasets
    print(f"Training dataset size: {len(processed_dataset['train'])}")
    print(f"Validation dataset size: {len(processed_dataset['validation'])}")

    #check if datasets have the right format
    example_input = processed_dataset["train"][0]
    print(f"Example training input keys: {example_input.keys()}")

    #training arguments - adjusted for Colab
    training_kwargs = dict(
        output_dir="./betting-model",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        per_device_eval_batch_size=1,
        num_train_epochs=2,
        save_strategy="epoch",
        save_total_limit=1,
        push_to_hub=False,
        logging_dir="./logs",
        logging_steps=10,
        fp16=True,
        optim="adamw_torch",
        remove_unused_columns=False,  #retain all columns
        report_to="none",  #disable wandb and other reporting
        disable_tqdm=False  #show progress bars
    )
    #newer transformers releases renamed `evaluation_strategy` to
    #`eval_strategy` and later dropped the old name; the notebook installs an
    #unpinned version, so use whichever keyword this release accepts
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_keyword = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"
    training_kwargs[eval_keyword] = "epoch"
    training_args = TrainingArguments(**training_kwargs)

    #causal-LM collator (mlm=False): it builds the training labels from the
    #input ids, ignoring padding positions
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # We're not doing masked language modeling
    )

    #same story for Trainer: `tokenizer` was renamed to `processing_class`
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_keyword = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_dataset["train"],
        eval_dataset=processed_dataset["validation"],
        data_collator=data_collator,
        **{tokenizer_keyword: tokenizer}
    )

    #train model
    print("Starting training...")
    trainer.train()

    #save model
    print("Saving model...")
    trainer.save_model("./betting-model-final")
    print("Model saved to ./betting-model-final")

    return trainer

#run the fine-tuning and keep the trainer for later inspection
trainer = train_model(
    processed_dataset=processed_dataset,
    model=model,
    tokenizer=tokenizer,
)