In [None]:
!pip install --upgrade git+https://github.com/huggingface/transformers.git peft wandb bitsandbytes datasets python-dotenv

In [None]:
from dotenv import load_dotenv
from os import getcwd

# Load environment variables (e.g. WANDB_ENTITY, which the wandb.init cell reads)
# from a file named `env` in the current working directory.
# NOTE(review): python-dotenv's conventional file name is `.env`; confirm that
# `env` (without the leading dot) is the intended file.
load_dotenv(f'{getcwd()}/env')

In [None]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    TrainerCallback,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    ModernBertForSequenceClassification
)
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
import wandb
from datasets import Dataset
import math
from sklearn.metrics import mean_squared_error
import logging
import warnings
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import os
from torch.nn.utils.rnn import pad_sequence

# Set up logging: INFO-level records are written both to `training.log`
# (relative to the current working directory) and to a stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by the functions and classes defined below.
logger = logging.getLogger(__name__)



In [None]:
@dataclass
class ModelConfig:
    """Hyper-parameters shared by model creation, tokenisation and training.

    Read by ``create_model`` (``base_model``, ``num_labels``) and by
    ``FinancialPredictor`` (tokenizer id, ``max_length`` and the training
    arguments).
    """
    # Model id used for both the tokenizer and the base model weights.
    base_model: str = "answerdotai/modernbert-large"
    # Maximum token count per document; longer inputs are truncated by the dataset.
    max_length: int = 3300
    # Per-device batch size for both training and evaluation.
    batch_size: int = 3  # Optimized for L4 GPU
    # Number of regression outputs (the model is created with problem_type="regression").
    num_labels: int = 6
    # Intended optimizer learning rate.
    learning_rate: float = 1e-4
    weight_decay: float = 0.01
    num_epochs: int = 3
    # Passed to TrainingArguments as warmup_ratio.
    warmup_ratio: float = 0.1
    # Passed to TrainingArguments as gradient_accumulation_steps.
    gradient_accumulation_steps: int = 2

def setup_environment():
    """Check that CUDA is usable and enable throughput-oriented backend flags.

    Returns:
        The index of the current CUDA device, as reported by
        ``torch.cuda.current_device()``.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        raise RuntimeError("This script requires GPU acceleration")

    device = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(device)
    logger.info(f"GPU detected: {props.name} with {props.total_memory / 1024**3:.2f}GB VRAM")

    # Enable cuDNN autotuning, then allow TF32 for matmuls and cuDNN kernels.
    torch.backends.cudnn.benchmark = True
    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True

    return device



In [None]:
def collate_fn(batch):
    """Collate dataset samples into one padded batch.

    ``None`` entries are dropped. ``input_ids`` and ``attention_mask`` may be
    plain Python sequences (what ``DynamicTextDataset`` produces, because it
    tokenises with ``return_tensors=None``) or tensors; sequences are converted
    to ``torch.long`` tensors first because ``pad_sequence`` only accepts
    tensors. Shorter sequences are right-padded with 0.

    Args:
        batch: List of sample dicts with the keys ``input_ids``,
            ``attention_mask``, ``labels`` (tensor) and ``metadata``.

    Returns:
        Dict with padded ``input_ids`` / ``attention_mask`` (batch first),
        stacked ``labels`` and the list of per-sample ``metadata``.

    Raises:
        ValueError: If no sample remains after dropping ``None`` entries.
    """
    samples = [sample for sample in batch if sample is not None]

    if not samples:
        raise ValueError("Empty batch after filtering")

    def _as_long_tensor(sequence):
        # Tensors pass through untouched so callers that already tensorised
        # their samples see exactly the previous behaviour.
        if torch.is_tensor(sequence):
            return sequence
        return torch.tensor(sequence, dtype=torch.long)

    input_ids = pad_sequence(
        [_as_long_tensor(sample['input_ids']) for sample in samples],
        batch_first=True,
    )
    attention_mask = pad_sequence(
        [_as_long_tensor(sample['attention_mask']) for sample in samples],
        batch_first=True,
    )
    labels = torch.stack([sample['labels'] for sample in samples])
    metadata = [sample['metadata'] for sample in samples]

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'metadata': metadata,
    }



In [None]:
class DynamicTextDataset(Dataset):
    """Dataset that reads, tokenises and labels one snapshot file per row on demand.

    Each row of ``dataframe`` must provide ``snapshot_file`` (path to a UTF-8
    text file) and ``labels`` (values convertible to a float tensor).

    NOTE(review): in the import cell ``from datasets import Dataset`` comes
    after ``from torch.utils.data import Dataset``, so the base class here is
    the later import. ``column_names`` and ``remove_columns`` appear to be
    shims for that base class - confirm this is intended.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_length: int = 3500):
        """
        Args:
            dataframe: Source rows; indexed positionally via ``iloc``.
            tokenizer: Callable Hugging Face tokenizer.
            max_length: Token length every sample is truncated/padded to.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self._length = len(dataframe)

    @property
    def column_names(self):
        """Names of the model-facing fields produced for each sample."""
        return ['input_ids', 'attention_mask', 'labels']

    def __len__(self):
        return self._length

    def __getitem__(self, idx):
        """Return one sample (integer index) or a collated batch (list / slice).

        Raises:
            TypeError: For any other index type.
            IndexError: If an integer index is out of range.
        """
        if isinstance(idx, list):  # Handle batched indexing
            batch = [self._get_single_item(i) for i in idx]
            return self._collate_batch(batch)
        elif isinstance(idx, slice):  # Handle slice indexing
            start, stop, step = idx.indices(len(self))
            batch = [self._get_single_item(i) for i in range(start, stop, step)]
            return self._collate_batch(batch)
        elif isinstance(idx, (int, np.integer)):
            # NumPy integers (e.g. taken from an index array) are accepted too
            # and normalised to a plain int.
            return self._get_single_item(int(idx))
        else:
            raise TypeError(f"Invalid index type: {type(idx)}. Expected int, list, or slice.")

    def _get_single_item(self, idx):
        """Read, tokenise and label the row at position ``idx``.

        Negative indices count from the end. ``input_ids`` and
        ``attention_mask`` are plain Python lists of length ``max_length``;
        ``labels`` is a float tensor.
        """
        if idx < 0:
            idx += len(self)
        if idx >= self._length or idx < 0:
            raise IndexError(f"Index {idx} out of bounds for dataset of size {self._length}")

        row = self.dataframe.iloc[idx]
        text = self._read_file(row['snapshot_file'])

        # Calling the tokenizer directly is the documented replacement for the
        # deprecated `encode_plus`; for a single string the result is the same.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors=None
        )
        labels = torch.tensor(row['labels'], dtype=torch.float)

        return {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'labels': labels,
            'metadata': {
                'file_path': row['snapshot_file'],
                'index': idx,
            }
        }

    def _collate_batch(self, batch):
        """Collate a batch of individual samples into a single batch dictionary"""
        collated = {key: [] for key in batch[0]}
        for sample in batch:
            for key, value in sample.items():
                collated[key].append(value)

        # Convert lists to tensors where applicable; `metadata` stays a list of dicts.
        collated['input_ids'] = torch.tensor(collated['input_ids'], dtype=torch.long)
        collated['attention_mask'] = torch.tensor(collated['attention_mask'], dtype=torch.long)
        collated['labels'] = torch.stack(collated['labels'])

        return collated

    def _read_file(self, file_path):
        """Return the file's text, or a fixed placeholder string if it cannot be read.

        Read failures are logged rather than raised, so an unreadable file
        still yields a (placeholder) sample.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except IOError as e:
            logger.error(f"Error reading file {file_path}: {e}")
            return "[ERROR] File could not be read"

    def _parse_labels(self, labels):
        """Convert ``labels`` to a float tensor, falling back to scalar 0.0 on failure.

        Not called anywhere in this class; ``_get_single_item`` builds the
        label tensor directly.
        """
        try:
            return torch.tensor(labels, dtype=torch.float)
        except Exception as e:
            logger.error(f"Error parsing labels: {labels}. Exception: {e}")
            return torch.tensor(0.0, dtype=torch.float)

    # Add this method for HuggingFace Trainer compatibility
    def remove_columns(self, column_names):
        """No-op: returns ``self`` unchanged, ignoring ``column_names``."""
        return self


In [None]:
import numpy as np
from typing import Tuple

def split_data_by_labels(df):
    """Split a dataframe into three frames by how complete each row's labels are.

    The missing-value mask of every ``labels`` entry is computed once. Scalar
    entries (e.g. ``None`` or a bare NaN) are treated as a one-element vector,
    and an empty label vector counts as "no usable labels", so every row lands
    in exactly one of the three groups.

    Args:
        df: DataFrame with a ``labels`` column holding one label vector per row.

    Returns:
        Tuple ``(df_train_val, some_nans, all_nans)``:
        - ``df_train_val``: rows with at least one label and no missing value
        - ``some_nans``: rows mixing missing and present labels
        - ``all_nans``: rows whose labels are all missing (or empty)
    """
    # One flat boolean mask per row; reshape(-1) also turns a scalar into a 1-element array.
    masks = [np.asarray(pd.isna(labels)).reshape(-1) for labels in df['labels']]
    n_missing = np.array([int(mask.sum()) for mask in masks], dtype=int)
    n_total = np.array([int(mask.size) for mask in masks], dtype=int)

    complete = (n_missing == 0) & (n_total > 0)
    all_missing = n_missing == n_total  # also true for empty label vectors
    partial = ~complete & ~all_missing

    df_train_val = df[complete]
    some_nans = df[partial]
    all_nans = df[all_missing]

    return df_train_val, some_nans, all_nans

def rolling_window_split(
    df: pd.DataFrame,
    train_window_years: int = 5,
    validation_years: int = 1,
    min_train_years: int = 3,
    stride: int = 1
) -> List[Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Build chronological (train, validation) folds from the distinct years in ``df``.

    Windows are positioned on the sorted list of distinct ``year`` values: each
    fold trains on ``train_window_years`` consecutive entries of that list and
    validates on the ``validation_years`` entries that follow.

    Args:
        df: Input DataFrame with 'year' column
        train_window_years: Number of distinct years in each training window
        validation_years: Number of distinct years in each validation window
        min_train_years: A fold is kept only if its training frame contains at
            least this many distinct years
        stride: How many positions the window advances between folds

    Returns:
        List of (train_df, val_df) tuples; empty if no fold qualifies
    """
    ordered_years = sorted(df['year'].unique())
    span = train_window_years + validation_years
    folds = []

    for offset in range(0, len(ordered_years) - span + 1, stride):
        val_offset = offset + train_window_years

        # Inclusive year bounds of the two windows.
        first_train_year, last_train_year = ordered_years[offset], ordered_years[val_offset - 1]
        first_val_year, last_val_year = ordered_years[val_offset], ordered_years[offset + span - 1]

        in_train = (df['year'] >= first_train_year) & (df['year'] <= last_train_year)
        in_val = (df['year'] >= first_val_year) & (df['year'] <= last_val_year)
        train_df, val_df = df[in_train], df[in_val]

        # Skip folds whose training window is too short.
        if len(train_df['year'].unique()) < min_train_years:
            continue

        folds.append((train_df, val_df))
        logger.info(f"""
            Created split:
            Training: {first_train_year}-{last_train_year} ({len(train_df)} samples)
            Validation: {first_val_year}-{last_val_year} ({len(val_df)} samples)
            """)

    if folds:
        logger.info(f"Created {len(folds)} total splits")
    else:
        logger.warning("No valid splits were created with the given parameters")

    return folds


In [None]:
from torch import nn

def create_model(config: ModelConfig):
    """Build a bf16 ModernBERT regression model and wrap it with LoRA adapters.

    NOTE(review): although this was described as "QLoRA", ``bnb_config`` below
    is constructed but never passed to ``from_pretrained`` (there is no
    ``quantization_config`` argument), so the 4-bit settings have no effect on
    the loaded model. Confirm whether quantised loading was dropped on purpose.

    Args:
        config: Supplies ``base_model`` (checkpoint id) and ``num_labels``.

    Returns:
        The model returned by ``get_peft_model``.
    """
    # Currently unused - see the note in the docstring.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch.float32,  # Use float32 for compute
    #     bnb_4bit_use_double_quant=False  # Disable double quantization
    # )

    # Regression head with `num_labels` outputs, weights loaded in bfloat16.
    model = ModernBertForSequenceClassification.from_pretrained(
        config.base_model,
        num_labels=config.num_labels,
        problem_type="regression",
        torch_dtype=torch.bfloat16        
    )
    
    
    # LoRA adapters are attached to the modules named "Wqkv" and "Wo".
    lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["Wqkv", "Wo"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )

    model = get_peft_model(model, lora_config)
    return model

def compute_metrics(eval_pred):
    """Compute per-label and averaged MSE, RMSE and MAE for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``.

    Returns:
        Dict with ``mse_label<i>``, ``rmse_label<i>``, ``mae_label<i>``
        (``i`` starting at 1) plus ``avg_mse``, ``avg_rmse`` and ``avg_mae``.

    Raises:
        ValueError: If the predictions contain NaN.
    """
    predictions, labels = eval_pred
    if np.isnan(predictions).any():
        logger.error(f"NaNs found in predictions! {predictions}")
        raise ValueError("NaNs in predictions")

    # One array per metric family, each holding one value per label column.
    per_label = {"mse": mean_squared_error(labels, predictions, multioutput='raw_values')}
    per_label["rmse"] = np.sqrt(per_label["mse"])
    per_label["mae"] = np.mean(np.abs(predictions - labels), axis=0)

    metrics = {}
    for family, values in per_label.items():
        for position, value in enumerate(values, start=1):
            metrics[f"{family}_label{position}"] = value

    # Averages are appended after all per-label entries.
    for family, values in per_label.items():
        metrics[f"avg_{family}"] = np.mean(values)

    return metrics



In [None]:
import pandas as pd
import numpy as np
from enum import Enum
from typing import Dict, Optional

class RankCombinationMethod(Enum):
    """Ways of merging the three per-period ranks into one score."""
    AVERAGE = "average"
    MINIMUM = "minimum"
    WEIGHTED_AVERAGE = "weighted_average"

class FinancialRanking:
    """Ranks rows by their model predictions across three prediction periods."""

    def __init__(self, weights: Optional[Dict[str, float]] = None):
        """
        Store the period weights used by the weighted-average combination.

        Args:
            weights (Dict[str, float]): Weights for different prediction periods
                Default: {"next_quarter": 0.2, "next_six_months": 0.3, "next_year": 0.5}

        Raises:
            ValueError: If the weights do not sum to 1.
        """
        default_weights = {"next_quarter": 0.2, "next_six_months": 0.3, "next_year": 0.5}
        self.weights = weights or default_weights
        total_weight = sum(self.weights.values())
        if not np.isclose(total_weight, 1.0):
            raise ValueError("Weights must sum to 1")

    def _combine_ranks(self, row: pd.Series, method: RankCombinationMethod) -> float:
        """
        Reduce one row's three period ranks to a single score.

        Args:
            row (pd.Series): Row containing period ranks
            method (RankCombinationMethod): Method to combine ranks

        Returns:
            float: Combined rank score

        Raises:
            ValueError: If ``method`` is not a supported combination method.
        """
        quarter_rank = row['next_quarter_rank']
        half_year_rank = row['next_six_months_rank']
        year_rank = row['next_year_rank']

        if method == RankCombinationMethod.AVERAGE:
            return np.mean([quarter_rank, half_year_rank, year_rank])
        if method == RankCombinationMethod.MINIMUM:
            return np.min([quarter_rank, half_year_rank, year_rank])
        if method == RankCombinationMethod.WEIGHTED_AVERAGE:
            return (
                quarter_rank * self.weights['next_quarter'] +
                half_year_rank * self.weights['next_six_months'] +
                year_rank * self.weights['next_year']
            )
        raise ValueError(f"Unsupported rank combination method: {method}")

    def rank_stocks(self, df: pd.DataFrame, combination_method: RankCombinationMethod = RankCombinationMethod.WEIGHTED_AVERAGE) -> pd.DataFrame:
        """
        Rank rows by their predictions and return them ordered best-first.

        The prediction vector is averaged over positions ``[:2]``, ``[2:4]``
        and ``[4:]`` to obtain the three period values; each period is ranked
        separately (highest value gets rank 1) and the ranks are combined.

        Args:
            df (pd.DataFrame): DataFrame with predictions column containing arrays of predictions
            combination_method (RankCombinationMethod): Method to combine period ranks

        Returns:
            pd.DataFrame: Copy of ``df`` with the three ``*_rank`` columns,
            ``combined_score`` and ``rank`` added, sorted by ``combined_score``
        """
        ranked_input = df.copy()
        # Make sure every prediction entry is a numpy array so slicing works.
        ranked_input['predictions'] = ranked_input['predictions'].apply(
            lambda x: x if isinstance(x, np.ndarray) else np.array(x)
        )
        predictions = ranked_input['predictions']

        # Slice of the prediction vector that feeds each period.
        period_windows = {
            'next_quarter': slice(None, 2),
            'next_six_months': slice(2, 4),
            'next_year': slice(4, None),
        }

        period_columns = {'ticker': ranked_input.index}
        for period, window in period_windows.items():
            period_columns[period] = predictions.apply(lambda x, w=window: np.mean(x[w]))
        prediction_df = pd.DataFrame(period_columns)

        # Higher predictions get lower (better) ranks.
        rank_columns = []
        for period in period_windows:
            rank_column = f'{period}_rank'
            prediction_df[rank_column] = prediction_df[period].rank(ascending=False)
            rank_columns.append(rank_column)

        prediction_df['combined_score'] = prediction_df.apply(
            lambda row: self._combine_ranks(row, combination_method),
            axis=1
        )

        # Attach the rank columns to the copied input, aligned on the index.
        df_result = pd.concat(
            [ranked_input, prediction_df[rank_columns + ['combined_score']]],
            axis=1
        )

        df_result = df_result.sort_values('combined_score')
        df_result['rank'] = range(1, len(df_result) + 1)

        return df_result

In [None]:
from typing import Any
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

class ScaledLossTrainer(Trainer):
    """Trainer with a hand-written training step and a pass-through ``log``."""

    def training_step(self, model, inputs, optimizer=None):
        """Run forward and backward for one micro-batch and return the detached loss.

        Args:
            model: The model to train.
            inputs: Batch dict produced by the data collator; passed to the
                model as keyword arguments unchanged.
            optimizer: Unused. Kept so the third positional argument supplied
                by the calling training loop is accepted.

        Returns:
            The detached loss, already divided by the number of gradient
            accumulation steps.
        """
        # Evaluation switches the module to eval mode, so re-enter train mode
        # here; otherwise dropout stays disabled after the first evaluation.
        model.train()

        outputs = model(**inputs)
        loss = outputs.loss

        # Gradients of consecutive micro-batches are summed before the
        # optimizer step, so each micro-batch contributes 1/N of its loss.
        accumulation_steps = max(1, self.args.gradient_accumulation_steps)
        if accumulation_steps > 1:
            loss = loss / accumulation_steps

        loss.backward()

        return loss.detach()

    def log(self, logs: Dict[str, float], iterator: Optional[Any] = None) -> None:
        """Forward ``logs`` to ``Trainer.log`` unchanged.

        No rescaling is applied to the logged loss. The second argument is
        passed through positionally, exactly as received.
        """
        super().log(logs, iterator)

class LogMetricsCallback(TrainerCallback):
    """Callback that writes each evaluation's metrics to the notebook logger."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Log all metrics on one line, each formatted to 4 decimal places."""
        if metrics is None:
            logger.info("No metrics available at this evaluation step.")
            return

        formatted = [f"{k}: {v:.4f}, " for k, v in metrics.items()]
        message = "Evaluation Metrics: " + "".join(formatted)
        # Drop the last two characters (the trailing ", " separator).
        logger.info(message[:-2])

class FinancialPredictor:
    """Fine-tunes, loads and applies the LoRA-adapted regression model.

    Construction calls ``setup_environment`` (which raises without a CUDA
    device) and loads the tokenizer for ``config.base_model``.
    """

    def __init__(self, config: ModelConfig, model_path=None):
        """
        Args:
            config: Hyper-parameters for tokenisation, model creation and training.
            model_path: Optional directory of a previously saved model; when
                given it is loaded immediately via ``load_model``.
        """
        self.config = config
        self.device = setup_environment()
        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model)
        self.model = None
        if model_path:
            self.load_model(model_path)

    def load_model(self, model_path):
        """Load the model and its LoRA adapter from ``model_path`` onto the GPU.

        NOTE(review): both the base model and the adapter are read from the
        same directory, while this notebook only ever writes that directory
        with ``save_pretrained`` on the PEFT-wrapped model. Confirm that the
        saved layout satisfies both ``from_pretrained`` calls.

        Raises:
            Exception: Any loading error is logged and re-raised unchanged.
        """
        try:
            self.model = ModernBertForSequenceClassification.from_pretrained(
                model_path,
                num_labels=self.config.num_labels,
                problem_type="regression",
                torch_dtype=torch.bfloat16
            )
            # Load the LoRA adapter
            config = PeftConfig.from_pretrained(model_path)
            self.model = PeftModel.from_pretrained(self.model, model_path, config=config, torch_dtype=torch.bfloat16).to(self.device)
            logger.info(f"Successfully loaded model and adapter from {model_path}")
        except Exception as e:
            logger.error(f"Failed to load model from {model_path}: {e}")
            raise

    def prepare_data(self, df: pd.DataFrame):
        """Validate the schema, drop rows without file or year, and split by label completeness.

        Returns:
            The three frames produced by ``split_data_by_labels``
            (complete labels, partially missing labels, all labels missing).

        Raises:
            ValueError: If ``labels``, ``snapshot_file`` or ``year`` is missing.
        """
        required_columns = ['labels', 'snapshot_file', 'year']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        df = df.dropna(subset=['snapshot_file', 'year'])
        return split_data_by_labels(df)

    def train(self, df: pd.DataFrame):
        """Fine-tune one fresh model per rolling-window split.

        After each split the adapter is saved to ``./fine_tuned_model_split_<i>``
        and summary values are logged to wandb. ``self.model`` ends up holding
        the model of the last split.

        Args:
            df: Frame with ``labels``, ``snapshot_file``, ``year``, ``q`` and
                ``cik`` columns; only rows with complete labels are used.

        Returns:
            The trainer of the last split.

        Raises:
            ValueError: If no split can be built or validation labels contain NaN.
        """
        df_train_val, _, _ = self.prepare_data(df)
        # Split the sorted frame, so rows inside every fold are in
        # (year, q, cik) order rather than input order.
        df_sorted = df_train_val.sort_values(by=['year', 'q', 'cik'], ascending=[True, True, True])

        splits = rolling_window_split(
            df_sorted,
            train_window_years=5,  # Use 5 years for training
            validation_years=1,    # Validate on the next year
            min_train_years=3,     # Require at least 3 years of training data
            stride=1              # Move forward 1 year at a time
        )

        if not splits:
            raise ValueError("No valid train/validation splits created")

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=self.config.num_epochs,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            # Forward the configured rate; without it the library default is used.
            learning_rate=self.config.learning_rate,
            warmup_ratio=self.config.warmup_ratio,
            weight_decay=self.config.weight_decay,
            logging_dir='./logs',
            logging_steps=10,
            # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
            eval_strategy="steps",
            save_strategy="steps",
            save_steps=200,
            eval_steps=100,
            load_best_model_at_end=True,
            save_total_limit=3,
            report_to="wandb",
            metric_for_best_model="avg_rmse",
            greater_is_better=False,
            bf16=True,
            torch_compile=False,
        )

        for i, (train_df, val_df) in enumerate(splits):
            logger.info(f"Training on split {i+1}/{len(splits)}")
            logger.info(f"Train years: {sorted(train_df['year'].unique())}")
            logger.info(f"Validation years: {sorted(val_df['year'].unique())}")

            # Each cell of `labels` is a vector, so NaNs must be looked for
            # inside the vectors. Checked before the model is built.
            rows_with_nan = val_df['labels'].apply(lambda values: bool(np.any(pd.isna(values))))
            if rows_with_nan.any():
                logger.error(f"Validation DataFrame for split {i} contains NaNs in labels!")
                logger.error(val_df[rows_with_nan])
                raise ValueError("NaNs found in validation labels")

            self.model = create_model(self.config)

            train_dataset = DynamicTextDataset(train_df, self.tokenizer, self.config.max_length)
            val_dataset = DynamicTextDataset(val_df, self.tokenizer, self.config.max_length)

            trainer = ScaledLossTrainer(
                model=self.model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=compute_metrics,
                data_collator=collate_fn,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=3), LogMetricsCallback()]
            )

            trainer.train()

            output_dir = f"./fine_tuned_model_split_{i}"
            self.model.save_pretrained(output_dir)

            wandb.log({
                f"split_{i}_train_years": sorted(train_df['year'].unique()),
                f"split_{i}_val_years": sorted(val_df['year'].unique()),
                f"split_{i}_final_metrics": trainer.state.best_metric
            })

        return trainer

    def predict(self, df: pd.DataFrame, batch_size: int = 2) -> pd.DataFrame:
        """Predict on the latest snapshot of every CIK in ``df``.

        Args:
            df: Frame with ``cik``, ``year``, ``q``, ``snapshot_file`` and
                ``labels`` columns.
            batch_size: Number of documents per forward pass.

        Returns:
            The frame returned by ``get_latest_files`` with a ``predictions``
            column holding one float array of model outputs per row.

        Raises:
            RuntimeError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise RuntimeError("No model available: call train() or load_model() before predict()")

        df = get_latest_files(df)
        dataset = DynamicTextDataset(df, self.tokenizer, self.config.max_length)
        dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

        predictions = []
        self.model.eval()
        with torch.no_grad():
            for batch in dataloader:
                outputs = self.model(
                    input_ids=batch['input_ids'].to(self.device),
                    # NOTE(review): the mask is cast to bfloat16 here; confirm the model expects that dtype.
                    attention_mask=batch['attention_mask'].to(self.device, dtype=torch.bfloat16)
                )
                # Upcast to float32 before converting the logits to numpy.
                predictions.extend(outputs.logits.cpu().float().numpy())

        df['predictions'] = predictions

        return df

    def predict_and_rank(self, df: pd.DataFrame) -> pd.DataFrame:
        """Combine prediction and ranking in one step"""
        predictions_df = self.predict(df)
        ranker = FinancialRanking()
        ranked_df = ranker.rank_stocks(predictions_df)
        return ranked_df



In [None]:
def get_latest_files(df: pd.DataFrame) -> pd.DataFrame:
    """Return one row per CIK: the last one after sorting by ``cik``, ``year``, ``q``.

    The original index labels of the surviving rows are kept.
    """
    chronological = df.sort_values(['cik', 'year', 'q'])
    return chronological.drop_duplicates(subset='cik', keep='last')





In [None]:
# Authenticate with Weights & Biases; the run itself is created in the next cell.
wandb.login()

In [None]:
# Initialize wandb
# The entity is read from the WANDB_ENTITY environment variable (KeyError if unset).
# NOTE(review): the config values below repeat the ModelConfig defaults by hand;
# keep them in sync if ModelConfig changes.
wandb.init(
    project="modernbert-finetuning-lora-rolling-window", 
    entity=os.environ['WANDB_ENTITY'],
    settings=wandb.Settings(init_timeout=120),
    config={
        "model": "answerdotai/modernbert-large",
        "max_length": 3300,
        "batch_size": 3,
        "learning_rate": 1e-4
    }
)


In [None]:
# Load config (all ModelConfig defaults)
config = ModelConfig()
print("Configuration loaded successfully")

# Load data from `training_data.parquet` in the current working directory.
# FinancialPredictor.prepare_data later requires the columns
# 'labels', 'snapshot_file' and 'year'.
print("Loading data from parquet file...")
df = pd.read_parquet('training_data.parquet')
print(f"Loaded dataset with {len(df)} rows")



In [None]:
# Initialize predictor, reusing the most recent saved model when one exists.
# FinancialPredictor.train saves each split to "./fine_tuned_model_split_<i>";
# the model-specific prefix is also accepted in case directories with that
# naming exist.
split_dir_prefixes = (
    "fine_tuned_model_split_",
    f"fine_tuned_{config.base_model.split('/')[1]}_split_",
)
list_of_files = os.listdir("./")
fine_tuned_dirs = [
    name for name in list_of_files
    if name.startswith(split_dir_prefixes)
    and name.rsplit("_", 1)[-1].isdigit()  # ignore names without a numeric split index
    and os.path.isdir(name)
]
if fine_tuned_dirs:
    # Highest split index wins.
    model_path = sorted(fine_tuned_dirs, key=lambda name: int(name.rsplit("_", 1)[-1]))[-1]
    print(f"Loading latest saved model from: {model_path}")
elif os.path.isdir("./final_model"):
    model_path = "./final_model"
    print(f"Using default model path: {model_path}")
else:
    # Fresh run: nothing to load, the model is created by predictor.train().
    model_path = None
    print("No saved model found; the predictor starts without a model until train() is called")

predictor = FinancialPredictor(config, model_path=model_path)

# Split data
print("Splitting data based on label availability...")
df_train_val, df_some_null, df_all_null = predictor.prepare_data(df)



In [None]:
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt
import seaborn as sns

# Flatten the per-row label vectors into one list of floats.
all_labels = [item for sublist in df_train_val['labels'] for item in sublist]

plt.figure(figsize=(10, 6))
sns.histplot(all_labels, kde=True)
plt.title('Distribution of Labels')
plt.xlabel('Label Value')
plt.ylabel('Frequency')
plt.show()

print(f"Label Statistics:")
print(f"Mean: {np.mean(all_labels)}")
print(f"Median: {np.median(all_labels)}")
print(f"Standard Deviation: {np.std(all_labels)}")
print(f"Min: {np.min(all_labels)}")
print(f"Max: {np.max(all_labels)}")
print(f"Number of zero labels: {sum(1 for value in all_labels if value == 0)}")

# Check for positive infinity. np.isinf would also count -inf, which is
# reported separately below, so np.isposinf is used here.
inf_labels_count = np.isposinf(all_labels).sum()
print(f"Number of positive infinity labels: {inf_labels_count}")

# Check for negative infinity
neg_inf_labels_count = np.isneginf(all_labels).sum()
print(f"Number of negative infinity labels: {neg_inf_labels_count}")
 


In [None]:
def get_text_length(file_path):
    """Return the number of characters in the text file at ``file_path``.

    The file is decoded as UTF-8, the same encoding
    ``DynamicTextDataset._read_file`` uses, so the reported length does not
    depend on the platform's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return len(f.read())
    
# Character count of every snapshot file (reads each file once).
# NOTE(review): df_train_val comes from boolean indexing inside
# split_data_by_labels, so this column assignment may trigger pandas'
# SettingWithCopyWarning - confirm, or take an explicit .copy() first.
df_train_val['text_length'] = df_train_val['snapshot_file'].apply(get_text_length)

plt.figure(figsize=(10, 6))
sns.histplot(df_train_val['text_length'], kde=True)
plt.title('Distribution of Text Lengths')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.show()

print(df_train_val['text_length'].describe())


In [None]:
# Index labels of rows whose snapshot file is empty.
# NOTE: this cell relies on the 'text_length' column added in the previous cell
# and removes it at the end, so it cannot be re-run on its own.
zero_length_indices = df_train_val[df_train_val['text_length'] == 0].index

# Remove rows with zero-length files in place
df_train_val.drop(zero_length_indices, inplace=True)

# Reset the index if needed
df_train_val.reset_index(drop=True, inplace=True)

print(f"Number of zero-length files removed: {len(zero_length_indices)}")

# It's good practice to remove the temporary 'text_length' column
df_train_val.drop(columns=['text_length'], inplace=True)


In [None]:
# Display the rows whose labels are all missing (third frame returned by prepare_data).
df_all_null

In [None]:
# Display the partially-labelled rows in which every label from position 2 onwards is missing.
df_some_null[df_some_null['labels'].apply(lambda labels: pd.isna(labels[2:]).all())]


In [None]:
# Log data split information
# NOTE: train_val_count reflects df_train_val after the zero-length rows were
# dropped above, while total_samples is the size of the unfiltered input frame.
wandb.log({
    "train_val_count": len(df_train_val),
    "some_null_count": len(df_some_null),
    "all_null_count": len(df_all_null),
    "total_samples": len(df)
})

print(f"""
Data split summary:
- Training/Validation set: {len(df_train_val)} samples
- Partial labels set: {len(df_some_null)} samples
- No labels set: {len(df_all_null)} samples
""")



In [None]:
# Train model
# NOTE(review): suppress_errors presumably makes TorchDynamo fall back instead
# of raising on compile failures; TrainingArguments in FinancialPredictor.train
# sets torch_compile=False, so confirm this setting is still needed.
import torch._dynamo
torch._dynamo.config.suppress_errors = True

print("Starting model training...")
# `trainer` is the trainer of the last rolling-window split; it is read again
# in the final cell when the results are logged.
trainer = predictor.train(df_train_val)
print("Model training completed")


In [None]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True
min_year = 2024
combined_df = pd.concat([df_some_null, df_all_null], ignore_index=True)

# Filter for recent years: keep rows from `min_year` onwards whose labels from
# position 2 onwards are all missing. The year column has to be compared with
# `min_year` explicitly; combining the raw year integers with the boolean mask
# via `&` is a bitwise AND and does not produce a row filter.
is_recent = combined_df['year'] >= min_year
lacks_later_labels = combined_df['labels'].apply(
    lambda labels: pd.isna(labels[2:]).all()
).astype(bool)
recent_df = combined_df[is_recent & lacks_later_labels]
recent_df



In [None]:
# Get most recent snapshot for each company.
# get_latest_files keeps the whole last row per CIK. groupby('cik').last()
# instead returns the last non-null value of each column, so it can combine
# values from different snapshots into one row.
latest_snapshots = get_latest_files(recent_df).reset_index(drop=True)
predictions_df = predictor.predict(latest_snapshots)

predictions_df

In [None]:
# Rank predictions
logger.info("Ranking stocks based on predictions...")
# Default weights and the default combination method (weighted average) are used.
ranker = FinancialRanking()
ranked_df = ranker.rank_stocks(predictions_df)
ranked_df

In [None]:
import sec_cik_mapper

# Lookup tables from CIK to ticker symbols and to company name.
stock_mapper = sec_cik_mapper.StockMapper()
cik_to_tickers = stock_mapper.cik_to_tickers
cik_to_name = stock_mapper.cik_to_company_name

In [None]:
# Attach tickers and company names; CIKs without a mapping entry yield None (dict.get).
# NOTE(review): the lookup only succeeds if the values in ranked_df['cik'] have
# the same type and format as the mapper's keys - confirm.
ranked_df['tickers'] = ranked_df['cik'].apply(cik_to_tickers.get)
ranked_df['name'] = ranked_df['cik'].apply(cik_to_name.get)
# Show every row of the ranked frame below.
pd.set_option('display.max_rows', None)
ranked_df

In [None]:

# Save results
output_file = 'ranked_predictions_base.csv'
ranked_df.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")

# Log final results.
# `trainer` only exists when the training cell was run in this kernel session;
# when a saved model was loaded instead, the performance entry is logged as None
# rather than failing with a NameError.
final_trainer = globals().get('trainer')
wandb.log({
    "top_10_stocks": wandb.Table(dataframe=ranked_df.head(10)),
    "final_model_performance": final_trainer.state.best_metric if final_trainer else None
})

# Save model artifacts
final_model_path = "./final_model"
predictor.model.save_pretrained(final_model_path)
predictor.tokenizer.save_pretrained(final_model_path)
wandb.finish()
logger.info("Weights & Biases logging completed")
