### Future Team Prediction
Predict the future team of a driver based on past team transitions and transfer trends.


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

class DriverTeamPredictor:
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.model = None
        self.valid_teams = None
        self.feature_columns = [
            'driver_encoded', 'team_encoded', 'driver_experience',
            'driver_points_avg', 'driver_positions_avg', 'stability_score',
            'team_changes', 'recent_performance'
        ]

    def prepare_features(self, df):
        """Preprocess and engineer features with full validation"""
        try:
            # Validate input structure
            required_cols = ['driverName', 'constructorName', 'season', 
                           'driverRacePoints', 'driverFinalRank']
            missing = list(set(required_cols) - set(df.columns))
            if missing:
                raise ValueError(f"Missing columns: {missing}")

            df = df.copy()
            
            # Handle missing values
            df = df.fillna({
                'driverRacePoints': 0,
                'driverFinalRank': df['driverFinalRank'].median()
            })

            # Convert season to integer
            df['season'] = pd.to_numeric(df['season'], errors='coerce')
            df = df[df['season'].notna()].astype({'season': int})

            # Feature engineering
            df['driver_experience'] = df.groupby('driverName')['season'].rank(method='dense')
            
            # Team change tracking
            df['previous_team'] = df.sort_values('season').groupby('driverName')['constructorName'].shift(1)
            df['team_changes'] = (df['constructorName'] != df['previous_team']).astype(int)
            df['team_changes'] = df.groupby('driverName')['team_changes'].cumsum()

            # Team stability calculation
            team_stats = df.groupby('constructorName').agg(
                total_drivers=('driverName', 'nunique'),
                total_seasons=('season', 'count')
            ).reset_index()
            team_stats['stability_score'] = team_stats['total_seasons'] / team_stats['total_drivers']
            df = df.merge(team_stats[['constructorName', 'stability_score']], 
                         on='constructorName', how='left')

            # Performance metrics
            df['driver_points_avg'] = df.groupby('driverName')['driverRacePoints'].transform('mean')
            df['driver_positions_avg'] = df.groupby('driverName')['driverFinalRank'].transform('mean')
            df['recent_performance'] = df.groupby('driverName')['driverRacePoints'].transform(
                lambda x: x.rolling(3, min_periods=1).mean()
            )

            return df.fillna(0)
        except Exception as e:
            raise RuntimeError(f"Feature preparation failed: {str(e)}")

    def create_consistent_dataset(self, df):
        """Create dataset with consistent team encoding"""
        try:
            # Create next_team target
            df = df.sort_values(['driverName', 'season'])
            df['next_team'] = df.groupby('driverName')['constructorName'].shift(-1)
            df = df.dropna(subset=['next_team'])
            
            # Find teams that appear in both current and next positions
            current_teams = set(df['constructorName'])
            future_teams = set(df['next_team'])
            valid_teams = list(current_teams.intersection(future_teams))
            
            # Filter dataset
            df = df[df['constructorName'].isin(valid_teams) & df['next_team'].isin(valid_teams)]
            
            # Ensure minimum samples per class
            team_counts = df['next_team'].value_counts()
            self.valid_teams = team_counts[team_counts >= 2].index.tolist()
            df = df[df['next_team'].isin(self.valid_teams)]
            
            if len(self.valid_teams) < 2:
                raise ValueError("Insufficient teams for classification (need at least 2)")
                
            return df
        except Exception as e:
            raise RuntimeError(f"Dataset creation failed: {str(e)}")

    def train(self, df):
        """Train model with guaranteed label consistency"""
        try:
            print("Preparing data...")
            df = self.prepare_features(df)
            df = self.create_consistent_dataset(df)
            
            # Fit encoder on final valid teams
            self.label_encoder.fit(self.valid_teams)
            
            # Encode features and target
            df['team_encoded'] = self.label_encoder.transform(df['constructorName'])
            df['driver_encoded'] = LabelEncoder().fit_transform(df['driverName'])
            y = self.label_encoder.transform(df['next_team'])
            X = df[self.feature_columns]
            
            # Split data with class balance validation
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, 
                test_size=0.2, 
                random_state=42,
                stratify=y
            )
            
            # Configure and train model
            self.model = xgb.XGBClassifier(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=5,
                objective='multi:softprob',
                use_label_encoder=False,
                eval_metric='mlogloss',
                early_stopping_rounds=10
            )
            
            self.model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=False
            )
            
            # Evaluation
            y_pred = self.model.predict(X_test)
            print(f"\nModel Accuracy: {accuracy_score(y_test, y_pred):.2f}")
            print("Classification Report:")
            print(classification_report(y_test, y_pred, 
                                      target_names=self.label_encoder.classes_))
            
            return self
        except Exception as e:
            raise RuntimeError(f"Training failed: {str(e)}")

    def predict_future_team(self, driver_data):
        """Predict next team with strict encoding rules"""
        try:
            if not self.model:
                raise RuntimeError("Model not trained - call train() first")
                
            # Prepare features
            features = self.prepare_features(driver_data)
            current_team = features['constructorName'].iloc[-1]
            
            # Handle unknown teams
            if current_team not in self.valid_teams:
                raise ValueError(f"Team '{current_team}' not in trained teams list")
            
            # Encode features
            features['team_encoded'] = self.label_encoder.transform(features['constructorName'])
            features['driver_encoded'] = LabelEncoder().fit_transform(features['driverName'])
            
            # Predict using most recent entry
            X_pred = features[self.feature_columns].iloc[[-1]]
            pred_encoded = self.model.predict(X_pred)
            
            return self.label_encoder.inverse_transform(pred_encoded)[0]
        except Exception as e:
            raise RuntimeError(f"Prediction failed: {str(e)}")

def main():
    """Example execution with error handling"""
    try:
        # Load dataset
        data = pd.read_csv('../../../data/processed/fully_integrated_data.csv')
        
        # Initialize and train
        predictor = DriverTeamPredictor()
        predictor.train(data)
        
        # Test prediction
        test_data = pd.DataFrame([
            {'driverName': 'Max Verstappen', 'constructorName': 'Red Bull', 
             'season': 2021, 'driverRacePoints': 395, 'driverFinalRank': 1},
            {'driverName': 'Max Verstappen', 'constructorName': 'Red Bull', 
             'season': 2022, 'driverRacePoints': 454, 'driverFinalRank': 1},
            {'driverName': 'Max Verstappen', 'constructorName': 'Red Bull', 
             'season': 2023, 'driverRacePoints': 575, 'driverFinalRank': 1}
        ])
        
        predicted_team = predictor.predict_future_team(test_data)
        print(f"\n2024 Prediction for {test_data['driverName'].iloc[0]}:")
        print(f"Current Team: {test_data['constructorName'].iloc[-1]}")
        print(f"Predicted Next Team: {predicted_team}")
        
    except Exception as e:
        print(f"Application Error: {str(e)}")

if __name__ == "__main__":
    main()

Preparing data...
Application Error: Training failed: y contains previously unseen labels: np.str_('Cooper-Borgward')
