In [None]:
!pip install pandas numpy scikit-learn mlflow ydata-profiling openpyxl PyGithub xgboost psutil fastapi uvicorn streamlit plotly alibi-detect

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from ydata_profiling import ProfileReport
from github import Github
import base64
import warnings
import pickle
import joblib
warnings.filterwarnings('ignore')
import io
import os

# 1. GitHubManager

This class is responsible for managing the interaction between the script and a GitHub repository. It handles uploading and updating files such as models, datasets, or reports to the repository.

## Purpose
To automate the process of saving data artifacts, reports, and machine learning models to GitHub for version control and collaboration.

## Key Methods

### `__init__(self, token=None, repo_name="Group8MLUL2/Group8_CT1")`
- Initializes the connection to the GitHub repository.
- Takes a personal access token (`token`) and repository name (`repo_name`) as inputs.
- Uses the GitHub API (via the `PyGithub` library) to authenticate and fetch the specified repository.

### `create_or_update_file(self, path, content, message)`
- Creates or updates a file in the repository.

#### Steps:
1. Serializes the content if it is a model (`Pipeline` object) or a pandas DataFrame (e.g., to pickle or Parquet format).
2. Hands the resulting bytes to PyGithub, which Base64-encodes them before sending (required by GitHub's API).
3. Checks if the file exists in the repository.  
   - If it does: Updates the file with the new content.
   - If it doesn’t: Creates a new file in the specified path.

#### Handles:
- ML models (as pickle files).
- Processed data (e.g., Parquet format).
- Text-based reports (e.g., HTML or plain text).


In [None]:
class GitHubManager:
    """
    Thin wrapper around a PyGithub repository handle used to commit artifacts
    (models, datasets, reports) to a GitHub repository.
    """
    def __init__(self, token=None, repo_name="Group8MLUL2/Group8_CT1"):
        """
        Connect to the repository.

        Parameters:
        - token (str): GitHub Personal Access Token. When omitted, an
          unauthenticated client is created.
        - repo_name (str): Repository in 'owner/repo' form.
        """
        self.github = Github(token) if token else Github()
        self.repo = self.github.get_repo(repo_name)
        print(f"Connected to repository: {self.repo.html_url}")

    @staticmethod
    def _serialize(content):
        """Convert ``content`` into the payload handed to the GitHub API."""
        # Cheap, unambiguous types first so plain text/bytes never touch the
        # model-detection logic below.
        if isinstance(content, (bytes, bytearray)):
            return bytes(content)
        if isinstance(content, str):
            return content.encode()
        if isinstance(content, pd.DataFrame):
            # Write parquet into memory instead of a temporary file
            buffer = io.BytesIO()
            content.to_parquet(buffer)
            return buffer.getvalue()
        if isinstance(content, Pipeline) or hasattr(content, 'predict'):
            # Anything that looks like a fitted estimator is pickled.
            # NOTE: pickles must only be loaded from trusted sources.
            return pickle.dumps(content)
        # Unknown types are passed through unchanged, as before
        return content

    def create_or_update_file(self, path, content, message):
        """
        Creates or updates a file in the GitHub repository.

        Parameters:
        - path (str): Destination path of the file inside the repository.
        - content: Object to store. Models (anything with ``predict``) are
          pickled, DataFrames are written as parquet, strings are encoded,
          bytes are sent as-is.
        - message (str): Commit message.

        Raises:
        - Any exception raised while serializing or talking to GitHub is
          printed and re-raised.
        """
        try:
            content_bytes = self._serialize(content)

            # Only the existence probe decides between "update" and "create".
            # A failing update must not be silently retried as a create.
            try:
                existing = self.repo.get_contents(path)
            except Exception as exc:
                # PyGithub errors expose the HTTP status; anything other than
                # "not found" (e.g. 401/403) is a real failure.
                status = getattr(exc, "status", None)
                if status not in (None, 404):
                    raise
                existing = None

            if existing is not None:
                self.repo.update_file(path, message, content_bytes, existing.sha)
                print(f"Updated {path}")
            else:
                # Previously referenced an undefined name (content_base64)
                self.repo.create_file(path, message, content_bytes)
                print(f"Created {path}")

        except Exception as e:
            print(f"Error with {path}: {str(e)}")
            raise

# 2. DataProcessor: Data Preparation for Machine Learning

This class prepares the dataset for machine learning by cleaning the raw transactions, transforming them, and aggregating them into meaningful customer-level features.

## Key Methods

### `load_data()`
- Loads the raw dataset from a GitHub-hosted Excel file using `pd.read_excel()`.
- Prints the shape of the dataset upon successful loading.

---

### `preprocess_data(df)`
Cleans the raw dataset by:
1. Removing rows with missing values (`dropna()`).
2. Filtering out canceled orders (invoices starting with "C") and invalid data (e.g., `Quantity <= 0` or `UnitPrice <= 0`).
3. Converting the `InvoiceDate` column to a datetime object.
4. Extracting features like `Hour`, `Day`, `Month`, and `Year` from `InvoiceDate`.
5. Calculating `TotalAmount` as `Quantity * UnitPrice`.

**Output**: A cleaned and preprocessed dataset ready for feature engineering.

---

### `create_features(df)`
Aggregates customer-level features from the transactional data using `groupby()`:

- Purchase frequency (`num_purchases`).
- Quantity statistics (total, mean, std).
- Transaction amount statistics (total, mean, std).
- Average and standard deviation of unit prices.
- The first occurrence of the customer's country.

**Output**: A DataFrame of aggregated features for machine learning.

---

### `split_data(features, random_state=42)`
Splits the feature dataset into three sets:
- **Train**: 60% of the data for training models.
- **Test**: 20% of the data for model evaluation.
- **Production**: 20% of the data reserved for deployment scenarios.

Uses `train_test_split()` from `sklearn` to ensure reproducibility with `random_state`.


In [None]:
class DataProcessor:
    """Stateless helpers that load, clean, aggregate and split the retail data."""

    # Default location of the raw Online Retail workbook
    DATA_URL = "https://raw.githubusercontent.com/Group8MLUL2/Group8_CT1/main/Online%20Retail.xlsx"

    @staticmethod
    def load_data(url=None):
        """
        Load data from the repository.

        Parameters:
        - url (str, optional): Location of the Excel file. Defaults to
          ``DataProcessor.DATA_URL``.

        Returns:
        - pandas.DataFrame with the raw rows.
        """
        df = pd.read_excel(url or DataProcessor.DATA_URL)
        # df.info() prints and returns None, so it must not be interpolated
        print(f"Data loaded successfully. Shape: {df.shape}")
        df.info()
        # Preview only; the full frame is too large to be useful as output
        print(df.head())
        return df

    @staticmethod
    def preprocess_data(df):
        """
        Preprocess the dataset.

        Drops rows with missing values, derives date parts and
        ``TotalAmount``, then removes cancelled invoices (``InvoiceNo``
        starting with 'C') and rows with non-positive quantity or price.
        The input frame is not modified.

        Returns:
        - pandas.DataFrame with the cleaned rows and the added columns
          ``Hour``, ``Day``, ``Month``, ``Year`` and ``TotalAmount``.
        """
        print("Preprocessing data...")
        print(f"Initial shape: {df.shape}")

        # Remove rows with missing values; copy so the column assignments
        # below operate on an independent frame
        df = df.dropna().copy()
        print(f"Shape after removing missing values: {df.shape}")

        # Convert InvoiceDate to datetime
        df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

        # Extract features from datetime
        invoice_dt = df['InvoiceDate'].dt
        df['Hour'] = invoice_dt.hour
        df['Day'] = invoice_dt.day
        df['Month'] = invoice_dt.month
        df['Year'] = invoice_dt.year

        # Calculate total amount
        df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

        # Filter out cancelled orders and invalid quantities/prices
        is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
        is_valid = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)
        df = df[~is_cancelled & is_valid]
        print(f"Final shape: {df.shape}")

        return df

    @staticmethod
    def create_features(df):
        """
        Create features for ML.

        Aggregates the cleaned transactions per ``CustomerID``.

        Returns:
        - pandas.DataFrame with one row per customer and the columns
          CustomerID, num_purchases, total_quantity, avg_quantity,
          std_quantity, total_amount, avg_transaction, std_transaction,
          avg_unit_price, std_unit_price, country.
        """
        print("Creating features...")
        # Named aggregation ties every output column to its source explicitly
        # instead of renaming a flattened MultiIndex by position.
        features = df.groupby('CustomerID').agg(
            # counts invoice lines (rows), not distinct invoices
            num_purchases=('InvoiceNo', 'count'),
            total_quantity=('Quantity', 'sum'),
            avg_quantity=('Quantity', 'mean'),
            std_quantity=('Quantity', 'std'),
            total_amount=('TotalAmount', 'sum'),
            avg_transaction=('TotalAmount', 'mean'),
            std_transaction=('TotalAmount', 'std'),
            avg_unit_price=('UnitPrice', 'mean'),
            std_unit_price=('UnitPrice', 'std'),
            country=('Country', 'first'),
        ).reset_index()

        print(f"Features created. Shape: {features.shape}")
        return features

    @staticmethod
    def split_data(features, random_state=42):
        """
        Split data into train, test, and production sets.

        60% of the rows go to train; the remainder is halved into test and
        production. Both splits use the same ``random_state``.

        Returns:
        - tuple (train, test, prod) of DataFrames.
        """
        train, remaining = train_test_split(features, train_size=0.6, random_state=random_state)
        test, prod = train_test_split(remaining, test_size=0.5, random_state=random_state)
        return train, test, prod

# 3. MLPipeline

This class manages the machine learning pipeline, including preprocessing, model training, experiment tracking, and evaluation.

## Purpose
To streamline the process of training and evaluating multiple machine learning models, while logging metrics and artifacts in MLflow.

## Key Methods

### `__init__()`
- Sets up MLflow for tracking experiments with a predefined `experiment_name`.
- Points to an MLflow tracking server (default URI: `http://localhost:5000`).

---

### `create_preprocessing_pipeline()`
Builds a `ColumnTransformer` pipeline to preprocess:

#### Numerical Features:
- Imputed with the median of the column.
- Scaled using `StandardScaler()` for normalization.

#### Categorical Features:
- Imputed with a constant value (`'missing'`).
- One-hot encoded (dropping the first category to avoid collinearity).

**Returns**: A preprocessing pipeline ready to be plugged into a full ML pipeline.

---

### `run_experiment(model, model_name, X_train, y_train, X_test, y_test, params=None)`
Runs a single machine learning experiment:
1. Combines the preprocessing pipeline with a given model.
2. Tracks parameters (`params`) and metrics (e.g., R², RMSE) in MLflow.
3. Performs K-fold cross-validation to evaluate the model.
4. Logs the trained model to MLflow (using appropriate format for `xgboost` or `sklearn`).

---

### `run_experiments(X_train, y_train, X_test, y_test)`
Runs multiple experiments with different regression models:
- **Linear Regression**
- **Decision Tree Regressor**
- **Random Forest Regressor**
- **Support Vector Regressor**
- **XGBoost Regressor**

Tracks the performance of each model and identifies the best-performing model based on test R².


In [None]:
class MLPipeline:
    """Trains several regressors behind a shared preprocessing step and tracks them in MLflow."""

    def __init__(self, tracking_uri="http://localhost:5000",
                 experiment_name="retail_sales_prediction"):
        """
        Configure MLflow.

        Parameters:
        - tracking_uri (str): Address of the MLflow tracking server.
        - experiment_name (str): Experiment under which all runs are recorded.
        """
        mlflow.set_tracking_uri(tracking_uri)
        self.experiment_name = experiment_name
        mlflow.set_experiment(self.experiment_name)

    def create_preprocessing_pipeline(self):
        """
        Create preprocessing pipeline for features.

        Returns:
        - ColumnTransformer that median-imputes and scales the numeric
          columns and imputes/one-hot encodes ``country``.
        """
        numeric_features = ['num_purchases', 'total_quantity', 'avg_quantity',
                          'std_quantity', 'avg_transaction', 'std_transaction',
                          'avg_unit_price', 'std_unit_price']
        categorical_features = ['country']

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            # handle_unknown='ignore' lets categories unseen during fit pass
            # through at predict time instead of raising
            ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        return preprocessor

    def run_experiment(self, model, model_name, X_train, y_train, X_test, y_test, params=None):
        """
        Run a single experiment with MLflow tracking.

        Must be called inside an active MLflow run.

        Parameters:
        - model: Unfitted scikit-learn compatible regressor.
        - model_name (str): Label used for the logged parameters and artifact.
        - X_train, y_train, X_test, y_test: Training and evaluation data.
        - params (dict, optional): Hyper-parameters applied to ``model`` via
          ``set_params`` and logged to MLflow.

        Returns:
        - tuple (fitted Pipeline, dict with cv_r2_mean, test_r2, test_rmse).
        """
        # Apply the hyper-parameters before anything is cloned or fitted so
        # that what MLflow records is what was actually trained.
        if params:
            model.set_params(**params)
            mlflow.log_params(params)

        full_pipeline = Pipeline([
            ('preprocessor', self.create_preprocessing_pipeline()),
            ('regressor', model)
        ])

        mlflow.log_params({
            'model_type': model_name,
            'training_samples': X_train.shape[0],
            'features': X_train.shape[1]
        })

        # K-fold cross-validation
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(full_pipeline, X_train, y_train,
                                  cv=kf, scoring='r2')
        cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

        # Train on the full training set and score the held-out test set
        full_pipeline.fit(X_train, y_train)
        y_pred = full_pipeline.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        mlflow.log_metrics({
            'cv_r2_mean': cv_mean,
            'cv_r2_std': cv_std,
            'test_mse': mse,
            'test_rmse': rmse,
            'test_r2': r2,
            'test_mae': mae
        })

        # Always log the complete pipeline. Logging only the bare regressor
        # (as was done for XGBoost) yields an artifact that expects already
        # preprocessed input and cannot score raw feature frames.
        mlflow.sklearn.log_model(full_pipeline, model_name)

        print(f"\nResults for {model_name}:")
        print(f"Cross-validation R2: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
        print(f"Test R2: {r2:.4f}")
        print(f"Test RMSE: {rmse:.4f}")
        print(f"Test MAE: {mae:.4f}")

        return full_pipeline, {
            'cv_r2_mean': cv_mean,
            'test_r2': r2,
            'test_rmse': rmse
        }

    def run_experiments(self, X_train, y_train, X_test, y_test):
        """
        Run multiple experiments with different models and configurations.

        Each model is trained in its own MLflow run; the model with the
        highest test R2 is logged once more in a separate summary run.

        Returns:
        - tuple (results, best_model, model_uri) where ``results`` maps the
          model name to its metrics, ``best_model`` is the fitted pipeline
          and ``model_uri`` is the artifact URI of the summary run.
        """
        experiments = [
            {
                'name': 'LinearRegression',
                'model': LinearRegression(),
                'params': {}
            },
            {
                'name': 'DecisionTree',
                'model': DecisionTreeRegressor(random_state=42),
                'params': {
                    'max_depth': 10,
                    'min_samples_split': 5
                }
            },
            {
                'name': 'RandomForest',
                'model': RandomForestRegressor(random_state=42),
                'params': {
                    'n_estimators': 100,
                    'max_depth': 15,
                    'min_samples_split': 5
                }
            },
            {
                'name': 'SVR',
                'model': SVR(),
                'params': {
                    'kernel': 'rbf',
                    'C': 1.0,
                    'epsilon': 0.1
                }
            },
            {
                'name': 'XGBoost',
                'model': xgb.XGBRegressor(random_state=42),
                'params': {
                    'n_estimators': 100,
                    'max_depth': 6,
                    'learning_rate': 0.1
                }
            }
        ]

        results = {}
        best_model = None
        best_model_name = None
        best_score = -float('inf')
        best_run_id = None

        for experiment in experiments:
            print(f"\nRunning experiment: {experiment['name']}")
            with mlflow.start_run() as run:
                model, metrics = self.run_experiment(
                    experiment['model'],
                    experiment['name'],
                    X_train, y_train,
                    X_test, y_test,
                    experiment['params']
                )

                results[experiment['name']] = metrics

                # Track best model.
                # NOTE(review): selection uses the test split, so the reported
                # test R2 of the winner is optimistic.
                if metrics['test_r2'] > best_score:
                    best_score = metrics['test_r2']
                    best_model = model
                    best_model_name = experiment['name']
                    best_run_id = run.info.run_id

        # Save best model in a separate run
        with mlflow.start_run():
            mlflow.log_param('best_model_type', best_model_name)
            mlflow.log_param('best_run_id', best_run_id)
            mlflow.log_metrics({
                'best_model_r2': best_score
            })
            mlflow.sklearn.log_model(best_model, 'best_model')
            model_uri = mlflow.get_artifact_uri('best_model')

        return results, best_model, model_uri

# Workflow Overview

## 1. Initialize GitHubManager
- Authenticate with GitHub using a personal access token and connect to the specified repository.

---

## 2. Data Processing
- **Load the raw dataset.**
- **Preprocess and clean the data.**
- **Create customer-level features.**
- **Split the data into train, test, and production sets.**

---

## 3. Generate Profile Report
- Create a profile report for the dataset using `ydata_profiling`.
- Upload the profile report to GitHub.


In [None]:
 # Data Processing Pipeline
# Connect to GitHub first: every upload below needs this handle, and it was
# never created anywhere in the notebook. The token is read from the
# environment so that no credential is stored in the notebook itself.
github_manager = GitHubManager(token=os.environ.get("GITHUB_TOKEN"))

print("\n=== Data Processing Pipeline ===")
processor = DataProcessor()

# Load and process data
df = processor.load_data()
processed_df = processor.preprocess_data(df)
features_df = processor.create_features(processed_df)

# Split data
train_data, test_data, prod_data = processor.split_data(features_df)

# Generate profile report
print("\nGenerating profile report...")
profile = ProfileReport(processed_df, title="Online Retail Dataset Profile")

# Save files back to the repository
print("\nSaving files to repository...")
github_manager.create_or_update_file(
    "processed/train_data.parquet",
    train_data,
    "Update training data"
)
github_manager.create_or_update_file(
    "processed/test_data.parquet",
    test_data,
    "Update test data"
)
github_manager.create_or_update_file(
    "processed/prod_data.parquet",
    prod_data,
    "Update production data"
)
github_manager.create_or_update_file(
    "reports/profile_report.html",
    profile.to_html(),
    "Update profile report"
)

# Model Training and Saving Workflow

## 1. Train Models
- **Initialize the `MLPipeline`.**
- **Train and evaluate multiple regression models.**
- **Log metrics and artifacts to MLflow.**

---

## 2. Save Best Model
- **Identify the best model** based on test R².
- Save the model in multiple formats (e.g., **Pickle**, **Joblib**).
- **Upload the best model** to the GitHub repository.


In [None]:
def log_model_quietly(sk_model, artifact_path="model"):
    """
    Log a scikit-learn model to MLflow while silencing the UserWarning that
    ``mlflow.models.model`` emits during logging.

    The original cell was a template that executed immediately against an
    undefined placeholder model, which broke Restart & Run All. As a function
    it runs only when called with a real model.

    Parameters:
    - sk_model: Fitted scikit-learn compatible estimator or pipeline.
    - artifact_path (str): Artifact location inside the MLflow run.

    Returns:
    - Whatever ``mlflow.sklearn.log_model`` returns.
    """
    # catch_warnings restores the previous filter state on exit, so the
    # suppression does not leak into the rest of the notebook
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="mlflow.models.model")
        return mlflow.sklearn.log_model(
            sk_model=sk_model,
            artifact_path=artifact_path
        )

In [None]:
# ML Pipeline
print("\n=== ML Pipeline ===")

# Everything except the target and the customer identifier is a model input
target_column = 'total_amount'
feature_columns = [
    col for col in train_data.columns
    if col != target_column and col != 'CustomerID'
]

# Same column selection for both splits
X_train, y_train = train_data[feature_columns], train_data[target_column]
X_test, y_test = test_data[feature_columns], test_data[target_column]

# Train every candidate model and keep the best one
ml_pipeline = MLPipeline()
results, best_model, model_path = ml_pipeline.run_experiments(
    X_train, y_train, X_test, y_test
)

In [None]:
# Report the metrics of every trained model
print("\n=== Final Results ===")
for model_name, metrics in results.items():
    print(
        f"\n{model_name}:",
        f"CV R2: {metrics['cv_r2_mean']:.4f}",
        f"Test R2: {metrics['test_r2']:.4f}",
        f"Test RMSE: {metrics['test_rmse']:.4f}",
        sep="\n",
    )

print("\nSaving best model to repository...")
# The manager pickles model objects itself, so the pipeline is passed as-is
github_manager.create_or_update_file(
    "models/best_model.pkl", best_model, "Update best model"
)

# Local copies live under ./models
os.makedirs('models', exist_ok=True)

# Compressed joblib copy (the usual choice for sklearn models)
joblib.dump(best_model, 'models/best_model_joblib.pkl', compress=True)

# Plain pickle copy at the highest available protocol
with open('models/best_model_pickle.pkl', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(best_model, protocol=pickle.HIGHEST_PROTOCOL))

# Uncompressed joblib backup in the working directory
joblib.dump(best_model, 'best_model_joblib.pkl')

# In-memory joblib dump: these bytes are what gets uploaded below
model_buffer = io.BytesIO()
joblib.dump(best_model, model_buffer)
model_bytes = model_buffer.getvalue()

github_manager.create_or_update_file(
    "models/best_model_joblib.pkl",
    model_bytes,
    "Update best model (joblib format)",
)

print("\nPipeline completed successfully!")
print(f"Repository URL: {github_manager.repo.html_url}")