# Ensemble Methods

In [1]:
# Basic Libraries
import pandas as pd
import numpy as np
import warnings

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, classification_report)

# LightGBM and XGBoost
import lightgbm as lgb
import xgboost as xgb

# PyTorch for Neural Networks
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Neural Network and Custom Dataset
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

import pickle

# Ignore Warnings
warnings.filterwarnings("ignore")




In [2]:
# Load Data
df = pd.read_csv("cleaned data/cleaned_data_split.csv")

mds = df[df['X_fold'] == 'test']

In [3]:
def load_model(model_path):
    with open(model_path, 'rb') as f:
        return pickle.load(f)


In [4]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features.astype('float32').values, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'features': self.features[idx],
            'label': self.labels[idx]
        }

class AccountTypeClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims=[128, 64], num_classes=2, dropout_rate=0.3):
        super(AccountTypeClassifier, self).__init__()
        
        # Build layers dynamically
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        
        # Output layer
        self.layers = nn.Sequential(*layers)
        self.output = nn.Linear(prev_dim, num_classes)
        
    def forward(self, x):
        x = self.layers(x)
        return self.output(x)


In [5]:
modelrf = load_model('output_files/{0}_best_model.pkl'.format('rf'))
modelxgb = load_model('output_files/{0}_best_model.pkl'.format('xgb'))
modellgbm = load_model('output_files/{0}_best_model.pkl'.format('lgbm'))
modelnn = load_model('output_files/{0}_best_model.pkl'.format('nn'))


In [6]:
predictive_cols_log = ['default_profile', 'default_profile_image',
        'geo_enabled', 'deviation_from_humans', 'location', 'verified',
        'account_age_days', 'is_description_na', 'is_lang_na', 'is_lang_en',
       'is_location_unknown', 'creation_hour', 'creation_day_of_week',
       'creation_month', 'creation_year', 'is_weekend', 'creation_quarter',
       'part_of_day', 'creation_week_of_year', 'is_beginning_of_month',
       'is_end_of_month', 'description_length', 'influencer_type',
       'favourites_per_day', 'favourites_activity',
       'mention_count', 'log_favourites_count', 'log_followers_count', 'log_friends_count',
       'log_statuses_count', 'log_average_tweets_per_day',
       'log_fol_to_friends_ratio', 'log_fol_to_tweets_ratio',
       'log_friends_to_tweets_ratio', 'sentiment_label', 'account_type']

features = [col for col in predictive_cols_log if col not in ['id', 'account_type', 'X_fold']]
target = ['account_type']

### Ensembling Method for Model Combination

The ensembling method aims to combine multiple predictive models, specifically traditional models (XGBoost, LightGBM, and optionally RandomForest) and a neural network model, to improve performance. The ensemble strategy involves weighted averaging of the predictions from the traditional models and the neural network model. Below is a summary of the approach:

1. **Model Pairing**:
   - The ensemble combines two types of models: traditional machine learning models (XGBoost, LightGBM, etc.) and a neural network model. These models are paired in various combinations for testing.
   - In this example, the models used are:
     - Random Forest (`modelrf`)
     - Neural Network (`modelnn`)

2. **Weighting**:
   - Different weight combinations are tested for the ensemble predictions. The weights determine how much influence each model's predictions have in the final combined prediction. In the example, the following weight combinations are considered:
     - (0.7, 0.3)
     - (0.5, 0.5)
     - (0.3, 0.7)
   - `w1` and `w2` correspond to the weights assigned to the traditional model predictions and neural network predictions, respectively.

3. **Prediction Process**:
   - For neural network predictions, the model outputs probabilities (for binary classification) that are used in the ensemble.

4. **Ensemble Calculation**:
   - The predictions from the traditional model (`preds`) and the neural network (`nn_preds`) are combined using the weighted average formula:
     \[
     $\text{ensemble\_preds} = (w1 \times \text{preds}) + (w2 \times \text{nn\_preds})$
     \]
   - The combined prediction is then thresholded (at 0.5) to convert probabilities into binary labels (0 or 1).

5. **Metrics**:
   - The performance of the ensemble is evaluated using several classification metrics:
     - **Accuracy**: The proportion of correct predictions.
     - **AUC (Area Under the Curve)**: A measure of the model's ability to distinguish between the classes.
     - **Recall**: The proportion of actual positives correctly identified by the model.
     - **Precision**: The proportion of predicted positives that are actually correct.
     - **F1-Score**: The harmonic mean of precision and recall.

6. **Best Model and Weight Combination**:
   - After testing all combinations, the best model and weight combination is selected based on the highest F1-Score. This combination is considered the most balanced, ensuring both high precision and recall.

7. **Output**:
   - The model, weight combination, and corresponding metrics (Accuracy, AUC, Recall, Precision, F1-Score) are printed for each iteration.
   - Finally, the best model and weight combination are displayed, showing the highest performance across the evaluated metrics.



In [7]:
# Define weight combinations and model pairs
weights = [(0.7, 0.3), (0.5, 0.5), (0.3, 0.7)]
model_pairs = [('RandomForest', modelrf)]

nn_model = modelnn  # Neural network model instance

# Prepare data
X = mds[features]
y = mds[target]
# Convert X to a tensor for NN model
X_tensor = torch.tensor(X.values, dtype=torch.float32)  

# Get predictions from the neural network model
with torch.no_grad():
    nn_outputs = nn_model(X_tensor).numpy() 
    nn_preds = nn_outputs[:, 1]  

# Initialize variables to track the best combination and metrics
best_combination = None
best_metrics = {}

# Iterate through each model pair
for model_name, model in model_pairs:
    try:
            preds = cross_val_predict(model, X, y, cv=3, method='predict_proba')[:, 1]
    except IndexError:
            preds = cross_val_predict(model, X, y, cv=3, method='predict')

    # Iterate through each weight combination
    for w1, w2 in weights:
        # Weighted averaging of probabilities
        ensemble_preds = (w1 * preds + w2 * nn_preds)
        binary_preds = (ensemble_preds >= 0.5).astype(int)
        
        # Calculate metrics
        accuracy = accuracy_score(y, binary_preds)
        auc = roc_auc_score(y, ensemble_preds) if 'predict_proba' in dir(model) else None
        recall = recall_score(y, binary_preds)
        precision = precision_score(y, binary_preds)
        f1 = f1_score(y, binary_preds)
        
        print(f"Model ({model_name}) with Weight ({w1}, {w2}): Accuracy = {accuracy}, AUC = {auc}, Recall = {recall}, Precision = {precision}, F1-Score = {f1}")
        
        # Update best metrics based on F1-score
        if not best_metrics or f1 > best_metrics['f1']:
            best_metrics = {
                'model': model_name,
                'weight': (w1, w2),
                'accuracy': accuracy,
                'auc': auc,
                'recall': recall,
                'precision': precision,
                'f1': f1
            }

print("\nBest model and weight combination and metrics:")
print(f"Model: {best_metrics['model']}")
print(f"Weight: {best_metrics['weight']}")
print(f"Accuracy: {best_metrics['accuracy']}")
print(f"AUC: {best_metrics['auc']}")
print(f"Recall: {best_metrics['recall']}")
print(f"Precision: {best_metrics['precision']}")
print(f"F1-Score: {best_metrics['f1']}")


Model (RandomForest) with Weight (0.7, 0.3): Accuracy = 0.8690301787283914, AUC = 0.8994477489182051, Recall = 0.7073286052009456, Precision = 0.84472049689441, F1-Score = 0.7699433865156974
Model (RandomForest) with Weight (0.5, 0.5): Accuracy = 0.8671256958687372, AUC = 0.8994179409088834, Recall = 0.6940898345153664, Precision = 0.8495370370370371, F1-Score = 0.7639864689045017
Model (RandomForest) with Weight (0.3, 0.7): Accuracy = 0.8584822736595371, AUC = 0.8975971432485612, Recall = 0.6628841607565011, Precision = 0.8471299093655589, F1-Score = 0.7437665782493369

Best model and weight combination and metrics:
Model: RandomForest
Weight: (0.7, 0.3)
Accuracy: 0.8690301787283914
AUC: 0.8994477489182051
Recall: 0.7073286052009456
Precision: 0.84472049689441
F1-Score: 0.7699433865156974
