In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, mean_squared_error, 
                             confusion_matrix, classification_report)
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from datetime import datetime
from functools import partial
import shap
import itertools
import math
import json
import os
import warnings
import re
import pickle
import time

# LightGBM and XGBoost
import lightgbm as lgb
import xgboost as xgb
from xgboost import DMatrix

# TensorFlow / Keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Clustering
from sklearn.cluster import KMeans

# Displaying Images
from IPython.display import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Ignore Warnings
warnings.filterwarnings("ignore")



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the cleaned, pre-split dataset from disk.
df = pd.read_csv("cleaned data/cleaned_data_split.csv")

# Evaluation subset: only the rows whose 'X_fold' value is 'test'.
mds = df.loc[df['X_fold'].eq('test')]

In [4]:
def load_model(model_path):
    """Return the object unpickled from the file at ``model_path``.

    The file is opened in binary mode and closed again before returning.

    NOTE(review): ``pickle`` can execute arbitrary code while loading, so this
    should only be pointed at files produced by a trusted source.
    """
    with open(model_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


In [5]:
class CustomDataset(Dataset):
    """Dataset that holds features and labels as pre-built tensors.

    ``features`` is cast to float32 and ``labels`` to int64 (``torch.long``)
    once, at construction time; both arguments must expose ``.values``
    (and ``features`` must expose ``.astype``), as pandas objects do.
    """

    def __init__(self, features, labels):
        feature_array = features.astype('float32').values
        label_array = labels.values
        self.features = torch.tensor(feature_array, dtype=torch.float32)
        self.labels = torch.tensor(label_array, dtype=torch.long)

    def __len__(self):
        # The sample count is driven by the label tensor.
        return len(self.labels)

    def __getitem__(self, idx):
        # Each sample is a dict with the keys 'features' and 'label'.
        sample = dict(features=self.features[idx], label=self.labels[idx])
        return sample

class AccountTypeClassifier(nn.Module):
    """Feed-forward classifier: a stack of hidden blocks plus a linear head.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. The head is
    a plain ``nn.Linear`` with ``num_classes`` outputs, so ``forward`` returns
    unnormalised scores (no softmax/sigmoid is applied here).

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden block, in order.
        num_classes: Number of output units of the final linear layer.
        dropout_rate: Dropout probability used in every hidden block.
    """

    def __init__(self, input_dim, hidden_dims=[128, 64], num_classes=2, dropout_rate=0.3):
        super().__init__()

        # Consecutive (in, out) widths: input_dim -> h1 -> h2 -> ...
        widths = [input_dim, *hidden_dims]

        blocks = []
        for in_width, out_width in zip(widths[:-1], widths[1:]):
            blocks.append(nn.Linear(in_width, out_width))
            blocks.append(nn.BatchNorm1d(out_width))
            blocks.append(nn.ReLU())
            blocks.append(nn.Dropout(dropout_rate))

        self.layers = nn.Sequential(*blocks)
        # The head consumes the last hidden width (or input_dim if no hidden blocks).
        self.output = nn.Linear(widths[-1], num_classes)

    def forward(self, x):
        hidden = self.layers(x)
        return self.output(hidden)


In [7]:
# Load the saved best model of each family; the files all follow the
# 'output_files/<tag>_best_model.pkl' naming pattern.
modelrf, modelxgb, modellgbm, modelnn = [
    load_model('output_files/{0}_best_model.pkl'.format(model_tag))
    for model_tag in ('rf', 'xgb', 'lgbm', 'nn')
]


In [8]:
# Columns used for modelling (log-transformed count/ratio variants), with the
# target column 'account_type' listed last.
predictive_cols_log = [
    'default_profile', 'default_profile_image', 'geo_enabled',
    'deviation_from_humans', 'location', 'verified', 'account_age_days',
    'is_description_na', 'is_lang_na', 'is_lang_en', 'is_location_unknown',
    'creation_hour', 'creation_day_of_week', 'creation_month',
    'creation_year', 'is_weekend', 'creation_quarter', 'part_of_day',
    'creation_week_of_year', 'is_beginning_of_month', 'is_end_of_month',
    'description_length', 'influencer_type', 'favourites_per_day',
    'favourites_activity', 'mention_count', 'log_favourites_count',
    'log_followers_count', 'log_friends_count', 'log_statuses_count',
    'log_average_tweets_per_day', 'log_fol_to_friends_ratio',
    'log_fol_to_tweets_ratio', 'log_friends_to_tweets_ratio',
    'sentiment_label', 'account_type',
]

# Model inputs: every listed column except the identifier, target and fold marker.
features = [
    column
    for column in predictive_cols_log
    if column not in ('id', 'account_type', 'X_fold')
]
target = ['account_type']

### Ensembling Method for Model Combination

The ensembling method aims to combine multiple predictive models, specifically traditional models (XGBoost, LightGBM, and optionally RandomForest) and a neural network model, to improve performance. The ensemble strategy involves weighted averaging of the predictions from the traditional models and the neural network model. Below is a summary of the approach:

1. **Model Pairing**:
   - The ensemble combines two types of models: traditional machine learning models (XGBoost, LightGBM, etc.) and a neural network model. These models are paired in various combinations for testing.
   - In this example, the models used are:
     - XGBoost (`modelxgb`)
     - LightGBM (`modellgbm`)
     - Neural Network (`modelnn`)

2. **Weighting**:
   - Different weight combinations are tested for the ensemble predictions. The weights determine how much influence each model's predictions have in the final combined prediction. In the example, the following weight combinations are considered:
     - (0.7, 0.3)
     - (0.5, 0.5)
     - (0.3, 0.7)
   - `w1` and `w2` correspond to the weights assigned to the traditional model predictions and neural network predictions, respectively.

3. **Prediction Process**:
   - For each traditional model, out-of-fold predictions are generated using 5-fold cross-validation.
   - For neural network predictions, the model outputs probabilities (for binary classification) that are used in the ensemble.

4. **Ensemble Calculation**:
   - The predictions from the traditional model (`preds`) and the neural network (`nn_preds`) are combined using the weighted average formula:
     $$
     \text{ensemble\_preds} = (w_1 \times \text{preds}) + (w_2 \times \text{nn\_preds})
     $$
   - The combined prediction is then thresholded (at 0.5) to convert probabilities into binary labels (0 or 1).

5. **Metrics**:
   - The performance of the ensemble is evaluated using several classification metrics:
     - **Accuracy**: The proportion of correct predictions.
     - **AUC (Area Under the Curve)**: A measure of the model's ability to distinguish between the classes.
     - **Recall**: The proportion of actual positives correctly identified by the model.
     - **Precision**: The proportion of predicted positives that are actually correct.
     - **F1-Score**: The harmonic mean of precision and recall.

6. **Best Model and Weight Combination**:
   - After testing all combinations, the best model and weight combination is selected based on the highest F1-Score. Because the F1-Score is the harmonic mean of precision and recall, this combination is treated as the best trade-off between the two.

7. **Output**:
   - The model, weight combination, and corresponding metrics (Accuracy, AUC, Recall, Precision, F1-Score) are printed for each iteration.
   - Finally, the best model and weight combination are displayed, showing the highest performance across the evaluated metrics.



In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import numpy as np
import torch

# Define weight combinations and model pairs
weights = [(0.7, 0.3), (0.5, 0.5), (0.3, 0.7)]
model_pairs = [('XGBoost', modelxgb), ('LightGBM', modellgbm)]
#model_pairs = [('RandomForest', modelrf), ('XGBoost', modelxgb), ('LightGBM', modellgbm)]
nn_model = modelnn  # Neural network model instance

# Prepare the data
X = mds[features]
y = mds[target]
X_tensor = torch.tensor(X.values, dtype=torch.float32)  # Convert X to a tensor for NN model

# Get predictions from the neural network model
with torch.no_grad():
    nn_outputs = nn_model(X_tensor).numpy()  # Assuming nn_model outputs probabilities
    nn_preds = nn_outputs[:, 1]  # Adjust index for binary classification

# Initialize variables to track the best combination and metrics
best_combination = None
best_metrics = {}

# Iterate through each model pair
for model_name, model in model_pairs:
    # Get out-of-fold predictions for the chosen traditional model
    try:
        preds = cross_val_predict(model, X, y, cv=5, method='predict_proba')[:, 1]
    except IndexError:
        preds = cross_val_predict(model, X, y, cv=5, method='predict')
    
    # Iterate through each weight combination
    for w1, w2 in weights:
        # Weighted averaging of probabilities
        ensemble_preds = (w1 * preds + w2 * nn_preds)
        binary_preds = (ensemble_preds >= 0.5).astype(int)
        
        # Calculate metrics
        accuracy = accuracy_score(y, binary_preds)
        auc = roc_auc_score(y, ensemble_preds) if 'predict_proba' in dir(model) else None
        recall = recall_score(y, binary_preds)
        precision = precision_score(y, binary_preds)
        f1 = f1_score(y, binary_preds)
        
        print(f"Model ({model_name}) with Weight ({w1}, {w2}): Accuracy = {accuracy}, AUC = {auc}, Recall = {recall}, Precision = {precision}, F1-Score = {f1}")
        
        # Update best metrics based on F1-score
        if not best_metrics or f1 > best_metrics['f1']:
            best_metrics = {
                'model': model_name,
                'weight': (w1, w2),
                'accuracy': accuracy,
                'auc': auc,
                'recall': recall,
                'precision': precision,
                'f1': f1
            }

print("\nBest model and weight combination and metrics:")
print(f"Model: {best_metrics['model']}")
print(f"Weight: {best_metrics['weight']}")
print(f"Accuracy: {best_metrics['accuracy']}")
print(f"AUC: {best_metrics['auc']}")
print(f"Recall: {best_metrics['recall']}")
print(f"Precision: {best_metrics['precision']}")
print(f"F1-Score: {best_metrics['f1']}")


# THE CHUNK BELOW IS FOR TEAM FYI NOT TO BE SUBMITTED

The chunk above is an ensemble method that combines the neural network with the tree-based models. It currently takes too long to run, and I have not been able to run it successfully.
If the issue persists, we can use the ensemble method below instead, but it only ensembles the tree-based models and does not include the neural network.

In [8]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import numpy as np

# Define weight combinations
weights = [(0.7, 0.3), (0.5, 0.5), (0.3, 0.7)]


model1 = modelrf  # Example model
model2 = modelxgb  # Example model

# Prepare the data
X = mds[features]
y = mds[target]

# Get out-of-fold predictions for both models (as binary if predict_proba not available)
try:
    # If models support predict_proba
    preds1 = cross_val_predict(model1, X, y, cv=5, method='predict_proba')[:, 1]
    preds2 = cross_val_predict(model2, X, y, cv=5, method='predict_proba')[:, 1]
except IndexError:
    # Use binary predictions instead
    preds1 = cross_val_predict(model1, X, y, cv=5, method='predict')
    preds2 = cross_val_predict(model2, X, y, cv=5, method='predict')

# Initialize variables to track the best weight and metrics
best_weight = None
best_metrics = {}

# Evaluate each weight combination
for w1, w2 in weights:
    # Apply weighted averaging (if using predict_proba) or majority voting
    if 'predict_proba' in dir(model1) and 'predict_proba' in dir(model2):
        ensemble_preds = (w1 * preds1 + w2 * preds2)
        binary_preds = (ensemble_preds >= 0.5).astype(int)
    else:
        # Majority voting for binary predictions
        ensemble_preds = (w1 * preds1 + w2 * preds2).round().astype(int)
        binary_preds = ensemble_preds
    
    # Calculate metrics
    accuracy = accuracy_score(y, binary_preds)
    try:
        auc = roc_auc_score(y, ensemble_preds)  # Only for probability-based predictions
    except ValueError:
        auc = None
    recall = recall_score(y, binary_preds)
    precision = precision_score(y, binary_preds)
    f1 = f1_score(y, binary_preds)
    
    print(f"Weight ({w1}, {w2}): Accuracy = {accuracy}, AUC = {auc}, Recall = {recall}, Precision = {precision}, F1-Score = {f1}")

    # Update best metrics based on F1-score
    if not best_metrics or f1 > best_metrics['f1']:
        best_metrics = {
            'weight': (w1, w2),
            'accuracy': accuracy,
            'auc': auc,
            'recall': recall,
            'precision': precision,
            'f1': f1
        }

print("\nBest weight combination and metrics:")
print(f"Weight: {best_metrics['weight']}")
print(f"Accuracy: {best_metrics['accuracy']}")
print(f"AUC: {best_metrics['auc']}")
print(f"Recall: {best_metrics['recall']}")
print(f"Precision: {best_metrics['precision']}")
print(f"F1-Score: {best_metrics['f1']}")


predictproba
predict
Weight (0.7, 0.3): Accuracy = 0.8546663817511619, AUC = 0.845350696913259, Recall = 0.6844265593561368, Precision = 0.8483639265762171, F1-Score = 0.757628402155998
Weight (0.5, 0.5): Accuracy = 0.8621988354078743, AUC = 0.8457285270009696, Recall = 0.7501810865191146, Precision = 0.8193565400843882, F1-Score = 0.7832444014957355
Weight (0.3, 0.7): Accuracy = 0.8669266520647471, AUC = 0.8461063570886801, Recall = 0.7244265593561369, Precision = 0.8524481484989109, F1-Score = 0.7832405151409676

Best weight combination and metrics:
Weight: (0.5, 0.5)
Accuracy: 0.8621988354078743
AUC: 0.8457285270009696
Recall: 0.7501810865191146
Precision: 0.8193565400843882
F1-Score: 0.7832444014957355
