In [4]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, mean_squared_error, 
                             confusion_matrix, classification_report)
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from datetime import datetime
from functools import partial
import shap
import itertools
import math
import json
import os
import warnings
import re
import pickle
import time

# LightGBM and XGBoost
import lightgbm as lgb
import xgboost as xgb
from xgboost import DMatrix

# TensorFlow / Keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Clustering
from sklearn.cluster import KMeans

# Displaying Images
from IPython.display import Image

# Ignore Warnings
warnings.filterwarnings("ignore")



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load Data
df = pd.read_csv("cleaned data/cleaned_data_split.csv")

# Columns carried into modelling. The target column is listed last and is
# filtered out again when `features` is built below.
predictive_cols_log = [
    'default_profile',
    'default_profile_image',
    'geo_enabled',
    'deviation_from_humans',
    'location',
    'verified',
    'account_age_days',
    'is_description_na',
    'is_lang_na',
    'is_lang_en',
    'is_location_unknown',
    'creation_hour',
    'creation_day_of_week',
    'creation_month',
    'creation_year',
    'is_weekend',
    'creation_quarter',
    'part_of_day',
    'creation_week_of_year',
    'is_beginning_of_month',
    'is_end_of_month',
    'description_length',
    'influencer_type',
    'favourites_per_day',
    'favourites_activity',
    'mention_count',
    'log_favourites_count',
    'log_followers_count',
    'log_friends_count',
    'log_statuses_count',
    'log_average_tweets_per_day',
    'log_fol_to_friends_ratio',
    'log_fol_to_tweets_ratio',
    'log_friends_to_tweets_ratio',
    'account_type',
]

target = 'account_type'

# Model inputs: every listed column except the identifier, the target and the
# fold-assignment column.
features = list(
    filter(lambda col: col not in ('id', 'account_type', 'X_fold'), predictive_cols_log)
)

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd

class CustomDataset(Dataset):
    """In-memory dataset that serves feature rows and class labels as tensors.

    Parameters
    ----------
    features : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Numeric feature matrix; stored as a ``float32`` tensor.
    labels : pandas.Series or array-like, shape (n_samples,)
        Integer class labels; stored as a ``torch.long`` tensor.

    Raises
    ------
    ValueError
        If ``features`` and ``labels`` do not have the same number of rows.
    """

    def __init__(self, features, labels):
        # pandas inputs keep the original conversion path; anything else
        # (NumPy arrays, nested lists) goes through np.asarray so callers do
        # not have to wrap plain arrays in pandas objects first.
        if isinstance(features, (pd.DataFrame, pd.Series)):
            feature_array = features.astype('float32').values
        else:
            feature_array = np.asarray(features, dtype=np.float32)

        if isinstance(labels, (pd.DataFrame, pd.Series)):
            label_array = labels.values
        else:
            label_array = np.asarray(labels)

        # Fail early: a mismatch would otherwise only surface as an IndexError
        # (or silently unused rows) while iterating the DataLoader.
        if len(feature_array) != len(label_array):
            raise ValueError(
                f"features and labels must have the same number of rows, "
                f"got {len(feature_array)} and {len(label_array)}"
            )

        self.features = torch.tensor(feature_array, dtype=torch.float32)
        self.labels = torch.tensor(label_array, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with 'features' and 'label' tensors."""
        return {
            'features': self.features[idx],
            'label': self.labels[idx]
        }

class AccountTypeClassifier(nn.Module):
    """Feed-forward classifier made of repeated Linear/BatchNorm/ReLU/Dropout blocks.

    Parameters
    ----------
    input_dim : int
        Width of the input feature vector.
    hidden_dims : sequence of int
        Width of each hidden block, in order. An empty sequence yields a
        single linear layer from input to output.
    num_classes : int
        Number of output units.
    dropout_rate : float
        Dropout probability applied at the end of every hidden block.
    """

    def __init__(self, input_dim, hidden_dims=[128, 64], num_classes=2, dropout_rate=0.3):
        super().__init__()

        # Consecutive pairs of widths describe each hidden block's in/out size.
        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]

        self.layers = nn.Sequential(*stack)
        # The output layer consumes the last hidden width (or input_dim if
        # there are no hidden blocks).
        self.output = nn.Linear(widths[-1], num_classes)

    def forward(self, x):
        """Run ``x`` through the hidden stack and return the final linear layer's output."""
        hidden = self.layers(x)
        return self.output(hidden)

def prepare_data(mds, predictive_cols, target_cols):
    """Encode the target and slice the frame into its four ``X_fold`` partitions.

    Parameters
    ----------
    mds : pandas.DataFrame
        Full dataset; must contain an ``'X_fold'`` column and the target column.
    predictive_cols : list of str
        Columns returned as model inputs.
    target_cols : list of str
        Only the first entry is used as the target column.

    Returns
    -------
    tuple
        ``(X_train, y_train), (X_valid, y_valid), (X_test, y_test),
        (X_oot, y_oot), le`` where each ``X`` is a DataFrame restricted to
        ``predictive_cols``, each ``y`` is the matching slice of the encoded
        target, and ``le`` is the fitted ``LabelEncoder``.
    """
    # The encoder is fitted once on the whole target column.
    le = LabelEncoder()
    encoded_target = le.fit_transform(mds[target_cols[0]])

    fold_column = mds['X_fold']

    def _select(fold_name):
        # The same boolean mask picks the feature rows and the label entries.
        in_fold = fold_column == fold_name
        return mds[in_fold][predictive_cols], encoded_target[in_fold]

    train_xy = _select('train')
    valid_xy = _select('valid')
    test_xy = _select('test')
    oot_xy = _select('oot')

    return train_xy, valid_xy, test_xy, oot_xy, le

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device='cpu'):
    """Train ``model`` and return a snapshot of the weights with the lowest validation loss.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; updated in place.
    train_loader, val_loader : iterable with ``len()``
        Yield batches shaped as dicts with ``'features'`` and ``'label'`` tensors.
    criterion : callable
        ``criterion(outputs, labels)`` returning a scalar loss tensor.
    optimizer : torch.optim.Optimizer
        Optimiser bound to ``model``'s parameters.
    num_epochs : int
        Number of passes over ``train_loader``.
    device : str or torch.device
        Device the batches are moved to.

    Returns
    -------
    best_model : dict
        Independent copy of ``model.state_dict()`` taken at the epoch with the
        lowest average validation loss. If no epoch improved on the initial
        ``inf`` (for example ``num_epochs=0`` or NaN losses), a copy of the
        final weights is returned instead of ``None``.
    history : dict
        Per-epoch lists under ``'train_loss'``, ``'val_loss'``, ``'train_acc'``
        and ``'val_acc'``. Losses are averaged over batches.
    """
    # Imported locally so the function works regardless of which import cells ran.
    import copy

    best_val_loss = float('inf')
    best_model = None
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        correct_train = 0
        total_train = 0

        for batch in train_loader:
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            predicted = outputs.detach().argmax(dim=1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

        # Validation phase
        model.eval()
        val_loss = 0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for batch in val_loader:
                features = batch['features'].to(device)
                labels = batch['label'].to(device)

                outputs = model(features)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                total_val += labels.size(0)
                correct_val += (predicted == labels).sum().item()

        # Calculate metrics
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        train_acc = correct_train / total_train
        val_acc = correct_val / total_val

        # Save metrics
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)

        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}')

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps overwrite. Deep-copy so the snapshot
            # really is the weights from this epoch.
            best_model = copy.deepcopy(model.state_dict())

    if best_model is None:
        # No epoch produced a comparable validation loss; return the current
        # weights so callers can still load the result.
        best_model = copy.deepcopy(model.state_dict())

    return best_model, history

def evaluate_model(model, test_loader, criterion, device='cpu'):
    """Score ``model`` on ``test_loader`` without updating any weights.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate; switched to eval mode.
    test_loader : iterable with ``len()``
        Yields dict batches with ``'features'`` and ``'label'`` tensors.
    criterion : callable
        ``criterion(outputs, labels)`` returning a scalar loss tensor.
    device : str or torch.device
        Device the batches are moved to.

    Returns
    -------
    tuple
        ``(avg_loss, accuracy, all_preds, all_labels)``: loss averaged over
        batches, fraction of correct predictions, and flat lists of predicted
        and true class indices in loader order.
    """
    model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['features'].to(device)
            targets = batch['label'].to(device)

            logits = model(inputs)
            loss_sum += criterion(logits, targets).item()

            # Predicted class = index of the largest output per row.
            predicted = torch.max(logits, 1)[1]
            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    return loss_sum / len(test_loader), n_correct / n_seen, all_preds, all_labels

# Example usage
def main(mds, predictive_cols, target_cols, params=None):
    """Train the classifier on the fold-split data and score it on test and OOT.

    Parameters
    ----------
    mds : pandas.DataFrame
        Dataset passed to ``prepare_data``.
    predictive_cols : list of str
        Feature columns; their count sets the network's input width.
    target_cols : list of str
        Target column name(s), forwarded to ``prepare_data``.
    params : dict, optional
        Keys ``'batch_size'``, ``'hidden_dims'``, ``'learning_rate'``,
        ``'num_epochs'`` and ``'dropout_rate'``. Defaults are used when omitted.

    Returns
    -------
    dict
        ``'model'`` (with the best validation weights loaded), ``'history'``,
        ``'test_metrics'``, ``'oot_metrics'`` and ``'label_encoder'``.
    """
    if params is None:
        params = dict(
            batch_size=32,
            hidden_dims=[128, 64],
            learning_rate=0.001,
            num_epochs=10,
            dropout_rate=0.3,
        )

    # Prepare data
    train_xy, valid_xy, test_xy, oot_xy, le = prepare_data(mds, predictive_cols, target_cols)

    def _to_dataset(xy):
        # Labels are wrapped in a Series before being handed to CustomDataset.
        frame, encoded = xy
        return CustomDataset(frame, pd.Series(encoded))

    # Create datasets
    train_dataset = _to_dataset(train_xy)
    valid_dataset = _to_dataset(valid_xy)
    test_dataset = _to_dataset(test_xy)
    oot_dataset = _to_dataset(oot_xy)

    # Create data loaders (only the training loader is shuffled)
    batch_size = params['batch_size']
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    oot_loader = DataLoader(oot_dataset, batch_size=batch_size)

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    network = AccountTypeClassifier(
        input_dim=len(predictive_cols),
        hidden_dims=params['hidden_dims'],
        num_classes=len(le.classes_),
        dropout_rate=params['dropout_rate'],
    )
    model = network.to(device)

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])

    # Train model, then restore the weights returned as best
    best_model_state, history = train_model(
        model,
        train_loader,
        valid_loader,
        criterion,
        optimizer,
        num_epochs=params['num_epochs'],
        device=device,
    )
    model.load_state_dict(best_model_state)

    # Evaluate on test and OOT sets
    test_eval = evaluate_model(model, test_loader, criterion, device)
    oot_eval = evaluate_model(model, oot_loader, criterion, device)

    def _metrics(evaluation):
        # Turn evaluate_model's tuple into the reported metric dictionary.
        loss, accuracy, preds, labels = evaluation
        return {
            'loss': loss,
            'accuracy': accuracy,
            'recall': recall_score(labels, preds),
            'precision': precision_score(labels, preds),
            'f1-score': f1_score(labels, preds),
        }

    return {
        'model': model,
        'history': history,
        'test_metrics': _metrics(test_eval),
        'oot_metrics': _metrics(oot_eval),
        'label_encoder': le,
    }


In [8]:
# Running Model

# Seed the random number generators before training so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
# NOTE(review): bit-exact repeatability on GPU may need additional
# deterministic-algorithm settings; confirm if that matters here.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

results = main(
    mds=df,  
    predictive_cols=features,  
    target_cols=[target], 
    params={
        'batch_size': 32,
        'hidden_dims': [128, 64],
        'learning_rate': 0.001,
        'num_epochs': 15,
        'dropout_rate': 0.3
    }
)



Epoch [1/15]
Train Loss: 0.5591, Train Acc: 0.7102
Val Loss: 0.5860, Val Acc: 0.7065
Epoch [2/15]
Train Loss: 0.5008, Train Acc: 0.7611
Val Loss: 0.4931, Val Acc: 0.7755
Epoch [3/15]
Train Loss: 0.4754, Train Acc: 0.7798
Val Loss: 0.7045, Val Acc: 0.5755
Epoch [4/15]
Train Loss: 0.4699, Train Acc: 0.7844
Val Loss: 0.4693, Val Acc: 0.7870
Epoch [5/15]
Train Loss: 0.4573, Train Acc: 0.7890
Val Loss: 0.4495, Val Acc: 0.8061
Epoch [6/15]
Train Loss: 0.4534, Train Acc: 0.7944
Val Loss: 0.4696, Val Acc: 0.7876
Epoch [7/15]
Train Loss: 0.4476, Train Acc: 0.7995
Val Loss: 0.4434, Val Acc: 0.8044
Epoch [8/15]
Train Loss: 0.4475, Train Acc: 0.7972
Val Loss: 0.4395, Val Acc: 0.8066
Epoch [9/15]
Train Loss: 0.4445, Train Acc: 0.8009
Val Loss: 0.4458, Val Acc: 0.8063
Epoch [10/15]
Train Loss: 0.4415, Train Acc: 0.8029
Val Loss: 0.4525, Val Acc: 0.8028
Epoch [11/15]
Train Loss: 0.4408, Train Acc: 0.8023
Val Loss: 0.5506, Val Acc: 0.7077
Epoch [12/15]
Train Loss: 0.4345, Train Acc: 0.8048
Val Loss: 0

# Replicability

- Save Model
- Save Performance Metrics

In [9]:
def save_model(model, output_dir, model_type, model_name='_best_model.pkl'):
    """Pickle a trained model to ``<output_dir>/<model_type><model_name>``.

    Parameters
    ----------
    model : object
        Any picklable model object.
    output_dir : str
        Destination directory; created (including parents) if it does not exist.
    model_type : str
        Prefix of the file name, e.g. ``'nn'``.
    model_name : str
        Suffix of the file name, including the extension.

    Returns
    -------
    str
        Path of the file that was written.
    """
    # Without this, a fresh checkout with no output folder fails with
    # FileNotFoundError when the file is opened.
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, model_type + model_name)
    with open(path, 'wb') as f:
        pickle.dump(model, f)
    return path
# Persist the trained network under output_files/ with the 'nn' file prefix.
save_model(
    model=results['model'],
    output_dir='output_files',
    model_type='nn',
)

In [10]:
# The metrics in `results` come from the neural-network run, so the row is
# labelled to match (the earlier 'RandomForest' label did not match the model
# or the '_nn' output file name).
data = [
    {
        'model': 'NeuralNetwork',
        'test_metrics': results['test_metrics'],
        'oot_metrics': results['oot_metrics']
    }
]
df_results = pd.DataFrame(data)

# Save to CSV (create the output folder first so the write cannot fail on a
# missing directory)
os.makedirs('output_files', exist_ok=True)
df_results.to_csv('output_files/model_results_nn.csv', index=False)