In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


# Import file here

In [None]:
# Load Data
# The CSV location can be overridden with the TWITTER_BOTS_CSV environment
# variable so the notebook also runs on machines other than the original
# author's; the original absolute path is kept as the fallback.
import os

DATA_PATH = os.environ.get(
    "TWITTER_BOTS_CSV",
    "/Users/wenwei/Documents/Sku/y4s1/bt4222/project/twitter_human_bots_dataset.csv",
)
df = pd.read_csv(DATA_PATH)


In [None]:

# Parse the raw creation timestamp; values that cannot be parsed become NaT.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
created_at = df['created_at']

# Account age in whole days, measured against the wall-clock time of this run.
current_time = datetime.now()
df['account_age_days'] = (current_time - created_at).dt.days

# Calendar components copied straight from the datetime accessor.
for column, attribute in (
    ('creation_hour', 'hour'),
    ('creation_day_of_week', 'dayofweek'),
    ('creation_month', 'month'),
    ('creation_year', 'year'),
    ('creation_quarter', 'quarter'),
):
    df[column] = getattr(created_at.dt, attribute)

# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['creation_day_of_week'].ge(5)
df['creation_week_of_year'] = created_at.dt.isocalendar().week

# Flags for the first five days of the month and for day 26 onwards.
day_of_month = created_at.dt.day
df['is_beginning_of_month'] = day_of_month.le(5)
df['is_end_of_month'] = day_of_month.ge(26)

# Bucket an hour of the day into a coarse part-of-day label
def part_of_day(hour):
    """Return the part-of-day label for an hour value.

    Hours 5-11 are 'Morning', 12-16 'Afternoon' and 17-20 'Evening'. Any
    value that falls in none of those ranges - including one for which every
    comparison is false, such as NaN - is labelled 'Night'.
    """
    buckets = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for lower, upper, label in buckets:
        if lower <= hour < upper:
            return label
    return 'Night'

df['part_of_day'] = df['creation_hour'].apply(part_of_day)

# Additional Features
# Z-score of every account's tweet rate against the accounts labelled 'human'.
# NOTE(review): the mean/std are taken over all labelled rows, i.e. before any
# train/test/OOT split is made - confirm that this is intended.
human_tweet_rate = df.loc[df['account_type'] == 'human', 'average_tweets_per_day']
humans_mean = human_tweet_rate.mean()
humans_std = human_tweet_rate.std()
df['deviation_from_humans'] = (df['average_tweets_per_day'] - humans_mean) / humans_std

# Description Length Feature (missing descriptions count as length 0)
df['description_length'] = df['description'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Followers/Friends Ratios
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and has
# no effect on the frame once Copy-on-Write is enabled.
# 0/0 gives NaN and is set to 0 here; x/0 gives inf and is left untouched by
# this step.
df['followers_to_friends_ratio'] = (df['followers_count'] / df['friends_count']).fillna(0)

# Followers to Tweets Per Day Ratio
df['followers_to_tweets_per_day_ratio'] = (
    df['followers_count'] / df['average_tweets_per_day']
).fillna(0)

# Mentions Count in Description
import re

def extract_mentions(description):
    """Return every '@' + word-characters token found in str(description).

    The argument is converted with str() first, so non-string values (None,
    NaN, numbers) are accepted and simply produce whatever their text form
    contains.
    """
    text = str(description)
    return [match.group(0) for match in re.finditer(r'@\w+', text)]

df['mentions'] = df['description'].apply(extract_mentions)
df['mention_count'] = df['mentions'].apply(len)

# Ensure any remaining NaN values are filled if necessary
########################################################################
# TO DISCUSS METHOD OF IMPUTATION
########################################################################
# NOTE(review): this fills every column of the frame, not only the numeric
# features - confirm that 0 is an acceptable placeholder for the text and
# timestamp columns as well.
df.fillna(0, inplace=True)
# Division-based features can produce infinities of either sign; the previous
# version only replaced +inf, so -inf could reach the models.
df.replace([np.inf, -np.inf], 0, inplace=True)

# ENCODING METHOD FOR LANG AND LOCATION TO BE DISCUSSED

In [None]:
# Encoding Categorical Features
# Target: human -> 0, bot -> 1. Any other value maps to NaN, and running this
# cell a second time maps the already-encoded 0/1 values to NaN as well.
df['account_type'] = df['account_type'].map({'human': 0, 'bot': 1})

encode_cols = ['default_profile', 'default_profile_image', 'geo_enabled', 'lang', 'location', 'verified',
               'creation_year', 'is_weekend', 'is_beginning_of_month', 'is_end_of_month', 'part_of_day']

# One fitted encoder is kept per column. Refitting a single shared encoder
# (as before) overwrote its classes_ on every iteration, so only the last
# column's mapping could be inspected, inverted or reused afterwards.
label_encoders = {}
for col in encode_cols:
    label_encoder = LabelEncoder()
    # Values are cast to str first so mixed-type columns encode without error.
    df[col] = label_encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = label_encoder


# Column roles: identifier, target, and the columns handed to the models.
id_col = ['id']
labels = ['account_type']

# Fields taken directly from the account profile.
_profile_cols = [
    'default_profile', 'default_profile_image', 'favourites_count',
    'followers_count', 'friends_count', 'geo_enabled', 'lang', 'location',
    'statuses_count', 'verified', 'average_tweets_per_day',
]
# Features derived from the account creation timestamp.
_creation_cols = [
    'account_age_days', 'creation_hour', 'creation_day_of_week',
    'creation_month', 'creation_year', 'creation_quarter', 'is_weekend',
    'creation_week_of_year', 'is_beginning_of_month', 'is_end_of_month',
    'part_of_day',
]
# Engineered ratio / text features.
_engineered_cols = [
    'deviation_from_humans', 'description_length',
    'followers_to_friends_ratio', 'followers_to_tweets_per_day_ratio',
    'mention_count',
]
# The target column is deliberately the last entry of this list.
predictive_cols = [*_profile_cols, *_creation_cols, *_engineered_cols, 'account_type']



In [None]:
# Set cutoff date for training/validation split
cutoff_date = pd.to_datetime('2017-01-01')
df['date'] = pd.to_datetime(df['created_at'])

# Split data based on cutoff date: accounts created on/after the cutoff form
# the out-of-time (OOT) set, earlier accounts are used for modelling.
# Rows whose 'date' is NaT satisfy neither comparison and end up in no split.
oot = df[df['date'] >= cutoff_date].set_index('id')
df_model = df[df['date'] < cutoff_date].set_index('id')

# Train, Test, Validation Splits
# 20% of the modelling data is held out as test, then 20% of the remainder
# as validation.
RANDOM_SEED = 2024
train, test = train_test_split(df_model, test_size=0.2, random_state=RANDOM_SEED)
train, valid = train_test_split(train, test_size=0.2, random_state=RANDOM_SEED)

# Add 'X_fold' columns for each split
# .assign returns a new frame; writing the column onto the frames returned by
# train_test_split in place raised pandas' SettingWithCopyWarning.
train = train.assign(X_fold='train')
test = test.assign(X_fold='test')
valid = valid.assign(X_fold='valid')
oot = oot.assign(X_fold='oot')

# Combine all datasets for modeling
mds = pd.concat([train, test, valid, oot]).copy(deep=True)
mds = mds.reset_index()

# Plot correlation heatmap for predictive columns
# predictive_cols also contains the target, so the heatmap shows each
# feature's correlation with 'account_type' as well.
corr = mds[predictive_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Predictive Features')
plt.show()

# Check target distribution
print(mds['account_type'].value_counts())

# Final Data Overview
print(mds.head())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
mds.info()

In [None]:
import itertools
import math
from hyperopt import hp
import xgboost as xgb
import lightgbm as lgb
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
from sklearn.model_selection import train_test_split
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Import custom functions
from utils import mpr_report, final_fitting

from lmf2 import LoadModelFunction#, final_fitting

import os

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import json
from functools import partial

def objective_lgb(space, train_data, valid_data, X_test, y_test, X_oot, y_oot):
    """Objective function for LightGBM optimization.

    Trains one booster with the sampled hyperparameters and scores it on the
    test and out-of-time sets.

    Args:
        space: Mapping of sampled values with keys 'num_leaves',
            'learning_rate', 'feature_fraction', 'bagging_fraction',
            'bagging_freq', 'min_child_samples' and 'max_depth'.
        train_data: Training set passed to ``lgb.train``.
        valid_data: Validation set monitored by the early-stopping callback.
        X_test, y_test: Features / labels used for the optimised metric.
        X_oot, y_oot: Out-of-time features / labels, reported only.

    Returns:
        dict with keys 'loss' (negative test AUC), 'status', 'model',
        'test_auc', 'oot_auc', 'params' and 'model_type'. If anything raises
        during training or scoring, the error is printed and a result with
        loss 0, model None and both AUCs 0 is returned instead.

    NOTE(review): the loss is computed on the test set, so hyperparameters
    are selected on the same data later reported as test performance.
    """
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        # quniform samples arrive as floats; LightGBM needs integers here.
        'num_leaves': int(space['num_leaves']),
        'learning_rate': space['learning_rate'],
        'feature_fraction': space['feature_fraction'],
        'bagging_fraction': space['bagging_fraction'],
        'bagging_freq': int(space['bagging_freq']),
        'min_child_samples': int(space['min_child_samples']),
        'max_depth': int(space['max_depth']),
        # The same Dataset object is reused by every trial while
        # min_child_samples changes between trials. With the default
        # feature_pre_filter=True LightGBM refuses to lower that value on an
        # already-constructed Dataset, and the except-branch below would turn
        # the error into a silent loss-0 trial.
        'feature_pre_filter': False,
        'verbose': -1,
        'random_state': 2024
    }

    try:
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[train_data, valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=100)
            ]
        )

        pred_test = model.predict(X_test)
        pred_oot = model.predict(X_oot)

        test_auc = roc_auc_score(y_test, pred_test)
        oot_auc = roc_auc_score(y_oot, pred_oot)

        return {
            # Hyperopt minimises the loss, hence the sign flip.
            'loss': -test_auc,
            'status': STATUS_OK,
            'model': model,
            'test_auc': test_auc,
            'oot_auc': oot_auc,
            'params': params,
            'model_type': 'lgb'
        }
    except Exception as e:
        # Best-effort: a failed trial is reported with loss 0, which is worse
        # than any successful trial (those have a negative loss).
        print(f"Error in LightGBM training: {str(e)}")
        return {'loss': 0, 'status': STATUS_OK, 'model': None, 'test_auc': 0, 'oot_auc': 0, 'params': params, 'model_type': 'lgb'}

def objective_xgb(space, X_train, y_train, X_valid, y_valid, X_test, y_test, X_oot, y_oot):
    """Hyperopt objective: fit one XGBoost classifier and score it.

    The sampled values in ``space`` are turned into constructor arguments
    (integer-valued ones are cast with int()). The classifier is fitted on
    the training data with the train and validation pairs passed as
    ``eval_set``, then scored by ROC AUC on the test and out-of-time sets.

    Returns a dict with keys 'loss' (negative test AUC), 'status', 'model',
    'test_auc', 'oot_auc', 'params' and 'model_type'. If fitting or scoring
    raises, the error is printed and a result with loss 0, model None and
    both AUCs 0 is returned.
    """
    params = dict(objective='binary:logistic', eval_metric='auc')
    params['max_depth'] = int(space['max_depth'])
    params['learning_rate'] = space['learning_rate']
    params['subsample'] = space['subsample']
    params['colsample_bytree'] = space['colsample_bytree']
    params['min_child_weight'] = int(space['min_child_weight'])
    params['n_estimators'] = int(space['n_estimators'])
    params['random_state'] = 2024
    # Prevent warning about label encoder
    params['use_label_encoder'] = False

    try:
        classifier = xgb.XGBClassifier(**params)

        # Train and validation pairs are both handed over for evaluation.
        monitored = [(X_train, y_train), (X_valid, y_valid)]
        classifier.fit(X_train, y_train, eval_set=monitored, verbose=False)

        # Column 1 of predict_proba is used as the score for each row.
        test_scores = classifier.predict_proba(X_test)[:, 1]
        oot_scores = classifier.predict_proba(X_oot)[:, 1]

        test_auc = roc_auc_score(y_test, test_scores)
        oot_auc = roc_auc_score(y_oot, oot_scores)

        return dict(
            loss=-test_auc,
            status=STATUS_OK,
            model=classifier,
            test_auc=test_auc,
            oot_auc=oot_auc,
            params=params,
            model_type='xgb',
        )
    except Exception as exc:
        print(f"Error in XGBoost training: {str(exc)}")
        return dict(
            loss=0,
            status=STATUS_OK,
            model=None,
            test_auc=0,
            oot_auc=0,
            params=params,
            model_type='xgb',
        )

def objective_rf(space, X_train, y_train, X_valid, y_valid, X_test, y_test, X_oot, y_oot):
    """Hyperopt objective: fit one random forest and score it.

    Integer-valued entries of ``space`` are cast with int(); a 'max_depth'
    of None is passed through unchanged. The forest is fitted on the
    training data and scored by ROC AUC on the test and out-of-time sets.
    X_valid / y_valid are accepted but not used by this function.

    Returns a dict with keys 'loss' (negative test AUC), 'status', 'model',
    'test_auc', 'oot_auc', 'params' and 'model_type'. If fitting or scoring
    raises, the error is printed and a result with loss 0, model None and
    both AUCs 0 is returned.
    """
    params = dict(
        n_estimators=int(space['n_estimators']),
        max_depth=None if space['max_depth'] is None else int(space['max_depth']),
        min_samples_split=int(space['min_samples_split']),
        min_samples_leaf=int(space['min_samples_leaf']),
        max_features=space['max_features'],
        random_state=2024,
        n_jobs=-1,
    )

    try:
        forest = RandomForestClassifier(**params)
        forest.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the score for each row.
        test_scores, oot_scores = [
            forest.predict_proba(frame)[:, 1] for frame in (X_test, X_oot)
        ]
        test_auc = roc_auc_score(y_test, test_scores)
        oot_auc = roc_auc_score(y_oot, oot_scores)

        return dict(
            loss=-test_auc,
            status=STATUS_OK,
            model=forest,
            test_auc=test_auc,
            oot_auc=oot_auc,
            params=params,
            model_type='rf',
        )
    except Exception as exc:
        print(f"Error in Random Forest training: {str(exc)}")
        return dict(
            loss=0,
            status=STATUS_OK,
            model=None,
            test_auc=0,
            oot_auc=0,
            params=params,
            model_type='rf',
        )

def _fold_xy(mds, fold, features, target='account_type'):
    """Return the (feature frame, target series) for rows whose X_fold equals ``fold``."""
    rows = mds['X_fold'] == fold
    return mds.loc[rows, features], mds.loc[rows, target]


def hyperopt_tuning(mds, predictive_cols, max_evals=50, random_state=2024):
    """Perform hyperparameter tuning for all models using Hyperopt.

    Runs one TPE search per model (LightGBM, XGBoost, Random Forest) and
    keeps, for each, the trial with the lowest loss.

    Args:
        mds: DataFrame holding the feature columns, the 'account_type'
            target and an 'X_fold' column with the values 'train', 'valid',
            'test' and 'oot'.
        predictive_cols: Column names to model on; an 'account_type' entry,
            if present, is dropped from the feature list.
        max_evals: Number of Hyperopt evaluations per model.
        random_state: Seed for the Hyperopt search, so repeated runs propose
            the same sequence of hyperparameters.

    Returns:
        (best_models, all_trials_dfs, comparison_df) where
        best_models maps model name -> {'model', 'params', 'test_auc',
        'oot_auc'} of the best trial, all_trials_dfs maps model name -> a
        DataFrame with one row per trial, and comparison_df has one row per
        model with the columns 'Test AUC' and 'OOT AUC'.
    """
    features = [col for col in predictive_cols if col != 'account_type']

    # Prepare datasets
    X_train, y_train = _fold_xy(mds, 'train', features)
    X_valid, y_valid = _fold_xy(mds, 'valid', features)
    X_test, y_test = _fold_xy(mds, 'test', features)
    X_oot, y_oot = _fold_xy(mds, 'oot', features)

    # Create LightGBM datasets
    # The Dataset is built once and shared by all LightGBM trials while
    # min_child_samples varies between them; feature_pre_filter must be off
    # for that, otherwise LightGBM rejects trials that lower the value.
    train_data = lgb.Dataset(X_train, label=y_train, params={'feature_pre_filter': False})
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    # Define search spaces for each model
    space_lgb = {
        'num_leaves': hp.quniform('num_leaves', 15, 127, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
        'feature_fraction': hp.uniform('feature_fraction', 0.6, 0.9),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 0.9),
        'bagging_freq': hp.quniform('bagging_freq', 2, 10, 1),
        'min_child_samples': hp.quniform('min_child_samples', 10, 150, 1),
        'max_depth': hp.quniform('max_depth', 3, 12, 1)
    }

    space_xgb = {
        'max_depth': hp.quniform('max_depth', 3, 12, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
        'subsample': hp.uniform('subsample', 0.6, 1.0),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
        'min_child_weight': hp.quniform('min_child_weight', 1, 7, 1),
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 50)
    }

    space_rf = {
        'n_estimators': hp.quniform('n_estimators', 100, 500, 50),
        'max_depth': hp.choice('max_depth', [None] + list(range(10, 31, 2))),
        'min_samples_split': hp.quniform('min_samples_split', 2, 20, 1),
        'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
        'max_features': hp.choice('max_features', ['sqrt', 'log2'])
    }

    # Arguments shared by the two scikit-learn style objectives.
    sklearn_style_data = dict(
        X_train=X_train, y_train=y_train,
        X_valid=X_valid, y_valid=y_valid,
        X_test=X_test, y_test=y_test,
        X_oot=X_oot, y_oot=y_oot,
    )

    models_config = [
        {
            'name': 'LightGBM',
            'objective': partial(
                objective_lgb,
                train_data=train_data,
                valid_data=valid_data,
                X_test=X_test,
                y_test=y_test,
                X_oot=X_oot,
                y_oot=y_oot
            ),
            'space': space_lgb
        },
        {
            'name': 'XGBoost',
            'objective': partial(objective_xgb, **sklearn_style_data),
            'space': space_xgb
        },
        {
            'name': 'Random Forest',
            'objective': partial(objective_rf, **sklearn_style_data),
            'space': space_rf
        }
    ]

    best_models = {}
    all_trials_dfs = {}

    for model_config in models_config:
        print(f"\nOptimizing {model_config['name']}...")
        start_time = time.time()

        trials = Trials()
        # random_state was accepted but never used before, which left the
        # search unseeded. Recent Hyperopt releases expect a
        # numpy.random.Generator for rstate.
        fmin(
            fn=model_config['objective'],
            space=model_config['space'],
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
            rstate=np.random.default_rng(random_state)
        )

        print(f"{model_config['name']} optimization completed in {(time.time() - start_time)/60:.2f} minutes")

        # Get best trial (lowest loss, i.e. highest test AUC)
        best_trial = min(trials.trials, key=lambda x: x['result']['loss'])
        best_models[model_config['name']] = {
            'model': best_trial['result']['model'],
            'params': best_trial['result']['params'],
            'test_auc': best_trial['result']['test_auc'],
            'oot_auc': best_trial['result']['oot_auc']
        }

        # Convert trials to DataFrame
        # misc['vals'] holds Hyperopt's raw samples: every value is wrapped
        # in a list, and hp.choice parameters are stored as the index of the
        # chosen option, not the option itself.
        trials_df = pd.DataFrame([
            {
                **trial['misc']['vals'],
                'test_auc': trial['result']['test_auc'],
                'oot_auc': trial['result']['oot_auc']
            }
            for trial in trials.trials
        ])

        # Flatten lists in DataFrame (skipped when no trial was recorded)
        if not trials_df.empty:
            for col in trials_df.columns:
                if isinstance(trials_df[col].iloc[0], list):
                    trials_df[col] = trials_df[col].apply(lambda x: x[0])

        all_trials_dfs[model_config['name']] = trials_df

    # Compare models
    print("\nModel Comparison:")
    comparison_df = pd.DataFrame({
        model_name: {
            'Test AUC': info['test_auc'],
            'OOT AUC': info['oot_auc']
        }
        for model_name, info in best_models.items()
    }).T
    print(comparison_df)

    # Plot model comparison
    # DataFrame.plot opens its own figure unless an Axes is supplied, so the
    # earlier plt.figure(...) call left an empty extra figure behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    comparison_df.plot(kind='bar', ax=ax)
    ax.set_title('Model Performance Comparison')
    ax.set_ylabel('AUC Score')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

    return best_models, all_trials_dfs, comparison_df

def save_results(best_models, all_trials_dfs, comparison_df, filename_prefix='model_comparison'):
    """Write the tuned models, their parameters and the trial tables to disk.

    Every output path starts with ``filename_prefix``. Entries of
    ``best_models`` whose 'model' is None are skipped when saving models,
    but their parameters are still written to the JSON file.
    """
    # Models: LightGBM goes through its own save_model, everything else
    # through joblib.
    for name, info in best_models.items():
        fitted = info['model']
        if fitted is None:
            continue
        stem = f"{filename_prefix}_{name.lower()}_model"
        if name == 'LightGBM':
            fitted.save_model(f"{stem}.txt")
            continue
        # For XGBoost and Random Forest, use joblib
        import joblib
        joblib.dump(fitted, f"{stem}.joblib")

    # Best parameters of every model in a single JSON document.
    best_params = {name: info['params'] for name, info in best_models.items()}
    with open(f"{filename_prefix}_best_params.json", 'w') as handle:
        json.dump(best_params, handle, indent=4)

    # One CSV of trial results per model.
    for name, frame in all_trials_dfs.items():
        frame.to_csv(f"{filename_prefix}_{name.lower()}_trials.csv", index=False)

    # Model comparison table (index kept).
    comparison_df.to_csv(f"{filename_prefix}_comparison.csv")

    print(f"Results saved with prefix: {filename_prefix}")

# Main execution
# Runs the full tuning pipeline on the `mds` frame and `predictive_cols` list
# prepared by the earlier cells, then writes models, metrics and plots to the
# working directory.
if __name__ == "__main__":
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    def plot_feature_importance(model, features, model_name, top_n=20):
        """Save a bar chart of the top_n most important features of a model.

        LightGBM models are read through feature_importance(importance_type='gain');
        every other model through its feature_importances_ attribute. The
        figure is written to 'feature_importance_<model name lower-cased>.png'
        and then closed.
        """
        plt.figure(figsize=(12, 6))

        if model_name == 'LightGBM':
            importance = model.feature_importance(importance_type='gain')
        else:  # XGBoost and Random Forest
            importance = model.feature_importances_

        feat_imp = (
            pd.DataFrame({'feature': features, 'importance': importance})
            .sort_values('importance', ascending=False)
        )

        sns.barplot(x='importance', y='feature', data=feat_imp.head(top_n))
        plt.title(f'Top {top_n} Feature Importance - {model_name}')
        plt.tight_layout()
        plt.savefig(f'feature_importance_{model_name.lower()}.png')
        plt.close()

    def evaluate_model(model, X, y, model_name, dataset_name):
        """Print and return classification metrics of a model on one dataset.

        LightGBM models are scored with predict(X); every other model with
        column 1 of predict_proba(X). Scores above 0.5 count as class 1 for
        the threshold-based metrics. Returns a dict of plain floats keyed by
        metric name.
        """
        if model_name == 'LightGBM':
            pred_proba = model.predict(X)
        else:
            pred_proba = model.predict_proba(X)[:, 1]

        pred_binary = (pred_proba > 0.5).astype(int)

        # float() keeps the values plain Python numbers for the JSON dump below.
        metrics = {
            'Accuracy': float(accuracy_score(y, pred_binary)),
            'Precision': float(precision_score(y, pred_binary)),
            'Recall': float(recall_score(y, pred_binary)),
            'F1 Score': float(f1_score(y, pred_binary)),
            'ROC AUC': float(roc_auc_score(y, pred_proba))
        }

        print(f"\n{model_name} - {dataset_name} Results:")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value:.4f}")

        return metrics

    try:
        # `mds` and `predictive_cols` are expected to exist already (they are
        # built by the data-preparation cells above); nothing is loaded here.
        print("Loading dataset...")

        # Set parameters for the hyperparameter optimization
        max_evals = 50  # Number of trials for each model
        random_state = 2024

        print(f"\nStarting hyperparameter optimization with {max_evals} evaluations per model...")

        # Run hyperparameter tuning for all models
        best_models, all_trials_dfs, comparison_df = hyperopt_tuning(
            mds=mds,  # Your dataset
            predictive_cols=predictive_cols,  # Your feature columns
            max_evals=max_evals,
            random_state=random_state
        )

        # Save all results
        save_results(best_models, all_trials_dfs, comparison_df)

        # Plot feature importance for each model
        features = [col for col in predictive_cols if col != 'account_type']
        for model_name, model_info in best_models.items():
            if model_info['model'] is not None:
                plot_feature_importance(model_info['model'], features, model_name)

        # Create detailed evaluation for each model
        # The per-fold frames only exist as locals inside hyperopt_tuning, so
        # they are rebuilt here from `mds`; referring to X_train, y_train, ...
        # directly raised a NameError.
        datasets = {}
        for dataset_name, fold in (('Train', 'train'), ('Valid', 'valid'),
                                   ('Test', 'test'), ('OOT', 'oot')):
            fold_rows = mds['X_fold'] == fold
            datasets[dataset_name] = (
                mds.loc[fold_rows, features],
                mds.loc[fold_rows, 'account_type'],
            )

        all_metrics = {}
        for model_name, model_info in best_models.items():
            if model_info['model'] is not None:
                model_metrics = {}
                for dataset_name, (X, y) in datasets.items():
                    model_metrics[dataset_name] = evaluate_model(
                        model_info['model'],
                        X, y,
                        model_name,
                        dataset_name
                    )
                all_metrics[model_name] = model_metrics

        # Save detailed metrics
        with open('detailed_metrics.json', 'w') as f:
            json.dump(all_metrics, f, indent=4)

        # Plot learning curves from trials (5-trial rolling mean of test AUC)
        plt.figure(figsize=(12, 6))
        for model_name, trials_df in all_trials_dfs.items():
            plt.plot(trials_df['test_auc'].rolling(window=5).mean(),
                    label=f'{model_name} (Test AUC)')
        plt.title('Learning Curves - Test AUC (Rolling Mean)')
        plt.xlabel('Trial')
        plt.ylabel('AUC Score')
        plt.legend()
        plt.tight_layout()
        plt.savefig('learning_curves.png')
        plt.close()

        # Print final model comparison
        print("\nFinal Model Comparison:")
        print(comparison_df)

        # Identify best overall model
        best_model_name = comparison_df['Test AUC'].idxmax()
        print(f"\nBest performing model: {best_model_name}")
        print(f"Best model parameters:")
        print(json.dumps(best_models[best_model_name]['params'], indent=2))

        print("\nOptimization pipeline completed successfully!")
        print("Results have been saved to files with detailed metrics and visualizations.")

    except Exception as e:
        # Report the failure and re-raise so the cell still stops with the error.
        print(f"Error in main execution: {str(e)}")
        raise