In [19]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model

### Define a function to extract predictions from the model

In [20]:
def predict_bot(df, model=None):
    """
    Predict whether each account is a bot (1) or human (0).

    Args:
        df: feature table; it is handed to ``model.predict`` unchanged and
            its index is reused for the result.
        model: object exposing ``predict``. When omitted (None), a model is
            obtained by calling ``train_model()`` with no arguments.

    Returns:
        pandas Series of predictions, indexed like ``df``.
    """
    classifier = train_model() if model is None else model
    return pd.Series(classifier.predict(df), index=df.index)

### Define a function to evaluate model error

In [21]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Computes confusion matrix and common error rates for binary classification.

    Assumes labels:
      0 = negative class
      1 = positive class

    Args:
      y_true: iterable of actual labels.
      y_pred: iterable of predicted labels. Elements are paired with
        ``y_true`` by iteration order (for pandas Series that means by
        position, not by index label).

    Returns:
      dict with:
        tp, tn, fp, fn
        misclassification_rate   (fp + fn) / total
        false_positive_rate      fp / (fp + tn)
        false_negative_rate      fn / (fn + tp)
      A rate whose denominator is zero is reported as 0.0.

    Raises:
      ValueError: if a label is not 0 or 1, or if ``y_true`` and ``y_pred``
        do not contain the same number of elements.
    """
    from itertools import zip_longest

    # Unique sentinel so a length mismatch is detected instead of being
    # silently truncated (plain zip stops at the shorter input).
    missing = object()

    tn = fp = fn = tp = 0

    for yt, yp in zip_longest(y_true, y_pred, fillvalue=missing):
        if yt is missing or yp is missing:
            raise ValueError("y_true and y_pred must have the same length")

        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp
    actual_negatives = fp + tn
    actual_positives = fn + tp

    # Guard every division: an empty input or a single-class input would
    # otherwise divide by zero.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / actual_negatives if actual_negatives > 0 else 0.0
    false_negative_rate = fn / actual_positives if actual_positives > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [22]:
# CSV locations of the two splits, relative to the notebook's working directory.
TRAIN_PATH = "mod02_data/train.csv"
TEST_PATH = "mod02_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [23]:
# For each split, pull out the "is_bot" label column and keep every
# remaining column as the feature table.
y_train = train["is_bot"]
X_train = train.drop(columns=["is_bot"])

y_test = test["is_bot"]
X_test = test.drop(columns=["is_bot"])

### Build the model on training data

In [24]:
# Only the training split is passed in; the test split is held out for evaluation below.
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [25]:
# Score both splits with the same model so their error rates can be compared.
y_pred_train = predict_bot(X_train, model)
y_pred_test = predict_bot(X_test, model)

### Check results on the training set (data used to build the model)

In [26]:
# Error rates on the split the model was trained on.
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 106,
 'tn': 2595,
 'fp': 42,
 'fn': 257,
 'misclassification_rate': 0.09966666666666667,
 'false_positive_rate': 0.015927189988623434,
 'false_negative_rate': 0.7079889807162535}

### Check results on the test set (new data not yet seen by the model)

In [27]:
# Error rates on the held-out test split.
confusion_matrix_and_metrics(y_test, y_pred_test)

{'tp': 31,
 'tn': 860,
 'fp': 14,
 'fn': 95,
 'misclassification_rate': 0.109,
 'false_positive_rate': 0.016018306636155607,
 'false_negative_rate': 0.753968253968254}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

My confidence is initially solid because of the 10.9% misclassification rate, which corresponds to 89.1% accuracy. With that being said, the 75.4% false negative rate definitely shakes my confidence: the model caught only 31 of the 126 bots in the test set. Overall, I don't have much confidence in the model's ability to actually detect bots when they are present.

### What are potential ramifications of false positives from the model?

False positives incorrectly label real human users as bots, which can lead to those users being banned or having their accounts restricted, negatively impacting their experience. There were 14 false positives in the test set, meaning about 1.6 percent of human users would be incorrectly flagged.

### What are potential ramifications of false negatives from the model?

Missing actual bots is a major problem: once bots get in undetected, they can spread misinformation, spam users, and carry out other malicious activity. If this model were actually deployed, the platform would remain vulnerable to bot attacks.