In [None]:
# General 
import pandas as pd, numpy as np, requests, gc as gc, time as time, warnings
from collections import Counter

# Visualization
import matplotlib.pyplot as plt, seaborn as sns, plotly, plotly.express as px

# Other utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import silhouette_samples, silhouette_score, classification_report, balanced_accuracy_score, f1_score

# Clustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import umap

# Sampling
from imblearn.over_sampling import ADASYN, SVMSMOTE
from imblearn.under_sampling import TomekLinks

# Modeling
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# Optimizers
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Feature Importances
import shap
shap.initjs()

# Other settings
# Make sure automatic garbage collection is on (the benchmark helpers further
# down additionally call gc.collect() by hand after each model).
gc.enable()
# NOTE(review): this silences every warning category for the whole kernel,
# including convergence and deprecation warnings raised by the models below.
warnings.filterwarnings('ignore')

In [None]:
# Import the data: features come from a local CSV export, the label lookup
# table comes from the labels HTTP endpoint.
df = pd.read_csv('dataset.csv')

# NOTE(review): the URL is plain HTTP and its last path segment looks like an
# access token -- consider moving it into configuration / an environment variable.
labels_response = requests.get('http://45.33.127.106:5000/site/labels/id00wund3rW1thML',
                               timeout=30)
# Fail loudly on a 4xx/5xx reply instead of building a DataFrame from an error body.
labels_response.raise_for_status()
labels = pd.DataFrame(labels_response.json())

In [None]:
# Peek at the first five raw rows
df.head(5)

In [None]:
# Set up the data how we want it

# Attach the human-readable label to every row (left join keeps unlabeled rows).
merged_df = pd.merge(df, labels, how="left", left_on='label_id_x', right_on='id')

# Keep only rows whose label mentions '_bot'. Rows left without a label by the
# left join carry NaN, which `'_bot' in x` cannot handle; na=False drops them.
bot_mask = merged_df['label'].str.contains('_bot', regex=False, na=False)
bot_df = merged_df.loc[bot_mask]

# Columns that are identifiers / bookkeeping rather than model features:
# everything with '_x' in its name plus the explicit list below.
non_feature_columns = [col for col in bot_df.columns if '_x' in col] + \
                      ['Unnamed: 0', 'name', 'id', 'id_y',
                       'label_jagex_y', 'label_id_y', 'timestamp',
                       'created_at', 'updated_at', 'ts_date_y']

# Target vector, index-aligned with final_df.
classes = bot_df['label']

# Build the feature frame as an independent copy instead of dropping in place
# on a slice of bot_df (which triggers SettingWithCopyWarning).
final_df = bot_df.loc[:, ~bot_df.columns.isin(non_feature_columns + ['label'])].copy()

In [None]:
# Let's visualize some data

# Set some preliminary settings
sns.set_style("darkgrid")

for col in final_df.columns:
    # One figure per feature: a single plt.figure() up front only sizes the
    # first plot, because plt.show() closes it and later plots fall back to
    # the default figure size.
    fig, ax = plt.subplots(figsize=(15, 8))
    sns.histplot(data=final_df, x=col, kde=True,
                 stat='probability', bins=3, ax=ax)
    ax.set(title='{} Probability Distribution'.format(str(col)))
    plt.show()
    # Release the figure so a wide frame does not accumulate open figures.
    plt.close(fig)

In [None]:
# Let's see how the data visualizes naturally

# Seeded so the embedding (and the plot built from it) is reproducible.
umapper = umap.UMAP(random_state=30)
scaled_features = StandardScaler().fit_transform(final_df)

# Keep final_df's index so rows stay aligned with `classes` when both are
# handed to a plotting call.
umap_df = pd.DataFrame(data=umapper.fit_transform(scaled_features),
                       columns=['UMAP-1', 'UMAP-2'],
                       index=final_df.index)

In [None]:
# Let's visualize the data as represented by UMAP
fig, ax = plt.subplots(figsize=(25, 15))
# Pass the labels positionally (plain array): a Series is matched to the data
# by index, and `classes` keeps the row labels of the filtered source frame.
sns.scatterplot(data=umap_df, x='UMAP-1', y='UMAP-2',
                hue=classes.to_numpy(), palette="dark", ax=ax)

In [None]:
# We can do the same thing with PCA

# We can estimate the # of principal components needed
# (n_components='mle' lets PCA choose the dimensionality itself)
estimate_pca = PCA(n_components='mle').fit_transform(final_df)

# Let's also grab a 2D plot
# Fit the 2-D projection once and reuse it; seeded in case a randomized SVD
# solver gets selected for a large frame.
plot_pca = PCA(n_components=2, random_state=30).fit_transform(final_df)

# Keep final_df's index so rows stay aligned with `classes`.
pca_df = pd.DataFrame(data=plot_pca,
                      columns=['PCA-1', 'PCA-2'],
                      index=final_df.index)

In [None]:
# Compare the dimensionality PCA picked with the original feature count
n_mle_components = estimate_pca.shape[1]
n_all_components = final_df.shape[1]
print("Optimal # of components: {}".format(n_mle_components))
print("All components: {}".format(n_all_components))

In [None]:
# Visualize data in 2D using PCA
fig, ax = plt.subplots(figsize=(25, 15))
# Pass the labels positionally (plain array): a Series is matched to the data
# by index, and `classes` keeps the row labels of the filtered source frame.
sns.scatterplot(data=pca_df, x='PCA-1', y='PCA-2',
                hue=classes.to_numpy(), palette="dark", ax=ax)

In [None]:
# Function to optimize KMeans algorithm
def optimize_kmeans(data, labels):
    """Score KMeans on ``data`` for a range of cluster counts.

    Cluster counts run from 2 up to, but not including, the number of distinct
    values in ``labels``.

    Parameters
    ----------
    data : feature matrix to cluster and to score.
    labels : iterable of class labels; only its number of distinct values is used.

    Returns
    -------
    tuple of (list, list)
        The cluster counts tried and the silhouette score obtained for each,
        in matching order. Both lists are empty when ``labels`` has two or
        fewer distinct values.
    """
    candidate_counts = list(range(2, len(set(labels))))
    scores = []
    for cluster_count in candidate_counts:
        kmeans = KMeans(n_clusters=cluster_count, random_state=30)
        # Cluster and score the matrix that was passed in, not the
        # notebook-level final_df.
        scores.append(silhouette_score(data, kmeans.fit_predict(data)))

    return candidate_counts, scores

In [None]:
# Sweep KMeans cluster counts over the feature frame and collect silhouette scores
ks, sil_scores = optimize_kmeans(data=final_df, labels=classes)

In [None]:
# Plot silhouette score against cluster count to spot the best k
plt.figure(figsize=(20,10))
silhouette_ax = sns.lineplot(x=ks, y=sil_scores)
silhouette_ax.set(title="KMeans Silhouette Coefficient Analysis")

In [None]:
# Declare a list of models to test
# Every estimator that takes a seed gets the same one used for the data splits,
# so the benchmark tables are reproducible across kernel restarts. The naive
# Bayes models are deterministic and take no seed.
classifiers = [
    LogisticRegression(random_state=30),
    RidgeClassifier(random_state=30),
    Perceptron(random_state=30),
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
    MLPClassifier(random_state=30),
    RandomForestClassifier(random_state=30),
    AdaBoostClassifier(random_state=30),
    ExtraTreesClassifier(random_state=30),
    DecisionTreeClassifier(random_state=30),
    ExtraTreeClassifier(random_state=30),
    XGBClassifier(random_state=30),
    LGBMClassifier(random_state=30),
    SVC(random_state=30),
]

In [None]:
# First, let's run all the classifiers without doing any sampling
def run_classification(data, classes, classifiers):
    """Benchmark every classifier on one stratified 80/20 split of ``data``.

    Parameters
    ----------
    data : feature matrix accepted by ``train_test_split`` and the estimators.
    classes : labels aligned with ``data``; also used to stratify the split.
    classifiers : iterable of unfitted estimators exposing ``fit``/``predict``.
        Each one is fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns ``Classifier`` (class name),
        ``Accuracy`` (balanced accuracy) and ``F1`` (support-weighted F1).
    """
    # Model families that get scaled inputs. MultinomialNB refuses negative
    # features at fit time, so it shares the [0, 1] scaling of the neural models.
    minmax_scaled = ('MLPClassifier', 'Perceptron', 'MultinomialNB')
    standardized = ('LogisticRegression', 'RidgeClassifier')

    # Metrics to report on
    clfs = []
    bal_acc = []
    bal_f1 = []

    X_train, X_test, y_train, y_test = train_test_split(data, classes, stratify=classes, random_state=30,
                                                       test_size=0.2)

    # Now test each classifier
    for clf in classifiers:
        # Get the name of the classifier
        name = clf.__class__.__name__

        # Record the start time for processing
        start = time.time()

        # Pick the scaler (if any) for this model family
        if name in minmax_scaled:
            scaler = MinMaxScaler()
        elif name in standardized:
            scaler = StandardScaler()
        else:
            scaler = None

        # Scale into per-model copies: X_train / X_test themselves must stay
        # untouched, otherwise one model's scaling leaks into every later model.
        if scaler is None:
            fit_features, eval_features = X_train, X_test
        else:
            fit_features = scaler.fit_transform(X_train)
            # Reuse the training statistics; re-fitting on the test rows would
            # put train and test on different scales.
            eval_features = scaler.transform(X_test)

        # Train the model and get predictions
        clf.fit(fit_features, y_train)
        preds = clf.predict(eval_features)

        # Compute some metrics
        balanced_accuracy = balanced_accuracy_score(y_test, preds)
        # average='weighted': per-class F1 weighted by class support
        balanced_f1 = f1_score(y_test, preds, average='weighted')
        train_time = round(time.time() - start, 3)

        # Create a pretty print string
        print_str = """
        Report for: {} , in {} seconds.
        
        Balanced Accuracy: {}
        Balanced F1 Score: {}
        
        {}
        
        """.format(name, train_time, balanced_accuracy,
                   balanced_f1, classification_report(y_test, preds))

        # Update the end user
        print(print_str)

        # Update metrics
        clfs.append(name)
        bal_acc.append(balanced_accuracy)
        bal_f1.append(balanced_f1)

        # Free up some memory held by the scaled copies of this iteration
        del fit_features, eval_features
        gc.collect()

    # Return an analyzeable df
    return pd.DataFrame({'Classifier': clfs, 'Accuracy': bal_acc, 'F1': bal_f1})

In [None]:
# Set up a second function for some imbalanced classification
def run_imbalanced_classification(X_train, X_test, y_train, y_test, classifiers):
    """Benchmark every classifier on a caller-supplied train/test split.

    Parameters
    ----------
    X_train, y_train : training features and labels (e.g. after resampling).
    X_test, y_test : held-out features and labels used for scoring.
    classifiers : iterable of unfitted estimators exposing ``fit``/``predict``.
        Each one is fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns ``Classifier`` (class name),
        ``Accuracy`` (balanced accuracy) and ``F1`` (support-weighted F1).
        Empty (columns only) when ``classifiers`` is empty.
    """
    # Model families that get scaled inputs. MultinomialNB refuses negative
    # features at fit time, so it shares the [0, 1] scaling of the neural models.
    minmax_scaled = ('MLPClassifier', 'Perceptron', 'MultinomialNB')
    standardized = ('LogisticRegression', 'RidgeClassifier')

    # Metrics to report on
    clfs = []
    bal_acc = []
    bal_f1 = []

    # Now test each classifier
    for clf in classifiers:
        # Get the name of the classifier
        name = clf.__class__.__name__

        # Record the start time for processing
        start = time.time()

        # Pick the scaler (if any) for this model family
        if name in minmax_scaled:
            scaler = MinMaxScaler()
        elif name in standardized:
            scaler = StandardScaler()
        else:
            scaler = None

        # Scale into per-model copies: X_train / X_test themselves must stay
        # untouched, otherwise one model's scaling leaks into every later model.
        if scaler is None:
            fit_features, eval_features = X_train, X_test
        else:
            fit_features = scaler.fit_transform(X_train)
            # Reuse the training statistics; re-fitting on the test rows would
            # put train and test on different scales.
            eval_features = scaler.transform(X_test)

        # Train the model and get predictions
        clf.fit(fit_features, y_train)
        preds = clf.predict(eval_features)

        # Compute some metrics
        balanced_accuracy = balanced_accuracy_score(y_test, preds)
        # average='weighted': per-class F1 weighted by class support
        balanced_f1 = f1_score(y_test, preds, average='weighted')
        train_time = round(time.time() - start, 3)

        # Create a pretty print string
        print_str = """
        Report for: {} , in {} seconds.
        
        Balanced Accuracy: {}
        Balanced F1 Score: {}
        
        {}
        
        """.format(name, train_time, balanced_accuracy,
                   balanced_f1, classification_report(y_test, preds))

        # Update the end user
        print(print_str)

        # Update metrics
        clfs.append(name)
        bal_acc.append(balanced_accuracy)
        bal_f1.append(balanced_f1)

        # Free up some memory held by the scaled copies of this iteration
        del fit_features, eval_features
        gc.collect()

    # Return an analyzeable df
    return pd.DataFrame({'Classifier': clfs, 'Accuracy': bal_acc, 'F1': bal_f1})

In [None]:
# Show the class distribution
# (number of rows per label; the resampling cells further down act on this imbalance)
Counter(classes)

In [None]:
# Hold out a stratified 15% of the rows as the validation set
X_tr, X_te, y_tr, y_te = train_test_split(
    final_df,
    classes,
    test_size=0.15,
    random_state=30,
    stratify=classes,
)

In [None]:
# Baseline benchmark on the training portion, without any resampling.
# NOTE(review): run_classification scores on its own internal split of X_tr,
# whereas the resampled runs further down score on X_te -- keep that in mind
# when the result tables are compared side by side.
classification_df = run_classification(data=X_tr, classes=y_tr, classifiers=classifiers)
# Best balanced accuracy first
classification_df.sort_values('Accuracy', ascending=False)

In [None]:
# Repeat the benchmark on a PCA projection that keeps as many components as
# the MLE estimate computed earlier
pca_feature_count = estimate_pca.shape[1]
X_tr_pca = PCA(n_components=pca_feature_count).fit_transform(X_tr)
classification_pca_df = run_classification(X_tr_pca, y_tr, classifiers)
# Best balanced accuracy first
classification_pca_df.sort_values('Accuracy', ascending=False)

In [None]:
# Resamplers: one cleaning step for the majority class and two over-samplers
# for every other class
tomek_link_sampler = TomekLinks(sampling_strategy='majority')
adasyn_sampler = ADASYN(
    sampling_strategy='not majority',
    n_neighbors=3,
    random_state=30,
)
svm_sampler = SVMSMOTE(
    sampling_strategy='not majority',
    k_neighbors=3,
    random_state=30,
)

In [None]:
# Clean the training data with Tomek links first ...
tomek_cleaned = tomek_link_sampler.fit_resample(X_tr, y_tr)
tl_data, tl_classes = tomek_cleaned

# ... then over-sample the cleaned data in two different ways
ada_data, ada_classes = adasyn_sampler.fit_resample(*tomek_cleaned)
svm_data, svm_classes = svm_sampler.fit_resample(*tomek_cleaned)

In [None]:
# Train on the ADASYN-resampled data, score on the untouched validation set
ada_df = run_imbalanced_classification(X_train=ada_data, X_test=X_te,
                                       y_train=ada_classes, y_test=y_te,
                                       classifiers=classifiers)
# Best balanced accuracy first
ada_df.sort_values('Accuracy', ascending=False)

In [None]:
# Train on the SVMSMOTE-resampled data, score on the untouched validation set
svm_df = run_imbalanced_classification(X_train=svm_data, X_test=X_te,
                                       y_train=svm_classes, y_test=y_te,
                                       classifiers=classifiers)
# Best balanced accuracy first
svm_df.sort_values('Accuracy', ascending=False)

In [None]:
# Stack the four result tables so the data representations can be ranked together

# Tag each table (in place) with the representation it was produced from
result_frames = [classification_df, classification_pca_df, ada_df, svm_df]
representations = ['normal', 'pca', 'ADASYN', 'SVMSMOTE']
for results_frame, representation in zip(result_frames, representations):
    results_frame['data'] = representation

# One long table; the per-table row indices are kept as they are
combined_results_df = pd.concat(result_frames)

In [None]:
# Score = plain mean of balanced accuracy and weighted F1, best first
score_inputs = combined_results_df[['Accuracy', 'F1']]
combined_results_df['Score'] = score_inputs.mean(axis=1)
combined_results_df.sort_values('Score', ascending=False)

In [None]:
# Now that we know the "best" models to use, let's train them officially

# Keep the fitted scalers: anything later fed to mlp / lrc (validation rows,
# SHAP backgrounds, new data) has to go through the same transformation.
mlp_scaler = MinMaxScaler().fit(svm_data)
lrc_scaler = StandardScaler().fit(svm_data)

# Seeded like the rest of the notebook so the trained models are reproducible.
mlp = MLPClassifier(random_state=30).fit(mlp_scaler.transform(svm_data), svm_classes)
lgb = LGBMClassifier(random_state=30).fit(svm_data, svm_classes)
rfc = RandomForestClassifier(random_state=30).fit(svm_data, svm_classes)
lrc = LogisticRegression(random_state=30).fit(lrc_scaler.transform(svm_data), svm_classes)

In [None]:
# SHAP summary for LightGBM on (up to) 100 validation rows
lgb_explained_rows = shap.sample(X_te, 100)
explainer = shap.TreeExplainer(lgb)
shap_vals = explainer.shap_values(lgb_explained_rows)
shap.summary_plot(shap_vals, feature_names=X_tr.columns,
                  class_names=lgb.classes_, max_display=15)

In [None]:
# SHAP summary for RandomForest: background drawn from the training data,
# explanations computed on (up to) 100 validation rows
rfc_background = shap.sample(svm_data, 100)
explainer = shap.TreeExplainer(rfc, rfc_background)
rfc_explained_rows = shap.sample(X_te, 100)
shap_vals = explainer.shap_values(rfc_explained_rows)
shap.summary_plot(shap_vals, feature_names=X_tr.columns,
                  class_names=rfc.classes_, max_display=15)
gc.collect()

In [None]:
# Get shap for LogisticRegression

# lrc was fitted on standardized features, so both the background sample and
# the rows being explained must be standardized with the same statistics;
# raw features would describe inputs the model never saw.
lrc_scaler = StandardScaler().fit(svm_data)
lrc_background = lrc_scaler.transform(shap.sample(svm_data, 100))
lrc_explained_rows = lrc_scaler.transform(shap.sample(X_te, 100))

explainer = shap.LinearExplainer(lrc, lrc_background)
shap_vals = explainer.shap_values(lrc_explained_rows)
shap.summary_plot(shap_vals, max_display=15, feature_names=X_tr.columns, class_names=lrc.classes_)

In [None]:
# Get shap for MLP

# mlp was fitted on min-max scaled features, so both the background sample and
# the rows being explained must be scaled with the same statistics;
# raw features would describe inputs the model never saw.
mlp_scaler = MinMaxScaler().fit(svm_data)
mlp_background = mlp_scaler.transform(shap.sample(svm_data, 50))
mlp_explained_rows = mlp_scaler.transform(shap.sample(X_te, 50))

explainer = shap.KernelExplainer(mlp.predict_proba, mlp_background)
shap_vals = explainer.shap_values(mlp_explained_rows)
shap.summary_plot(shap_vals, max_display=15, feature_names=X_tr.columns, class_names=mlp.classes_)

In [None]:
# Hotfix for an incompatibility issue between scikit-optimize and scikit-learn
# Documented https://github.com/scikit-optimize/scikit-optimize/pull/988
class FixedBayesSearchCV(BayesSearchCV):
    """BayesSearchCV variant that keeps ``fit_params`` on the instance.

    The constructor stores the skopt-specific settings itself and then calls
    the initializer of BayesSearchCV's *parent* class directly, so
    ``fit_params`` is never forwarded upwards.

    NOTE(review): whether this workaround is still needed -- and whether it
    still works -- depends on the installed scikit-optimize / scikit-learn
    versions; confirm against the environment this notebook runs in.
    """

    def __init__(self, estimator, search_spaces, optimizer_kwargs=None,
                n_iter=50, scoring=None, fit_params=None, n_jobs=1,
                n_points=1, refit=True, cv=None, verbose=0,
                pre_dispatch='2*n_jobs', random_state=None,
                error_score='raise', return_train_score=False):
        """Store the search settings and initialise the grandparent search class.

        Note that ``error_score`` defaults to 'raise' here, so a single failing
        parameter combination aborts the whole search.
        """

        # Bug fix: Added this line
        self.fit_params = fit_params

        self.search_spaces = search_spaces
        self.n_iter = n_iter
        self.n_points = n_points
        self.random_state = random_state
        self.optimizer_kwargs = optimizer_kwargs
        self._check_search_space(self.search_spaces)

        # Removed the passing of fit_params to the parent class.
        # super(BayesSearchCV, self) deliberately skips BayesSearchCV.__init__
        # and resolves to the next class in the MRO.
        super(BayesSearchCV, self).__init__(
                estimator=estimator, scoring=scoring, n_jobs=n_jobs,
                refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch,
                error_score=error_score, return_train_score=return_train_score)

    def _run_search(self, x):
        """Always raises; this class provides no ``_run_search`` implementation.

        NOTE(review): presumably defined only so the class can be instantiated
        when the base class declares ``_run_search`` abstract. If the installed
        skopt's ``fit`` goes through ``_run_search``, every search will fail
        here -- verify before relying on ``optimizer.fit``.
        """
        raise BaseException('Use newer skopt')
        
# Let's optimize some hyperparameters

# This should be done with a better search and across all models, but setting it up just for one right now
param_grid = {
    'penalty': Categorical(['l1', 'l2', 'elasticnet', 'none']),
    'fit_intercept': Categorical([True, False]),
    'C': Real(1, 10),
    'tol': Real(1e-6, 1e+1),
    'multi_class': Categorical(['auto', 'ovr', 'multinomial']),
    # Only read when penalty='elasticnet', which refuses to fit without it.
    'l1_ratio': Real(0, 1),
    #'solver': Categorical(['lbfgs', 'sag', 'saga'])
}

# LogisticRegression's default 'lbfgs' solver rejects the 'l1' and 'elasticnet'
# penalties, and FixedBayesSearchCV defaults to error_score='raise', so the
# first such draw would abort the search. 'saga' supports every penalty above.
optimizer = FixedBayesSearchCV(estimator=LogisticRegression(solver='saga'),
                          search_spaces=param_grid,
                          n_jobs=2, cv=3, verbose=1)

# Optimize the model
# Search on standardized features, matching how the final LogisticRegression
# (lrc) is trained; 'saga' also converges poorly on unscaled inputs.
# NOTE(review): the scaler is fitted on all of svm_data before the CV folds are
# cut -- wrap scaler + model in a Pipeline if fold-level isolation is required.
optimizer.fit(StandardScaler().fit_transform(svm_data), svm_classes)