In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# Note: the stock Kaggle snippet that lists every file under the input directory is not part of this cell; only `os` is imported below

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install sklearn-neuro-evolution

In [None]:
from neuro_evolution import NEATClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
import pickle

In [None]:
# Deserialize the feature container shipped with the dataset (later split into X_train / X_test).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
with open("../input/ngrams-classification/all_data.pkl", "rb") as data_file:
    data = pickle.load(data_file)

In [None]:
# Deserialize the class labels that pair with `data` (later split into Y_train / Y_test).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
with open("../input/ngrams-classification/tfidf_targets.pkl", "rb") as targets_file:
    targets = pickle.load(targets_file)

In [None]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

In [None]:
# Configure the NEAT classifier; the values are handed to the library unchanged.
# presumably evolution stops early once fitness reaches the threshold — TODO confirm
clf = NEATClassifier(
    pop_size=50,
    number_of_generations=50,
    fitness_threshold=0.90,
)

In [None]:
# Fit the NEAT classifier on the `.toarray()` form of the training features
# (assumes X_train is a SciPy sparse matrix — TODO confirm); whatever `fit`
# returns is kept as `neat_genome`.
neat_genome = clf.fit(
    X_train.toarray(),
    Y_train,
)

In [None]:
import optuna

In [None]:
from sklearn.model_selection import cross_val_score
def objective(trial):
    """Optuna objective: mean 4-fold CV score of a random forest.

    Samples ``n_estimators`` (50-1000) and ``max_depth`` (4-50) from ``trial``,
    builds a ``RandomForestClassifier`` with a fixed seed, and returns the mean
    of ``cross_val_score`` over ``X_train`` / ``Y_train``.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    max_depth = trial.suggest_int('max_depth', 4, 50)
    forest = RandomForestClassifier(
        random_state=42,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )
    fold_scores = cross_val_score(forest, X_train, Y_train, n_jobs=-1, cv=4)
    return fold_scores.mean()

In [None]:
# Maximise the objective over 100 trials.
# The sampler is seeded (TPE is Optuna's default sampler) so the sequence of
# suggested hyper-parameters is repeatable on a fresh kernel.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
# Show the hyper-parameters of the best-scoring trial, one per line.
trial = study.best_trial
for name, setting in trial.params.items():
    print(f'\t\t{name}: {setting}')

In [None]:
# Rebuild the forest from the best trial's hyper-parameters, with the same seed as in the search.
clf = RandomForestClassifier(
    random_state=42,
    **trial.params,
)

In [None]:
# Train the current classifier on the whole training split.
clf.fit(X=X_train, y=Y_train)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Display names for the seven classes, in the order the matrix rows/columns are
# assumed to follow (NOTE(review): confirm against how `targets` was encoded).
class_names = ["BHO","CeeInject","FakeRean","OnLineGames","Renos","Vobfus","Winwebsec"]

# normalize="true" divides each row by its total, so every true-class row sums to 1.
cf = confusion_matrix(Y_test, clf.predict(X_test), normalize="true")

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the explicit axes and let seaborn place the tick labels, instead of
# relying on the implicit current axes and overwriting labels afterwards.
sns.heatmap(
    cf,
    annot=True,
    cmap="Blues",
    linewidths=1,
    linecolor="black",
    square=True,
    vmax=1.0,
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
ax.set_title("Random forest: row-normalised confusion matrix (test split)")
ax.tick_params(axis="x", labelrotation=45)
ax.tick_params(axis="y", labelrotation=0)

In [None]:
# Score the fitted classifier on the held-out split (shown as the cell output).
clf.score(X=X_test, y=Y_test)

**Hyperparameter search for an XGBoost classifier**

In [None]:
import xgboost as xgb

In [None]:
def objective(trial):
    """Optuna objective: mean 4-fold CV score of a GPU-trained XGBoost classifier.

    Samples the number of trees, tree depth, minimum child weight,
    ``scale_pos_weight`` and the row/column subsampling ratios from ``trial``.
    The learning rate is not passed, so it stays at the library default.

    Returns the mean of ``cross_val_score`` over ``X_train`` / ``Y_train``.
    """
    # Lower bound is 1 (was 0): zero boosting rounds would give a model without
    # any trees, which wastes a trial on a degenerate configuration.
    n_estimators = trial.suggest_int('n_estimators', 1, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    # NOTE(review): XGBoost documents scale_pos_weight for positive/negative
    # imbalance; the plots in this notebook label seven classes, so this search
    # dimension may have no effect here — confirm before keeping it.
    scale_pos_weight = trial.suggest_int('scale_pos_weight', 1, 100)
    # NOTE(review): suggest_discrete_uniform is deprecated in recent Optuna
    # releases in favour of suggest_float(..., step=0.1); switch once the
    # installed Optuna version is known to support it.
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1)

    booster = xgb.XGBClassifier(
        random_state=42,
        tree_method='gpu_hist',
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        scale_pos_weight=scale_pos_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
    )
    # NOTE(review): n_jobs=-1 evaluates the folds in parallel while every fit
    # uses the GPU tree method — watch for GPU memory pressure.
    return cross_val_score(booster, X_train, Y_train, n_jobs=-1, cv=4).mean()
    

In [None]:
# Maximise the objective over 100 trials.
# The sampler is seeded (TPE is Optuna's default sampler) so the sequence of
# suggested hyper-parameters is repeatable on a fresh kernel.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
from sklearn.svm import SVC

In [None]:
def objective(trial):
    """Optuna objective: mean 4-fold CV score of an RBF-kernel SVC.

    Only the regularisation parameter ``C`` is searched (trial parameter
    ``svc_c``, log-uniform over [1e-10, 1e10]); ``gamma`` is fixed to "scale".
    Returns the mean of ``cross_val_score`` over ``X_train`` / ``Y_train``.
    """
    c_value = trial.suggest_loguniform('svc_c', 1e-10, 1e10)
    svm = SVC(kernel="rbf", gamma="scale", C=c_value)
    fold_scores = cross_val_score(svm, X_train, Y_train, n_jobs=-1, cv=4)
    return fold_scores.mean()

In [None]:
# Maximise the objective over 100 trials.
# The sampler is seeded (TPE is Optuna's default sampler) so the sequence of
# suggested hyper-parameters is repeatable on a fresh kernel.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

In [None]:
# Refit an RBF-kernel SVC on the training split using the best `svc_c` from the study.
trial = study.best_trial
clf = SVC(
    kernel="rbf",
    gamma="scale",
    C=trial.params["svc_c"],
)
clf.fit(X=X_train, y=Y_train)

In [None]:
# Display names for the seven classes, in the order the matrix rows/columns are
# assumed to follow (NOTE(review): confirm against how `targets` was encoded).
class_names = ["BHO","CeeInject","FakeRean","OnLineGames","Renos","Vobfus","Winwebsec"]

# normalize="true" divides each row by its total, so every true-class row sums to 1.
cf = confusion_matrix(Y_test, clf.predict(X_test), normalize="true")

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the explicit axes and let seaborn place the tick labels, instead of
# relying on the implicit current axes and overwriting labels afterwards.
sns.heatmap(
    cf,
    annot=True,
    cmap="Blues",
    linewidths=1,
    linecolor="black",
    square=True,
    vmax=1.0,
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
ax.set_title("SVC (RBF kernel): row-normalised confusion matrix (test split)")
ax.tick_params(axis="x", labelrotation=45)
ax.tick_params(axis="y", labelrotation=0)