In [65]:
import csv
import pickle
import warnings
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.base import ClassifierMixin, RegressorMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              HistGradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.exceptions import ConvergenceWarning
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import (ARDRegression, BayesianRidge, HuberRegressor,
                                  LinearRegression, LogisticRegression,
                                  PassiveAggressiveClassifier,
                                  PoissonRegressor, RidgeClassifier,
                                  RidgeClassifierCV, SGDClassifier,
                                  TheilSenRegressor, TweedieRegressor)
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")

In [58]:
def load_data(filename: str) -> (np.ndarray, np.ndarray):
    df = pd.read_csv(filename, header=None)
    df[0] = df[0].apply(lambda x: int("evil" in x))
    data = df.to_numpy()
    return data[:, 1:], data[:, 0]
def dump_data(filename: str, X: np.ndarray, y: np.ndarray) -> None:
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for vec, label in zip(X, y):
            writer.writerow([int(label)] + vec.tolist())
def generate_data(n_samples: int=100, n_features: int=310) -> (np.ndarray, np.ndarray):
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=2, n_redundant=0,
                               n_clusters_per_class=1, random_state=1)
    X += 2 * np.random.RandomState(2).uniform(size=X.shape)
    return X, y
def load_model(filename: str) -> Union[ClassifierMixin, RegressorMixin]:
    with open(filename, "rb") as file:
        model = pickle.load(file)
    return model
def dump_model(model: Union[ClassifierMixin, RegressorMixin], filename: str) -> None:
    with open(filename, "wb") as file:
        pickle.dump(model, file)
def reg2clf_elbow_method(y_pred: np.ndarray) -> np.ndarray:
    indexed = np.array(list(zip(np.arange(len(y_pred)), y_pred)))
    values_sort = indexed[indexed[:,1].argsort()]
    value_diff = np.diff(values_sort[:,1])
    high_pass = values_sort[np.argmax(value_diff)][1]
    return y_pred > high_pass
def prediction_func(model: Union[ClassifierMixin, RegressorMixin], asm_vecs: np.array) -> np.array:
    predicted_results = model.predict(asm_vecs)
    if isinstance(model, RegressorMixin):
        y_pred = reg2clf_elbow_method(predicted_results)
    else:
        y_pred = predicted_results
    return np.argwhere(y_pred==True).flatten()

In [70]:
X, y = load_data("train_data/main_train_2.csv")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
X_external, y_external = load_data("train_data/tmp.csv")

## Classifiers

In [71]:
classifiers = {
    "Calibrated CV": CalibratedClassifierCV(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Ada Boost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Hist Gradient Boosting": HistGradientBoostingClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "Passive Aggressive": PassiveAggressiveClassifier(),
    "Ridge": RidgeClassifier(),
    "Ridge CV": RidgeClassifierCV(),
    "SGD": SGDClassifier(),
    "Naive Bayes": GaussianNB(),
    "Neural Network": MLPClassifier(alpha=1, max_iter=1000),
    "Linear SVM": SVC(C=0.025, kernel="linear"),
    "RBF SVM": SVC(gamma=2),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
}

In [73]:
for classifier_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_pred_external = classifier.predict(X_external)
#     print(classifier_name)
#     print(classification_report(y_test, y_pred, target_names=["False", "True"]))
#     print(classification_report(y_external, y_pred_external, target_names=["False", "True"]))
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_external = accuracy_score(y_external, y_pred_external)
    precision = precision_score(y_test, y_pred)
    precision_external = precision_score(y_external, y_pred_external)
    print(y_external)
    print(y_pred_external)
    print(f"{classifier_name:25} - {precision:5f} / {precision_external:5f} - {accuracy:5f} / {accuracy_external:5f}")

  _warn_prf(average, modifier, msg_start, len(result))


[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1.]
Calibrated CV             - 0.000000 / 0.148148 - 0.727273 / 0.148148
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 1. 1.]
QDA                       - 0.200000 / 0.000000 - 0.454545 / 0.518519
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0.
 0. 0. 1.]
Ada Boost                 - 0.333333 / 0.250000 - 0.600000 / 0.703704
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1.
 0. 1. 0.]
Bagging                   - 0.636364 / 0.157895 - 0.781818 / 0.370370


  _warn_prf(average, modifier, msg_start, len(result))


[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Extra Trees               - 0.800000 / 0.000000 - 0.836364 / 0.851852
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0.]
Gradient Boosting         - 0.666667 / 1.000000 - 0.800000 / 0.888889
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1.
 1. 1. 1.]
Hist Gradient Boosting    - 0.600000 / 0.153846 - 0.781818 / 0.518519
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Random Forest             - 0.500000 / 0.000000 - 0.727273 / 0.814815


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Gaussian Process          - 0.000000 / 0.000000 - 0.727273 / 0.851852
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Passive Aggressive        - 0.000000 / 0.000000 - 0.727273 / 0.851852
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Ridge                     - 0.000000 / 0.000000 - 0.727273 / 0.851852
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Ridge CV                  - 0.000000 / 0.000000 - 0.727273 / 0.851852
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Regressors

In [None]:
regressors = {
    "Transformed Target": TransformedTargetRegressor(),
    "PLSRegression": PLSRegression(),
    "Kernel Ridge": KernelRidge(alpha=1.0),
    "ARD": ARDRegression(),
    "Bayesian Ridge": BayesianRidge(),
    "Huber": HuberRegressor(),
    "Linear": LinearRegression(),
    "Logistic": LogisticRegression(),
    "Poisson": PoissonRegressor(),
    "TheilSen": TheilSenRegressor(),
    "Tweedie": TweedieRegressor(),
}

In [None]:
for regressor_name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    predicted_results = regressor.predict(X_test)
    y_pred = reg2clf_elbow_method(regressor.predict(X_test))
    print(regressor_name)
    print(classification_report(y_test, y_pred))

In [None]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
predicted_results = regressor.predict(X_test)
y_pred = reg2clf_elbow_method(regressor.predict(X_test))

In [None]:
y_pred

In [None]:
np.argwhere(y_pred==True).flatten()

In [None]:
x = np.arange(len(predicted_results)-1)
plt.title("Matplotlib demo")
plt.xlabel("x axis caption")
plt.ylabel("y axis caption")
plt.plot(x,np.diff(predicted_results))
plt.show()