In [1]:
import warnings
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import make_classification
from sklearn.exceptions import ConvergenceWarning
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              HistGradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import (ARDRegression, BayesianRidge,
                                  HuberRegressor, LinearRegression,
                                  LogisticRegression,
                                  PassiveAggressiveClassifier,
                                  PoissonRegressor, RANSACRegressor,
                                  RidgeClassifier, RidgeClassifierCV,
                                  SGDClassifier, TheilSenRegressor,
                                  TweedieRegressor)
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")

In [5]:
def load_input_from_file(filename: str) -> (np.ndarray, np.ndarray):
    with open(filename) as file:
        data = np.loadtxt(file, delimiter=",")
    return data[:, 1:], data[:, 0]
def generate_exmaple_data(n_samples: int=100, n_features: int=310) -> (np.ndarray, np.ndarray):
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=2, n_redundant=0,
                               n_clusters_per_class=1, random_state=1)
    X += 2 * np.random.RandomState(2).uniform(size=X.shape)
    return X, y
def dump_data(filename: str) -> None:
    import csv
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for vec, label in zip(X, y):
            writer.writerow([int(label)] + vec.tolist())

In [6]:
X, y = load_input_from_file("example_input.csv")
# X, y = generate_exmaple_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

## Classifiers

In [7]:
classifiers = {
    "Calibrated CV": CalibratedClassifierCV(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Ada Boost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Hist Gradient Boosting": HistGradientBoostingClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "Passive Aggressive": PassiveAggressiveClassifier(),
    "Ridge": RidgeClassifier(),
    "Ridge CV": RidgeClassifierCV(),
    "SGD": SGDClassifier(),
    "Naive Bayes": GaussianNB(),
    "Neural Network": MLPClassifier(alpha=1, max_iter=1000),
    "Linear SVM": SVC(kernel="linear", C=0.025),
    "RBF SVM": SVC(gamma=2, C=1),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
}

In [8]:
for classifier_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
#     predicted_results = classifier.predict(X_test)
#     print(f"{classifier_name:20}: {predicted_results}")
    score = classifier.score(X_test, y_test)
    print(f"{classifier_name:20}: {score}")

Calibrated CV       : 0.575
QDA                 : 0.45
Ada Boost           : 0.95
Bagging             : 0.95
Extra Trees         : 0.625




Gradient Boosting   : 0.95
Hist Gradient Boosting: 0.95
Random Forest       : 0.45
Gaussian Process    : 0.425
Passive Aggressive  : 0.525
Ridge               : 0.575
Ridge CV            : 0.575
SGD                 : 0.625
Naive Bayes         : 0.7
Neural Network      : 0.55
Linear SVM          : 0.6
RBF SVM             : 0.4
Decision Tree       : 0.95


## Regressors

In [19]:
regressors = {
    "Transformed Target": TransformedTargetRegressor(),
    "PLSRegression": PLSRegression(),
    "Kernel Ridge": KernelRidge(alpha=1.0),
    "ARD": ARDRegression(),
    "Bayesian Ridge": BayesianRidge(),
    "Huber": HuberRegressor(),
    "Linear": LinearRegression(),
    "Logistic": LogisticRegression(),
    "Poisson": PoissonRegressor(),
#     "RANSAC": RANSACRegressor(min_samples=50), # FIXME
    "TheilSen": TheilSenRegressor(),
    "Tweedie": TweedieRegressor(),
}

In [20]:
for regressor_name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    predicted_results = regressor.predict(X_test)
    print(f"{regressor_name:20}: {predicted_results}")
#     score = regressor.score(X_test, y_test)
#     print(f"{regressor_name:20}: {score}")

Transformed Target  : [0.66341425 0.23934944 0.53570682 0.33110733 0.48103468 0.19715645
 0.37145135 0.26449344 0.55999104 0.36981599 0.61666291 0.63171995
 0.30866133 0.62479472 0.75822589 0.73912439 0.10417565 0.20577869
 0.33516355 0.82104244 0.41407807 0.31270604 0.25629228 0.35174011
 0.63876056 0.70738042 0.29441251 0.39999353 0.04847399 0.19296669
 0.12733034 0.38051579 0.70616724 0.29637698 0.09839239 0.14945951
 0.28809482 0.568766   0.40015064 0.51159572]
PLSRegression       : [[0.62473943]
 [0.42918294]
 [0.51673229]
 [0.37733726]
 [0.392208  ]
 [0.23018411]
 [0.3820028 ]
 [0.31438127]
 [0.62985777]
 [0.34795639]
 [0.59210033]
 [0.66591635]
 [0.29782501]
 [0.597517  ]
 [0.72026421]
 [0.67679423]
 [0.17860743]
 [0.1213335 ]
 [0.38156773]
 [0.8164904 ]
 [0.42726643]
 [0.38003412]
 [0.22970572]
 [0.32443729]
 [0.64819297]
 [0.66227135]
 [0.29202046]
 [0.38834094]
 [0.08876148]
 [0.20072091]
 [0.07347027]
 [0.33867624]
 [0.72237872]
 [0.27074723]
 [0.09602695]
 [0.23069897]
 [0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Huber               : [0.63414541 0.22784261 0.56774659 0.36032646 0.52312506 0.04545756
 0.39081936 0.21334024 0.55767976 0.29991582 0.60777033 0.5958724
 0.20293289 0.59588349 0.80607216 0.77194602 0.09291947 0.19125235
 0.31843801 0.83549852 0.42347381 0.25265431 0.30570387 0.31500156
 0.62197532 0.7519069  0.32382082 0.38351755 0.04657435 0.30936647
 0.16045527 0.39845944 0.69642486 0.29969201 0.13937805 0.03445798
 0.26316481 0.6272997  0.4592655  0.41990269]
Linear              : [0.66341425 0.23934944 0.53570682 0.33110733 0.48103468 0.19715645
 0.37145135 0.26449344 0.55999104 0.36981599 0.61666291 0.63171995
 0.30866133 0.62479472 0.75822589 0.73912439 0.10417565 0.20577869
 0.33516355 0.82104244 0.41407807 0.31270604 0.25629228 0.35174011
 0.63876056 0.70738042 0.29441251 0.39999353 0.04847399 0.19296669
 0.12733034 0.38051579 0.70616724 0.29637698 0.09839239 0.14945951
 0.28809482 0.568766   0.40015064 0.51159572]
Logistic            : [1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0.