In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_iris
import pandas as pd
from sklearn import datasets
from onepiecepredictor.OnePieceClassifier import OnePieceClassifier
from onepiecepredictor.MultiModelsClassifier import MultiModelsClassifier
from onepiecepredictor.OnePieceRegression import OnePieceRegression
from onepiecepredictor.MultiModelsRegression import MultiModelsRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# A small package for hyperparameter tuning, pipelining, and comparing the performance of multiple models.

### It's a wrapper around the sklearn, xgboost, catboost, and imblearn packages.

## Classification

### Currently Supports 7 models for classification:
   * LOGISTIC -> logistic regression, uses LogisticRegression class from sklearn package.
   * RF -> Random Forest, uses RandomForestClassifier class from sklearn package. 
   * SVM -> Support Vector Machine, uses SVC class from sklearn package.
   * KNN -> K Nearest Neighbours, uses KNeighborsClassifier class from sklearn package.
   * ADABOOST -> Adaptive boosting, uses AdaBoostClassifier class from sklearn package.
   * XGBOOST -> Uses XGBClassifier class from xgboost package.
   * CATBOOST -> Uses CatBoostClassifier from catboost package.

### Pass one of the keywords listed above to OnePieceClassifier via the model parameter to use the respective model.

## Parameter Information for OnePieceClassifier Class

* X -> array-like(supported by Sklearn). If testTrainSplit is passed, this will be split into train and test
* Y -> array-like(supported by Sklearn). If testTrainSplit is passed, this will be split into train and test
* model -> string Currently supported models: LOGISTIC,RF,SVM,KNN,ADABOOST,XGBOOST,CATBOOST
* testX -> array-like(supported by Sklearn), test data. Ignored if testTrainSplit is passed
* testY -> array-like(supported by Sklearn), test data. Ignored if testTrainSplit is passed
* testTrainSplit -> float, ratio passed will be the amount of test data.
* stratify -> bool, used to perform stratified splitting. If passed data will be split based on Y.
* hyperParams -> dictionary, Hyper parameters specific to the model passed. If passed CV is performed.
* performCV -> bool, Used when hyperParams not passed to perform plain CV.
* folds -> int, No of folds to be used for CV.
* applySmote -> bool, To apply smote to oversample the data. Pass only one of applySmote or underSample
* underSample -> bool, To randomly undersample the majority data.
* sampling -> str, Values supported by SMOTE, RandomUnderSampler classes in imblearn library.
* scoring -> str, Evaluation metric. Currently supported values: accuracy,balanced_accuracy,f1,precision,recall,roc_auc. If not passed accuracy is used.
* targetEncodeCols -> List. List of columns to target encode.
* modelParams -> dictionary, Any model specific parameters can be passed as dictionary.
* multiClass -> Pass true in case of multiclass classification.


## Methods in OnePieceClassifier class

* fit() -> For training.
* predict() -> For Predicting. Returns score and predictions.
* newDataPredict(testData) -> For getting the predictions on completely new data. Returns new predictions.

## Classification: hyperparameter tuning with cross-validation and stratified splitting.

In [2]:
# Grid of XGBoost settings to search; each gamma / max_depth combination
# is scored during cross validation.
hyperParams = dict(
    gamma=[0.25, 1],
    max_depth=[3, 4],
)

In [3]:
# Load the breast-cancer dataset and unpack its features and labels.
data = load_breast_cancer()
X, Y = data.data, data.target

In [4]:
# XGBoost classifier: 30% of the data is held out as a stratified test split,
# and passing hyperParams triggers a cross-validated search over that grid.
op = OnePieceClassifier(
    X,
    Y,
    "XGBOOST",
    testTrainSplit=0.3,
    stratify=True,
    hyperParams=hyperParams,
)

Return after splitting not smote or undersample


In [5]:
# Train the model; hyperParams was supplied, so the grid-search
# cross-validation scores are printed for each parameter combination.
op.fit()

Cross Validation Grid Search Scores
0.9547784810126583 {'gamma': 0.25, 'max_depth': 3}
0.9547468354430381 {'gamma': 0.25, 'max_depth': 4}
0.9572784810126583 {'gamma': 1, 'max_depth': 3}
0.9472151898734177 {'gamma': 1, 'max_depth': 4}


In [6]:
# predict() returns the evaluation score together with the predictions.
score, preds = op.predict()

XGBOOST accuracy 0.9824561403508771


In [7]:
# Score returned by predict(); no scoring was passed, so this is accuracy.
score

0.9824561403508771

In [8]:
# Predicted class labels returned by predict().
preds

array([1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1])

### Explicitly pass train and test data sets.

In [9]:
# Build a reproducible, stratified 70/30 split ourselves and hand the test
# portion to the classifier through testX / testY instead of testTrainSplit.
data = load_breast_cancer()
X, Y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=7
)
op = OnePieceClassifier(
    X_train,
    y_train,
    "XGBOOST",
    testX=X_test,
    testY=y_test,
    hyperParams=hyperParams,
)
op.fit()
score, preds = op.predict()

Direct return
Cross Validation Grid Search Scores
0.9547784810126583 {'gamma': 0.25, 'max_depth': 3}
0.9547468354430381 {'gamma': 0.25, 'max_depth': 4}
0.9572784810126583 {'gamma': 1, 'max_depth': 3}
0.9472151898734177 {'gamma': 1, 'max_depth': 4}
XGBOOST accuracy 0.9824561403508771


### To use any model with specific model parameters, pass a dictionary via the modelParams parameter.
### For example, to use random forest with 'criterion' set to entropy instead of the default gini:

In [10]:
# Reload the breast-cancer dataset so X and Y hold the full data again.
data = load_breast_cancer()
X, Y = data.data, data.target

In [11]:
# Fixed (non-searched) model setting: split on entropy rather than the default.
modelParams = dict(criterion='entropy')

In [12]:
# Random-forest grid to search: number of trees and maximum tree depth.
hyperParams = dict(
    n_estimators=[100, 200],
    max_depth=[2, 3],
)

In [13]:
# Random forest with the fixed modelParams, a stratified 20% test split,
# and an f1-scored cross-validated search over hyperParams.
op = OnePieceClassifier(
    X,
    Y,
    "RF",
    testTrainSplit=0.2,
    stratify=True,
    hyperParams=hyperParams,
    scoring='f1',
    modelParams=modelParams,
)

Return after splitting not smote or undersample


In [14]:
# Train the random forest; hyperParams was supplied, so each grid
# combination is cross-validated and its score printed.
op.fit()

Cross Validation Grid Search Scores
0.9584491475733954 {'max_depth': 2, 'n_estimators': 100}
0.9568034403850707 {'max_depth': 2, 'n_estimators': 200}
0.9617859967011716 {'max_depth': 3, 'n_estimators': 100}
0.9617859967011716 {'max_depth': 3, 'n_estimators': 200}


In [15]:
# The result is not unpacked here, so the (score, predictions) tuple
# is shown as the cell output.
op.predict()

RF f1 0.9583333333333334


(0.9583333333333334,
 array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
        0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
        1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        0, 1, 1, 1]))

## To compare performance of multiple classification models with cross validation

In [16]:
# Compare the supported classifiers on a stratified 30% test split, with plain
# cross validation (performCV, no hyperParams) and accuracy as the metric.
mc = MultiModelsClassifier(
    X,
    Y,
    testTrainSplit=0.3,
    stratify=True,
    scoring='accuracy',
    performCV=True,
)

In [17]:
# Evaluates each model in turn (cross-validation scores and the final score are
# printed per model) and returns a dict mapping model name to score.
results = mc.predict()

Return after splitting not smote or undersample
Direct return
Plain Cross Validation Scores
[0.9375     0.9375     0.95       0.94936709 0.92405063]
LOGISTIC accuracy 0.9532163742690059
Direct return


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Plain Cross Validation Scores
[0.9875     0.9625     0.95       0.94936709 0.97468354]
RF accuracy 0.9649122807017544
Direct return
Plain Cross Validation Scores
[0.9125     0.9        0.8875     0.88607595 0.91139241]
SVM accuracy 0.935672514619883
Direct return
Plain Cross Validation Scores
[0.8875     0.95       0.9        0.92405063 0.93670886]
KNN accuracy 0.9473684210526315
Direct return
Plain Cross Validation Scores
[0.9625     0.9375     0.9375     0.91139241 0.97468354]
ADABOOST accuracy 0.9415204678362573
Direct return
Plain Cross Validation Scores
[0.95       0.9375     0.975      0.93670886 0.97468354]
XGBOOST accuracy 0.9707602339181286
Direct return
Plain Cross Validation Scores
[0.975      0.9625     0.9625     0.97468354 0.98734177]
CATBOOST accuracy 0.9766081871345029


In [18]:
# Model name -> accuracy achieved by that model.
print(results)

{'LOGISTIC': 0.9532163742690059, 'RF': 0.9649122807017544, 'SVM': 0.935672514619883, 'KNN': 0.9473684210526315, 'ADABOOST': 0.9415204678362573, 'XGBOOST': 0.9707602339181286, 'CATBOOST': 0.9766081871345029}


### For imbalanced data, with oversampling: oversample the minority class to 40% from 10% and test on new data


In [19]:
# Synthetic two-class dataset with a 10% / 90% class weighting; the fixed
# random_state makes the generated samples reproducible.
X, Y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

In [20]:
# Logistic regression on a stratified 20% test split; applySmote oversamples
# the data using sampling = 0.4, and f1 is the evaluation metric.
op = OnePieceClassifier(
    X,
    Y,
    "LOGISTIC",
    testTrainSplit=0.2,
    applySmote=True,
    sampling=0.4,
    stratify=True,
    scoring='f1',
)

Return after smote


In [21]:
# Train the logistic-regression model; neither hyperParams nor performCV was
# passed, so no cross-validation scores are printed.
op.fit()

In [22]:
# Evaluate with f1 (the scoring passed to the constructor); the
# (score, predictions) tuple is shown as the cell output.
op.predict()

LOGISTIC f1 0.994475138121547


(0.994475138121547,
 array([0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]))

### Predict on new data

In [23]:
# NOTE(review): every argument here, including random_state=10, is identical
# to the make_classification call that produced the training data, so this
# regenerates the same samples rather than genuinely unseen ones. It also
# overwrites X and Y; only X is used by the newDataPredict call that follows.
X, Y = make_classification(n_classes=2, class_sep=2,
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

In [24]:
# newDataPredict() takes only features and returns the predictions for them.
preds = op.newDataPredict(X)

In [25]:
# Predictions returned by newDataPredict().
preds

## Regression

### Currently supports 7 models for regression:
   * LINEAR -> linear regression, uses LinearRegression class from sklearn package.
   * RF -> Random Forest, uses RandomForestRegressor class from sklearn package. 
   * SVM -> Support Vector Machine, uses SVR class from sklearn package.
   * KNN -> K Nearest Neighbours, uses KNeighborsRegressor class from sklearn package.
   * ADABOOST -> Adaptive boosting, uses AdaBoostRegressor class from sklearn package.
   * XGBOOST -> Uses XGBRegressor class from xgboost package.
   * CATBOOST -> Uses CatBoostRegressor from catboost package.

### Parameter Information for OnePieceRegression Class

* X -> array-like(supported by Sklearn). If testTrainSplit is passed, this will be split into train and test
* Y -> array-like(supported by Sklearn). If testTrainSplit is passed, this will be split into train and test
* model -> string Currently supported models: LINEAR,RF,SVM,KNN,ADABOOST,XGBOOST,CATBOOST
* testX -> array-like(supported by Sklearn), test data. Ignored if testTrainSplit is passed
* testY -> array-like(supported by Sklearn), test data. Ignored if testTrainSplit is passed
* testTrainSplit -> float, ratio passed will be the amount of test data.
* hyperParams -> dictionary, Hyper parameters specific to the model passed. If passed CV is performed.
* performCV -> bool, Used when hyperParams not passed to perform plain CV.
* folds -> int, No of folds to be used for CV.
* scoring -> str, Evaluation metric. Currently supported values: r2,neg_mean_squared_error. If not passed r2 is used.
* targetEncodeCols -> List. List of columns to target encode.
* modelParams -> dictionary, Any model specific parameters can be passed as dictionary.

## Methods in OnePieceRegression class

* fit() -> For training.
* predict() -> For Predicting. Returns score and predictions.
* newDataPredict(testData) -> For getting the predictions on completely new data. Returns new predictions.

## Regression with cross-validation.

In [26]:
# Load the Boston housing data as the regression example.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on scikit-learn < 1.2; pin the version or switch
# to another regression dataset (the recorded outputs would then change).
data = datasets.load_boston()
X = data.data
Y = data.target

In [27]:
# SVM regressor with a 10% test split and plain 3-fold cross validation
# (performCV without hyperParams).
oreg = OnePieceRegression(
    X,
    Y,
    "SVM",
    testTrainSplit=0.1,
    performCV=True,
    folds=3,
)

In [28]:
# Train the regressor; performCV = True without hyperParams runs plain
# cross validation and prints one score per fold.
oreg.fit()

Plain Cross Validation Scores
[0.15438665 0.10650229 0.26598772]


In [29]:
# predict() returns the score (r2, since no scoring was passed) and the predictions.
score, preds = oreg.predict()

SVM r2 0.34925488510037106


In [30]:
# r2 score returned by predict().
score

0.34925488510037106

In [31]:
# Predicted target values returned by predict().
preds

array([15.54673206, 24.0901247 , 23.28084315, 22.62159566, 13.16347039,
       15.40087197, 15.4833122 , 22.44323748, 23.76091211, 13.37660897,
       15.08671352, 22.13407802, 15.42715015, 20.36897878, 25.20474867,
       15.77979286, 22.97590132, 23.36234758, 20.38150564, 23.27257591,
       15.50425217, 22.75466495, 23.60611513, 22.13589269, 19.7066852 ,
       13.26357132, 24.37972782, 22.75251023, 23.59303909, 23.99226643,
       15.18731668, 22.66819372, 23.33501333, 13.38457154, 23.86017145,
       19.17809459, 20.65670891, 13.30587608, 22.68438372, 23.2121177 ,
       23.22551318, 15.14903014, 19.01176745, 20.84939127, 15.66299515,
       13.27407679, 22.28021024, 15.67954225, 16.92536172, 13.50496673,
       23.12350002])

## To compare performance of multiple regression models with cross validation

In [32]:
# Reload the Boston housing data for the multi-model comparison.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on scikit-learn < 1.2; pin the version or switch
# to another regression dataset (the recorded outputs would then change).
data = datasets.load_boston()
X = data.data
Y = data.target

In [33]:
# Compare the supported regressors on a 10% test split with plain
# cross validation enabled.
mr = MultiModelsRegression(
    X,
    Y,
    testTrainSplit=0.1,
    performCV=True,
)

In [34]:
# Evaluates each regression model in turn (cross-validation scores and the
# final r2 are printed per model) and returns a dict of model name -> score.
results = mr.predict()

Plain Cross Validation Scores
[0.60751038 0.70491158 0.65270854 0.74562459 0.80643389]
LINEAR r2 0.7411608113128136
Plain Cross Validation Scores
[0.77029406 0.88816969 0.8136425  0.89774958 0.91415022]
RF r2 0.8984647766384302
Plain Cross Validation Scores
[0.11621987 0.11101511 0.05166818 0.24898521 0.2687132 ]
SVM r2 0.34925488510037106
Plain Cross Validation Scores
[0.4230675  0.64771465 0.14544463 0.63244563 0.63013816]
KNN r2 0.6056016145431431
Plain Cross Validation Scores
[0.74122869 0.81921018 0.83943351 0.87509177 0.86914851]
ADABOOST r2 0.8872523509716893
Plain Cross Validation Scores
[0.82610214 0.86961755 0.82893475 0.89717366 0.88435955]
XGBOOST r2 0.8985230224379863
Plain Cross Validation Scores
[0.83943697 0.88791814 0.85577629 0.92789034 0.90279473]
CATBOOST r2 0.9339575147177712


In [35]:
# Model name -> r2 score achieved by that model.
results

{'LINEAR': 0.7411608113128136,
 'RF': 0.8984647766384302,
 'SVM': 0.34925488510037106,
 'KNN': 0.6056016145431431,
 'ADABOOST': 0.8872523509716893,
 'XGBOOST': 0.8985230224379863,
 'CATBOOST': 0.9339575147177712}