In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
import matplotlib.pyplot as plt
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sn
#from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import cv2
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from scipy.stats import norm
from openml import tasks, flows, runs, datasets, config
import random

task_id = 9914
task = tasks.get_task(task_id)
dataset = datasets.get_dataset(task.dataset_id)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute
)

In [3]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [6]:
model_params = {
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
                'n_estimators': randint(10,100),
                "max_features": randint(1,64),
                'max_depth': [randint(5,50), None],
                "min_samples_split": randint(2,11),
                "min_samples_leaf": randint(1,11),
                "criterion":['gini','entropy'],
                "bootstrap": [True, False],
        }
    },
    'svm': {
        'model': svm.SVC(gamma='auto', C=1),
        'params': {
            'C': [0.1,1, 10, 100],
            # 'kernel': ['rbf', 'poly', 'sigmoid'],
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(),
        'params': {
            'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
            'C' : np.logspace(-4, 4, 20),
            'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
            'max_iter' : [100, 1000,2500, 5000],

        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            "max_depth": [2, 3, 4, 5, 6, 7, 8 , 9, None],
            "max_features": [2, 3, 4, 5, 6, 7, 8 , 9],
            "min_samples_leaf": [2, 3, 4, 5, 6, 7, 8 , 9, None],
            "criterion": ["gini", "entropy"],
            "splitter": ["best", "random"]
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15]
        }
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
        }
    },
    'BernoulliNB': {
        'model': BernoulliNB(),
        'params': {
            'alpha':np.linspace(0.1,1,10)
        }
    },
}

score = []

for model_name, mp in model_params.items():
    clf =  RandomizedSearchCV(mp['model'], mp['params'], n_iter=50, cv=3,scoring='accuracy')
    clf.fit(X, y)
    score.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(score,columns=['model','best_score','best_params'])
df

Traceback (most recent call last):
  File "D:\program files\python\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\program files\python\lib\site-packages\sklearn\ensemble\_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "D:\program files\python\lib\site-packages\joblib\parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "D:\program files\python\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "D:\program files\python\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "D:\program files\python\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "D:\program files\python\lib\site-packages\joblib\_parallel_backends.py",

Unnamed: 0,model,best_score,best_params
0,random_forest,0.980574,"{'bootstrap': False, 'criterion': 'gini', 'max..."
1,svm,0.946477,{'C': 1}
2,logistic_regression,0.957223,"{'solver': 'liblinear', 'penalty': 'l1', 'max_..."
3,decision_tree,0.981401,"{'splitter': 'best', 'min_samples_leaf': 8, 'm..."
4,GaussianNB,0.946063,{'var_smoothing': 0.01}
5,MultinomialNB,0.709857,{'alpha': 0.01}
6,BernoulliNB,0.946063,{'alpha': 0.1}


In [None]:
df.to_csv("data.csv")

In [None]:
model = RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth= None, max_features= 5, min_samples_leaf= 2, min_samples_split=6, n_estimators=48)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9865702479338843

In [None]:
# model = DecisionTreeClassifier(splitter="best", min_samples_leaf=4, max_features= 5, max_depth=9, criterion='entropy')
# model.fit(x_train, y_train)
# model.score(x_test, y_test)

0.9855371900826446

In [None]:
run = runs.run_model_on_task(model, task)
run.publish()
print(f'View the run online: {run.openml_url}')



View the run online: https://www.openml.org/r/10560801
