In [882]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
import matplotlib.pyplot as plt
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sn
#from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import cv2
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from scipy.stats import norm
from openml import tasks, flows, runs, datasets, config
import random

task_id = 31
task = tasks.get_task(task_id)
dataset = datasets.get_dataset(task.dataset_id)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute
)

In [883]:
def fun1(val):
    if val == '<0':
        return "checking_acc_less_than_0"
    elif val == '0<=X<200':
        return "checking_acc_bt_0_and_200"
    elif val == '>=200':
        return "checking_acc_greater_than_200"
    else:
        return "no_checking_acc"


def fun2(val):
    if val == 'critical/other existing credit':
        return "critical_or_existing_credit_history"
    elif val == 'existing paid':
        return "existing_paid_credit_history"
    elif val == 'delayed previously':
        return "delayed_previously_credit_history"
    elif val == "no credits/all paid":
        return "no_credit_or_all_paid_credit_history"
    else:
        return "all_paid_credit_history"


def fun3(val):
    if val == 'no known savings':
        return "no_saving_account"
    elif val == '<100':
        return "less_than_100_saving_account"
    elif val == '500<=X<1000':
        return "bt_500_and_1000_saving_account"
    elif val == ">=1000":
        return "greater_than_1000_saving_account"
    else:
        return "bt_100_and_500_saving_account"


def fun4(val):
    if val == '>=7':
        return "employed_more_than_7"
    elif val == '1<=X<4':
        return "employed_bt_1_and_4"
    elif val == '4<=X<7':
        return "employed_bt_4_and_7"
    elif val == "unemployed":
        return "not_employed"
    else:
        return "employed_less_than_1"


def train_data_clean(X, y):
    # cleaning checking status
    df = pd.concat([X, y], axis="columns")

    df.checking_status = df.checking_status.apply(lambda x: fun1(x))
    dummies = pd.get_dummies(df.checking_status)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["checking_status", "no_checking_acc"], axis="columns")

    # cleaning credit history
    df.credit_history = df.credit_history.apply(lambda df: fun2(df))
    dummies = pd.get_dummies(df.credit_history)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(
        ["credit_history", "no_credit_or_all_paid_credit_history"], axis="columns")

    # purpose
    dummies = pd.get_dummies(df.purpose)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["purpose", "other"], axis="columns")

    # savings status
    df.savings_status = df.savings_status.apply(lambda x: fun3(x))
    dummies = pd.get_dummies(df.savings_status)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["savings_status", "no_saving_account"], axis="columns")

    # employment
    df.employment = df.employment.apply(lambda x: fun4(x))
    dummies = pd.get_dummies(df.employment)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["employment", "not_employed"], axis="columns")

    # personal_status
    dummies = pd.get_dummies(df.personal_status)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["personal_status", "female div/dep/mar"], axis="columns")

    # other_parties
    dummies = pd.get_dummies(df.other_parties)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["other_parties", "none"], axis="columns")

    # property_magnitude
    dummies = pd.get_dummies(df.property_magnitude)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["property_magnitude", "car"], axis="columns")

    # other_payment_plans
    dummies = pd.get_dummies(df.other_payment_plans)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["other_payment_plans", "none"], axis="columns")

    # housing
    dummies = pd.get_dummies(df.housing)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["housing", "for free"], axis="columns")

    # job
    dummies = pd.get_dummies(df.job)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["job", "unemp/unskilled non res"], axis="columns")

    # own_telephone
    df.own_telephone = df.own_telephone.apply(
        lambda x: "have_telephone" if x == "yes" else "no_telephone")
    dummies = pd.get_dummies(df.own_telephone)
    df = pd.concat([df, dummies], axis="columns")
    df = df.drop(["own_telephone", "no_telephone"], axis="columns")

    # foreign_worker
    df.foreign_worker = df.foreign_worker.apply(
        lambda x: 1 if x == "yes" else 0)

    return df

In [884]:
X = train_data_clean(X,y)

In [885]:
lst = ["credit_amount", "duration"]

for col in lst:
       mean = X[col].mean()
       std = X[col].std()
       X = X[(X[col] > mean - std) & (X[col] < mean + std)]


In [886]:
y = X["class"]
X = X.drop(["class"], axis="columns")

In [887]:
# model_params = {
#     'random_forest': {
#         'model': RandomForestClassifier(),
#         'params': {
#                 'n_estimators': randint(10,100),
#                 "max_features": randint(1,64),
#                 'max_depth': [randint(5,50), None],
#                 "min_samples_split": randint(2,11),
#                 "min_samples_leaf": randint(1,11),
#                 "criterion":['gini','entropy'],
#                 "bootstrap": [True, False],
#         }
#     },
#     'svm': {
#         'model': svm.SVC(gamma='auto', C=1),
#         'params': {
#             'C': [0.1,1, 10, 100],
#             # 'kernel': ['rbf', 'poly', 'sigmoid'],
#         }
#     },
#     'logistic_regression': {
#         'model': LogisticRegression(),
#         'params': {
#             'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
#             'C' : np.logspace(-4, 4, 20),
#             'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
#             'max_iter' : [100, 1000,2500, 5000],

#         }
#     },
#     'decision_tree': {
#         'model': DecisionTreeClassifier(),
#         'params': {
#             "max_depth": [2, 3, 4, 5, 6, 7, 8 , 9, None],
#             "max_features": [2, 3, 4, 5, 6, 7, 8 , 9],
#             "min_samples_leaf": [2, 3, 4, 5, 6, 7, 8 , 9, None],
#             "criterion": ["gini", "entropy"],
#             "splitter": ["best", "random"]
#         }
#     },
#     'GaussianNB': {
#         'model': GaussianNB(),
#         'params': {
#             'var_smoothing': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15]
#         }
#     },
#     'MultinomialNB': {
#         'model': MultinomialNB(),
#         'params': {
#             'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
#         }
#     },
#     'BernoulliNB': {
#         'model': BernoulliNB(),
#         'params': {
#             'alpha':np.linspace(0.1,1,10)
#         }
#     },
# }

# score = []

# for model_name, mp in model_params.items():
#     clf =  RandomizedSearchCV(mp['model'], mp['params'], n_iter=100, cv=3,scoring='accuracy')
#     clf.fit(X, y)
#     score.append({
#         'model': model_name,
#         'best_score': clf.best_score_,
#         'best_params': clf.best_params_
#     })
    
# df = pd.DataFrame(score,columns=['model','best_score','best_params'])
# df
# df.to_csv("data.csv")

In [888]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [903]:
model = LogisticRegression(solver= 'liblinear', penalty='l2', max_iter=5000, C= 4.2813323987193)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.772020725388601

<code object get_dataset at 0x000001E9CC6CFEA0, file "D:\program files\python\lib\site-packages\openml\tasks\task.py", line 118>

In [906]:
run.data_content
run = runs.run_model_on_task(model, task)
# run.publish()
# print(f'View the run online: {run.openml_url}')

NameError: name 'run' is not defined