In [1]:
# compine all steps in Piplines and columntransformer

# Import pandas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Import train_test_split
from sklearn.model_selection import train_test_split
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import cross_val_score

import pickle

In [2]:
# Load dataset
df = pd.read_csv('datasets/cc_approvals.data',header=None)

In [3]:

print(df[3].value_counts())


u    519
y    163
?      6
l      2
Name: 3, dtype: int64


In [4]:
X = df.drop([15] , axis=1)

# labels
values = {'+' : 1, '-':0}
y = df[15].map(values)

In [5]:
# function transformer
# to replace '?' with np.nan

def _replace_to_nan_drop(X):

    X = X.replace(['?'],np.nan)

    X.columns = X.columns.astype(str)

    # drop unimportant columns
    X = X.drop(['11', '13'], axis=1)

    # change a specific column data type
    X['1'] = X['1'].astype(float)
   
    return X

replace_to_nan_drop = FunctionTransformer(_replace_to_nan_drop)




# columns to imputers
def get_columns_dtypes(X):
    '''
    the dtypes of the columns were be changed after the fill nulls step
    '''
    X = _replace_to_nan_drop(X)
    cats = X.select_dtypes(include=['object']).columns
    cons = X.select_dtypes(exclude=['object']).columns
    return {'cats': cats, 'cons': cons}




# label encodeing
def _label_encoding(X):

    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    for col in X.columns:
        # Use LabelEncoder to do the numeric transformation
            X[col]=LabelEncoder().fit_transform(X[col])
    return X

label_encoding = FunctionTransformer(_label_encoding)


In [6]:
pipe_numeric = Pipeline([("numeric_null", SimpleImputer(missing_values=np.nan, strategy='mean')),
                  ("scaler", MinMaxScaler(feature_range=(0, 1)))])

pipe_cat = Pipeline([("cats_null", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                  ("encode", label_encoding)])

preprocessing = ColumnTransformer(
    [("numeric",pipe_numeric ,get_columns_dtypes(X)['cons']),
    ("cats",pipe_cat ,get_columns_dtypes(X)['cats'])])

pipe = Pipeline([('replace_to_nan', replace_to_nan_drop),
                  ('preprocessing', preprocessing),
                 ('classifir',LogisticRegression(max_iter= 150, tol=0.01))])

In [7]:
# test in cross val score 
print(cross_val_score(pipe, X,y, cv=2))

[0.8115942  0.83478261]


In [8]:
# in all data


pipe.fit(X,y)

In [9]:
with open("model/model.pkl", "wb") as file:
    pickle.dump(pipe, file)