## Pipelines and data modeling

In [1]:
import os
  
n_cpu = os.cpu_count()
print("Number of CPUs in the system:", n_cpu)

Number of CPUs in the system: 8


In [2]:
# Libraries

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.svm import SVC
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import RFE, VarianceThreshold
import pickle
import joblib
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn import set_config
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn import FunctionSampler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

# Custom Transformers
###############################################################################################################################

# Logger class for debugging

class Logger:
    
    def __init__(self, message):
        self.message = message
        
    def fit(self, X, y):
        print("### Logger (fit): " + self.message)
        return self
    
    def transform(self, X):
        print("### Logger (transform)" + self.message)
        return X

###############################################################################################################################

#  Processing categorical variables (Feature Engineering)

# get_dummies like

class GetDummies(BaseEstimator, TransformerMixin):
    
    def __init__(self, dummy_columns):
        self.columns = None
        self.dummy_columns = dummy_columns

    def fit(self, X, y=None):
        self.columns = pd.get_dummies(X, columns=self.dummy_columns).columns # learned column names
        return self

    def transform(self, X):
        X_new = pd.get_dummies(X, columns=self.dummy_columns)
        return X_new.reindex(columns=self.columns, fill_value=0)
    
    
# -----------------------------------------------------------------------------------------------------------------------------

# for binary problem

class ReplaceColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y):
        
        self.unique_values = {}
        self.n_cat = {}
        
        for col in self.columns:
            self.unique_values[col] = list(X[col].unique())
            self.n_cat[col] = list(np.arange(len(self.unique_values[col])))
        
        return self
    
    def transform(self, X):
        
        for col in self.columns:
            
            X[col] = X[col].replace(self.unique_values[col], self.n_cat[col], inplace = False)
        
        return X
    

# for n_cat > 2    

class OneHotEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y):
        
        self.unique_values = {}
        
        for col in self.columns:
            self.unique_values[col] = list(X[col].unique())
        
        return self
    
    def transform(self, X):
        
        for col in self.columns:
            for cat in self.unique_values[col]:
                X[col + '_' + str(cat)] = np.where(X[col] == cat, 1, 0)
            
            X = X.drop(columns = [col], axis = 1)
        
        return X

# ----------------------------------------------------------------------------------------------------------------------------
# OrdinalEncoder

class OrdinalEncoderColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y):
        
        self.oe = {}
        x = {}
        
        for col in self.columns:
            
            x[col] = X[col].values.reshape(-1, 1)
            self.oe[col] = OrdinalEncoder(categories = 'auto', handle_unknown='use_encoded_value', unknown_value = -1)
            self.oe[col].fit(x[col])
        
        return self
    
    def transform(self, X):
        
        for col in self.columns:
            
            X[col] = self.oe[col].transform(X[col].values.reshape(-1, 1))
        
        return X
    
    
# ----------------------------------------------------------------------------------------------------------------------------

class GetFirstLetter(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y):
        
        return self
    
    def transform(self, X):
        
        for col in self.columns:
            X[col] = X[col].astype(str).str[0]
            
        return X

#-----------------------------------------------------------------------------------------------------------------------------

class String2UniqueFeature(BaseEstimator, TransformerMixin):
    
    def __init__(self, column, str_leng):
        self.column = column
        self.str_leng = str_leng
        
    def fit(self, X, y):    
        return self
    
    def transform(self, X):
        
        for i in range(self.str_leng):
            X[f'ch{i}'] = X[self.column].str.get(i).apply(ord)
        
        X["unique_characters"] = X[self.column].apply(lambda x: len(set(x)))
        
        X = X.drop(columns = [self.column], axis = 1)
        
        return X


###############################################################################################################################
    
# Missing values

class SimpleImputers(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, strategy, category = None):
        self.columns = columns # warning: columns deve essere una lista
        self.strategy = strategy
        self.category = category
    
    
    def fit(self, X, y):
        
        self.new_value = {} # storage learned value for each column
        
        for col in self.columns:
            
            if self.strategy == 'mean':
                self.new_value[col] = X[col].mean()
            
            if self.strategy == 'median':
                self.new_value[col] = X[col].median()
                
            if self.strategy == 'new_category':
                self.new_value[col] = self.category
            
        return self
        
    def transform(self, X):
        
        for col in self.columns:
            X[col] = X[col].fillna(self.new_value[col])
    
        return X


class InvalidValueImputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, value = -1, suffix = "_invalid"):
        self.columns = columns
        self.value = value
        self.suffix = suffix
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        
        for col in self.columns:
            X[col + self.suffix] = np.where(X[col].is_null(), 1, 0)
            X[col] = X[col].fillna(self.value)
            
        return X
    
    
class ModelPredictImputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator, target_name, feature_names):
        
        self.estimator = estimator
        self.target_name = target_name
        self.feature_names = feature_names
    
    def fit(self, X, y):
    
        bool_target_not_null = X[self.target_name].notnull()

        row_indexes_where_target_notnull = X.index[np.where(bool_target_not_null)]

        X_train = X.loc[row_indexes_where_target_notnull, self.feature_names]
        y_train = X.loc[row_indexes_where_target_notnull, self.target_name]
    
        self.ist = self.estimator.fit(X_train, y_train) 
        
        return self
        
    def transform(self, X):
        
        bool_target_is_null = X[self.target_name].isnull()
        row_indexes_where_target_isnull = X.index[np.where(bool_target_is_null)]
        X_test = X.loc[row_indexes_where_target_isnull, self.feature_names]
        
        try:
            preds = self.ist.predict(X_test)
            X.loc[row_indexes_where_target_isnull, self.target_name] = preds
        
        except:
            X = X
    
        return X


    
###############################################################################################################################   
    
# Normalization
        
class ScalerColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, scaler): # class inputs: columns list & scaler
        self.scaler = scaler

    def fit(self, X, y):
        
        self.sc = {}
        x = {}
        
        for col in X.columns:
            

            x[col] = X[col].values.reshape(-1, 1) # returns a numpy array

            if self.scaler == 'MinMaxScaler':
                self.sc[col] = MinMaxScaler()
                self.sc[col].fit(x[col])

            if self.scaler == 'StandardScaler':
                self.sc[col] = StandardScaler()
                self.sc[col].fit(x[col])
                
            if self.scaler == 'RobustScaler':
                self.sc[col] = RobustScaler()
                self.sc[col].fit(x[col])

        return self

    def transform(self, X):
        
        for col in X.columns:
            
            X[col] = self.sc[col].transform(X[col].values.reshape(-1, 1))
        
        return X
        
############################################################################################################################### 

# Outlier Imputing

class OutlierColumnsImputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y):
        
        self.q1 = {} 
        self.q3 = {} 
        self.med = {}
        self.iqr= {}
        self.upper_bound = {}
        self.lower_bound = {}
        
        for col in self.columns:
            
            self.q1[col] = np.quantile(X[col], 0.25)
            self.q3[col] = np.quantile(X[col], 0.75)
            self.med[col] = np.median(X[col])

            self.iqr[col] = self.q3[col] - self.q1[col]

            self.upper_bound[col] = self.q3[col] + (1.5 * self.iqr[col])
            self.lower_bound[col] = self.q1[col] - (1.5 * self.iqr[col])
        
        return self
            
    def transform(self, X):
        
        new_X = X.copy()
        
        for col in self.columns:
            # capping
            new_X[col] = np.where(new_X[col] > self.upper_bound[col], self.upper_bound[col], new_X[col])
            new_X[col] = np.where(new_X[col] < self.lower_bound[col], self.lower_bound[col], new_X[col])
            
        return new_X


###############################################################################################################################
    
# Feature Selection

class FsCorrMatrix(BaseEstimator, TransformerMixin):
    
    def __init__(self, n, redundancy_threshold):
        self.n = n                                           # n (int) number of features to select
        self.redundancy_threshold = redundancy_threshold
    
    def fit(self, X, y):
        
        df_num = pd.concat([y, X], axis=1)
        corr_matrix_abs = df_num.corr().abs()
        
        # upper_tri = upper triangular correlation matrix of features
        
        upper_tri = corr_matrix_abs.where(np.triu(np.ones(corr_matrix_abs.shape), k=1).astype(bool))
        upper_tri = upper_tri.drop(y.name, axis=0).drop(y.name, axis=1)
        
        # to_drop = list of redundant highly correlated features
        
        to_drop = [col_name for col_name in upper_tri.columns if any(upper_tri[col_name] > self.redundancy_threshold)]
        
        corr_matrix_abs.drop(corr_matrix_abs[to_drop], axis=0, inplace=True) 
        
        correlation_y = corr_matrix_abs[y.name]
        correlation_y_sorted = correlation_y.sort_values(ascending = False)
        fs = correlation_y_sorted[1:(self.n+1)]  # 1 to skip y itself
        self.index = fs.index
        
        return self
    
    def transform(self, X):
        X = X[self.index]
        return X


###############################################################################################################################
# Processing X and y in Pipeline/ remove/add rows -> FunctionSampler from Imblearn

# Outlier rejection -> FunctionSampler in pipeline

def outlier_rejection_with_model(X, y, max_samples='auto', contamination='auto'):
    model = IsolationForest(max_samples=max_samples, contamination=contamination)
    model.fit(X)
    y_pred = model.predict(X)  # 1 if good, -1 if outlier
    X_good = X[y_pred == 1]
    y_good = y[y_pred == 1]
    return X_good, y_good

# Drop NaN values in y target (funzione da usare, nel caso, prima della pipeline)

def drop_y_na(X, y):
    # Not usable, as sklearn consider invalid "y" with NaN
    y_good_index = np.where(pd.notna(y))[0]
    X_good = X[y_good_index]
    y_good = y[y_good_index]
    return X_good, y_good

# Column names: remove white spaces and convert to lower case

def columns_strip_lower(X, y):
    
    X.columns = X.columns.str.strip().str.lower()
    y.name = y.name.strip().lower()
    return X, y

###############################################################################################################################
    
# Data Loading

df_raw = pd.read_csv('train.csv')
df = df_raw.copy()
X = df.drop(['id','target'], axis = 1)
y = df['target']
df.head()

###############################################################################################################################

# Judge with Nested-CrossValidation & pipelines

class Judge():
    
    def __init__(self):
        pass
    
    def set_data(self, X, y):
        self.X = X
        self.y = y
        return self
    
    def set_pipelines(self, pipelines):
        self.pipelines = pipelines
        return self
    
    def set_metrics(self, metrics):
        self.metrics = metrics
        return self
    
    def set_models(self, models):
        self.models = models
        return self
    
    def set_nested_cv(self, tuning_method, hpars, inner_cv = None, outer_cv = None, rscv_random_state = None):
        self.tuning_method = tuning_method
        self.hpars = hpars
        self.inner_cv = inner_cv
        self.outer_cv = outer_cv
        self.rscv_random_state = rscv_random_state 
        return self
    
    def __get_performance_from_algorithm(self):
        
        print('Hyper-parameters optimization method: {}'.format(self.tuning_method))
        
        # Matrix score
        matrix_score = np.array(np.zeros(len(self.models)*len(self.metrics))).reshape(len(self.models), len(self.metrics))
            
        for i, (k, v) in enumerate(self.pipelines.items()):

            pipe = v 

            # Hyper-params optimization

            if self.tuning_method == 'GridSearchCV':

                try:
                    clf = GridSearchCV(estimator=pipe, param_grid=self.hpars[k], cv=self.inner_cv) 
                except:
                    clf = pipe # Default pipe

            elif self.tuning_method == 'RandomizedSearchCV':
                try:
                    clf = RandomizedSearchCV(estimator=pipe, param_distributions=self.hpars[k], cv=self.inner_cv, 
                                             random_state=self.rscv_random_state) 
                except:
                    clf = pipe # Default pipe

            # Nested-CV scores
            cv_results = cross_validate(clf, self.X, self.y, scoring = self.metrics, cv=self.outer_cv)
            for j, metric in enumerate(self.metrics):
                matrix_score[i,j] = round(cv_results['test_' + metric].mean()*100, 2)

        return matrix_score
    
    def get_table(self):
        
        matrix_score = self.__get_performance_from_algorithm()
        tab = pd.DataFrame(matrix_score, columns = self.metrics).set_axis(self.models).rename_axis('Model')
        return tab
    
    @staticmethod
    def info_class():
        
        info = print("""
        
        class name -> Judge
        
        -methods
        
        set_data -> X = features, y = targets
        set_pipelines -> pipelines (dict)
        set_metrics -> metrics (string list)
        set_models -> models (string list)
        get_table -> return DataFrame with evaluated metrics for each model
        
        -class parameters
        
        tuning_method -> 'GridSearchCV' or 'RandomizedSearchCV'
        hpars -> hyperparameters input for the selected tuning_method (set hpars = {} for no nested-CV)
        inner_cv -> hyperparameter tuning cross-validation splitting strategy
        outer_cv -> model selection cross-validation splitting strategy
        
        
        """)
        
        return info


# # Judge settings

random_state = 51
rscv_random_state = 51

pipelines = {
    
    'pipe_lr' : Pipeline(steps=[
                ('s2f', String2UniqueFeature(column = 'f_27', str_leng = 10)),
                ('sc', ScalerColumns(scaler='StandardScaler')),
                ('outlier', FunctionSampler(func=outlier_rejection_with_model, kw_args={'max_samples': 'auto',
                                                                                        'contamination': 'auto'})), 
                ('us', RandomUnderSampler()),
                ('clf', LogisticRegression(random_state=random_state))
                ], verbose = True),
    
    'pipe_ab' : Pipeline(steps=[
                ('s2f', String2UniqueFeature(column = 'f_27', str_leng = 10)),
                ('us', RandomUnderSampler()),
                ('clf', AdaBoostClassifier(random_state=random_state))
                ], verbose = True),
    
    'pipe_xgb' : Pipeline(steps=[
                ('s2f', String2UniqueFeature(column = 'f_27', str_leng = 10)),
                ('us', RandomUnderSampler()),
                ('clf', XGBClassifier(use_label_encoder = False))
                ], verbose = True),
    
}

models = ['pipe_lr', 'pipe_ab', 'pipe_xgb']

metrics= ['accuracy', 'roc_auc', 'precision', 'recall']

tuning_method = 'GridSearchCV'
# tuning_method = 'RandomizedSearchCV'

grid_lr = {'clf__solver' : ['liblinear']}
grid_xgb = {'clf__eval_metric' : ['logloss']}

hpars= {'pipe_lr' : grid_lr, 'pipe_xgb' : grid_xgb}

# hpars = {}

# KFold_params
inner_n_splits = 2 
outer_n_splits = 5
shuffle = True 

inner_cv = KFold(n_splits=inner_n_splits, shuffle=shuffle, random_state=random_state)
outer_cv = KFold(n_splits=outer_n_splits, shuffle=shuffle, random_state=random_state)

judge = Judge().set_data(X,y).set_pipelines(pipelines).set_metrics(metrics).set_models(models)
judge.set_nested_cv(tuning_method, hpars, inner_cv, outer_cv)
judge.get_table()

Hyper-parameters optimization method: GridSearchCV
[Pipeline] ............... (step 1 of 5) Processing s2f, total=   2.4s
[Pipeline] ................ (step 2 of 5) Processing sc, total=   0.8s
[Pipeline] ........... (step 3 of 5) Processing outlier, total=  38.6s
[Pipeline] ................ (step 4 of 5) Processing us, total=   0.7s
[Pipeline] ............... (step 5 of 5) Processing clf, total=   1.7s
[Pipeline] ............... (step 1 of 5) Processing s2f, total=   3.1s
[Pipeline] ................ (step 2 of 5) Processing sc, total=   0.7s
[Pipeline] ........... (step 3 of 5) Processing outlier, total=  35.1s
[Pipeline] ................ (step 4 of 5) Processing us, total=   0.5s
[Pipeline] ............... (step 5 of 5) Processing clf, total=   1.5s
[Pipeline] ............... (step 1 of 5) Processing s2f, total=   5.9s
[Pipeline] ................ (step 2 of 5) Processing sc, total=   1.6s
[Pipeline] ........... (step 3 of 5) Processing outlier, total= 1.2min
[Pipeline] ...............

[Pipeline] ................ (step 2 of 3) Processing us, total=   1.8s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 2.5min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   3.0s
[Pipeline] ................ (step 2 of 3) Processing us, total=   0.9s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 1.1min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   3.2s
[Pipeline] ................ (step 2 of 3) Processing us, total=   0.7s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 1.1min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   6.3s
[Pipeline] ................ (step 2 of 3) Processing us, total=   1.6s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 2.5min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   3.3s
[Pipeline] ................ (step 2 of 3) Processing us, total=   1.1s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 1.2min
[Pipel

Unnamed: 0_level_0,accuracy,roc_auc,precision,recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pipe_lr,68.89,75.46,67.86,68.5
pipe_ab,72.02,79.29,71.16,71.42
pipe_xgb,91.87,97.44,91.77,91.5


### Production model

In [3]:
clf_prod = GridSearchCV(pipelines['pipe_xgb'], hpars['pipe_xgb'])
clf_prod.fit(X, y) # whole training-dataset
clf_prod.best_estimator_

[Pipeline] ............... (step 1 of 3) Processing s2f, total=   5.9s
[Pipeline] ................ (step 2 of 3) Processing us, total=   1.4s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 2.5min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   6.1s
[Pipeline] ................ (step 2 of 3) Processing us, total=   1.4s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 2.5min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   6.4s
[Pipeline] ................ (step 2 of 3) Processing us, total=   1.4s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 2.5min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   6.1s
[Pipeline] ................ (step 2 of 3) Processing us, total=   1.4s
[Pipeline] ............... (step 3 of 3) Processing clf, total= 2.5min
[Pipeline] ............... (step 1 of 3) Processing s2f, total=   6.4s
[Pipeline] ................ (step 2 of 3) Processing us, total=   1.8s
[Pipel

Pipeline(steps=[('s2f', String2UniqueFeature(column='f_27', str_leng=10)),
                ('us', RandomUnderSampler()),
                ('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               eval_metric='logloss', gamma=0, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=8, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_met

In [4]:
# save the model to disk

filename = 'may2022_kaggle_comp.pkl'
joblib.dump(clf_prod, open(filename, 'wb'))

In [14]:
# pipeline diagram

set_config(display="diagram")
clf_prod.best_estimator_

### Datatest predictions

In [6]:
df_test_raw = pd.read_csv('test.csv')
df_test = df_test_raw.copy()
X_test = df_test.drop(columns = ['id'], axis=1)
X_test

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30
0,0.442517,0.174380,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,0,...,-1.006400,-1.193879,-2.435736,-2.427430,-1.966887,5.734205,BAAABADLAC,99.478419,0,0
1,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.355550,-0.190911,1,3,4,...,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,AFABBAEGCB,-65.993825,1,0
2,0.303990,2.445110,0.246515,0.818248,0.359731,-1.331845,1.358622,3,3,4,...,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,BBACABBKEE,-87.405622,0,1
3,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.119220,0,0,4,...,-0.594532,-3.939475,1.754570,-2.364007,-1.003320,3.893099,AEBEAACQCC,-281.293460,0,0
4,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2,2,2,...,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,AEBBBBDABF,25.629415,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699995,0.640110,0.897808,-0.523956,1.563760,-0.092281,-0.610867,0.535426,0,1,6,...,2.604048,1.122867,0.518110,1.243837,0.575111,0.076372,BCBCEBHMCD,204.186539,0,0
699996,-0.191771,-0.035246,-0.118533,0.584750,2.126977,0.568659,-0.052663,4,3,4,...,3.029857,1.384682,-1.135740,2.982713,-1.511760,2.225218,BAABCADQFC,-97.694591,0,2
699997,-0.331704,-0.328845,-1.185503,1.022128,-0.483099,-0.107146,-0.968281,1,1,2,...,4.021273,-1.845266,1.096011,-2.734508,-4.885955,-2.248739,AAAJCBGQBA,130.622745,1,0
699998,-2.031073,-1.238398,0.964699,-1.045950,0.906064,0.634301,-0.707474,5,1,1,...,1.453864,-1.696606,1.018995,1.973697,-0.353068,-3.333449,BCBBCABNDE,-364.625148,0,0


In [7]:
# predictions
preds_array = np.round(clf_prod.predict_proba(X_test)[:, 1], 2)
preds = pd.DataFrame(preds_array).set_axis(['target'], axis= 'columns') # target probability = 1

In [8]:
submission = pd.concat([df_test_raw.id, preds], axis=1)

In [9]:
submission

Unnamed: 0,id,target
0,900000,0.94
1,900001,0.94
2,900002,0.00
3,900003,0.07
4,900004,0.95
...,...,...
699995,1599995,0.58
699996,1599996,0.98
699997,1599997,0.37
699998,1599998,0.08


In [10]:
submission.to_csv('submission.csv', index=False)