# Created by yunsuxiaozi

In [1]:
#https://zhuanlan.zhihu.com/p/425667868
#https://www.kaggle.com/code/alexryzhkov/ps3e25-lightautoml-baseline
#https://www.kaggle.com/code/yunsuxiaozi/pss3e25-study-note
#这里是定义Jupyter Notebook的外观样式
from IPython.core.display import display, HTML, Javascript

# ----- Notebook Theme -----
color_map = ['#16a085', '#e8f6f3', '#d0ece7', '#a2d9ce', '#73c6b6', '#45b39d', 
                        '#16a085', '#138d75', '#117a65', '#0e6655', '#0b5345']

prompt = color_map[-1]
main_color = color_map[0]
strong_main_color = color_map[1]
custom_colors = [strong_main_color, main_color]

css_file = ''' 

div #notebook {
background-color: white;
line-height: 20px;
}

#notebook-container {
%s
margin-top: 2em;
padding-top: 2em;
border-top: 4px solid %s; /* light orange */
-webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
    box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
}

div .input {
margin-bottom: 1em;
}

.rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
color: %s; /* light orange */
font-weight: 600;
}

div.input_area {
border: none;
    background-color: %s; /* rgba(229, 143, 101, 0.1); light orange [exactly #E58F65] */
    border-top: 2px solid %s; /* light orange */
}

div.input_prompt {
color: %s; /* light blue */
}

div.output_prompt {
color: %s; /* strong orange */
}

div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
background: %s; /* light orange */
}

div.cell.selected, div.cell.selected.jupyter-soft-selected {
    border-color: %s; /* light orange */
}

.edit_mode div.cell.selected:before {
background: %s; /* light orange */
}

.edit_mode div.cell.selected {
border-color: %s; /* light orange */

}
'''
def to_rgb(h): 
    return tuple(int(h[i:i+2], 16) for i in [0, 2, 4])

main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % (to_rgb(main_color[1:]))
open('notebook.css', 'w').write(css_file % ('width: 95%;', main_color, main_color, main_color_rgba, main_color,  main_color, prompt, main_color, main_color, main_color, main_color))

def nb(): 
    return HTML("<style>" + open("notebook.css", "r").read() + "</style>")
nb()

  from IPython.core.display import display, HTML, Javascript


## Import necessary libraries

In [2]:
#necessary
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
from sklearn.feature_extraction.text import TfidfVectorizer#导入tf-idf模型
from sklearn.decomposition import TruncatedSVD#基于奇异值分解的降维方法:截断SVD
#设置随机种子,保证模型可以复现
import random
import warnings#避免一些可以忽略的报错
warnings.filterwarnings('ignore')#filterwarnings()方法是用于设置警告过滤器的方法，它可以控制警告信息的输出方式和级别。

## Config and set random seed.

In [3]:
#config
class Config():
    origin_path=None
    train_path='/kaggle/input/playground-series-s4e3/train.csv'
    test_path='/kaggle/input/playground-series-s4e3/test.csv'
    submission_path='/kaggle/input/playground-series-s4e3/sample_submission.csv'
    seed=2024
    num_folds=10
    TARGET_NAME =['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']

## preprocessor

In [4]:
#一个处理数据的通用类,用于打playground比赛
class Preprocessor:#数据预处理的一个类
    def __init__(self,train_path=None,origin_path=None,test_path=None,TARGET_NAME ='target',seed=2024):
        self.seed=seed
        #设置随机种子,保证模型可以复现
        np.random.seed(self.seed)
        random.seed(self.seed)
        #需要预测的目标列
        self.TARGET_NAME=TARGET_NAME 
        
        if train_path!=None:#一般都有训练数据的路径
            self.train_df=pd.read_csv(train_path)
            self.train_df.drop(['id'],axis=1,inplace=True)
        else:#没有训练数据的路径就报错
            assert 0#assert 0==1?
        
        if origin_path!=None:#如果打算使用原始数据,没有原始数据也无所谓
            origin_df=pd.read_csv(origin_path)
            print(f"len(origin_df):{len(origin_df)}")
            #如果有原始数据,则把训练数据和原始数据拼接
            self.train_df=pd.concat((self.train_df,origin_df),axis=0)
            print(f"len(self.train_df):{len(self.train_df)}")
        self.train_df.fillna(method="ffill",inplace=True)
        self.train_df=self.train_df.drop_duplicates()
        print(f"len(self.train_df):{len(self.train_df)}")
            
        if test_path!=None:#一般都有训练数据的路径
            self.test_df=pd.read_csv(test_path)
            self.test_df.drop(['id'],axis=1,inplace=True)
            self.test_df.fillna(method="ffill",inplace=True)#用上一个数据来填充
        else:#没有训练数据的路径就报错
            assert 0#assert 0==1?
        #浮点数类型的连续型变量
        self.num_cols=[]
        #类别型变量,一般类别较少
        self.cat_cols=[]
        #一般就是像姓名那样不是类别型的字符串了
        self.str_cols=[]
        for col in self.train_df.columns:
            if type(self.TARGET_NAME)==list:#如果是列表,不是一个字符串(也就是多列而不是一列)
                #不在列表里同时 数据类型是object就是字符串
                if (col not in self.TARGET_NAME) and (self.train_df[col].dtype=='object'):
                    self.str_cols.append(col)
                #不是字符串,不在target的列表里,类别有限,那就是类别型变量
                elif (col not in self.TARGET_NAME) and  (self.train_df[col].nunique()<50):
                    self.cat_cols.append(col)
                elif (col not in self.TARGET_NAME):
                    self.num_cols.append(col)
            else:#target只有一列
                #不在列表里同时 数据类型是object就是字符串
                if (col!=self.TARGET_NAME) and (self.train_df[col].dtype=='object'):
                    self.str_cols.append(col)
                #不是字符串,不是target,类别有限,那就是类别型变量
                elif (col!=self.TARGET_NAME) and  (self.train_df[col].nunique()<50):
                    self.cat_cols.append(col)
                elif (col!=self.TARGET_NAME):
                    self.num_cols.append(col)
               
        print(f"num_cols:{self.num_cols}.\ncat_cols:{self.cat_cols}.\nstr_cols:{self.str_cols} ")
        
    #计算两组变量的皮尔逊相关系数
    def pearson_corr(self,x1,x2):
        #x1,x2 np.array
        eps=1e-15
        mean_x1=np.mean(x1)
        mean_x2=np.mean(x2)
        std_x1=np.std(x1)
        std_x2=np.std(x2)
        pearson=np.mean((x1-mean_x1)*(x2-mean_x2))/(std_x1*std_x2+eps)
        return pearson
    
    #用于进行探索性数据分析的函数.
    def EDA(self,):
        print("num_cols VS cat_cols:")
        for num_col in self.num_cols:
            for cat_col in self.cat_cols:
                unique_value=self.train_df[cat_col].unique()
                for value in unique_value:
                    tmp_df=self.train_df[self.train_df[cat_col]==value]
                    print(f"{cat_col}=={value}:mean_{num_col}=={tmp_df[num_col].mean()}")
                print("-"*50)
            print()
        print("num_cols VS num_cols:")
        for i in range(len(self.num_cols)):
            for j in range(i+1,len(self.num_cols)):
                x1=self.train_df[self.num_cols[i]].values
                x2=self.train_df[self.num_cols[j]].values
                if abs(self.pearson_corr(x1,x2))>0.9:
                    print(f"{self.num_cols[i]} and {self.num_cols[j]} have strong linear correlation.")
                    
        #如果是分类任务
        if self.train_df[self.TARGET_NAME].nunique()<50:
            print("TARGET VS cat_cols:")
            unique_target=self.train_df[self.TARGET_NAME].unique()
            for target in unique_target:
                for cat_col in self.cat_cols:
                    unique_value=self.train_df[cat_col].unique()
                    for value in unique_value:
                        #当cat_col==value时,target有没有唯一值
                        tmp_df=self.train_df[self.train_df[cat_col]==value]
                        #如果target基本只有它一个值
                        if len(tmp_df[tmp_df[self.TARGET_NAME]==target])>0.99*len(tmp_df):
                            print(f"when {cat_col}={value},{self.TARGET_NAME}={target}")
                            
                        tmp_df=self.train_df[self.train_df[self.TARGET_NAME]==target]
                        if len(tmp_df[tmp_df[cat_col]==value])>0.99*len(tmp_df):
                            print(f"when {self.TARGET_NAME}={target},{cat_col}={value}")   
            print("-"*50)
            print("TARGET VS num_cols:")
            for num_col in self.num_cols:
                for target in unique_target:
                    tmp_df=self.train_df[self.train_df[self.TARGET_NAME]==target]
                    print(f"when {self.TARGET_NAME}={target},mean_{num_col}={tmp_df[num_col].mean()}")  
                print("-"*50)      
        else:#如果是回归任务的话,那肯定是数值类型的target
            print("TARGET VS cat_cols:")
            for target in unique_target:
                for cat_col in self.cat_cols:
                    unique_value=self.train_df[cat_col].unique()
                    for value in unique_value:
                        tmp_df=self.train_df[self.train_df[cat_col]==value]
                        print(f"when {cat_col}={value},mean_{self.TARGET_NAME}={tmp_df[self.TARGET_NAME].mean()}")  
                print("-"*50)   
            
                
    
    #对训练和测试数据某列字符串列构造tf-idf特征,取n个特征,最后降维为p个特征
    def tf_idf(self,train, test, column,n,p):
        print(f"tf-idf done with {column}")
        vectorizer=TfidfVectorizer(max_features=n)#创建tf-idf模型,取出最重要的n个特征
        #对训练数据的列进行fit,得到训练数据和测试数据的向量.
        vectors_train=vectorizer.fit_transform(train[column])
        vectors_test=vectorizer.transform(test[column])

        svd=TruncatedSVD(p)#创建截断SVD,降维到P维
        x_pca_train=svd.fit_transform(vectors_train)#对训练向量和测试向量进行降维
        x_pca_test=svd.transform(vectors_test)
        tfidf_df_train=pd.DataFrame(x_pca_train)#转成表格型数据
        print(len(tfidf_df_train))
        tfidf_df_test=pd.DataFrame(x_pca_test)
        
        #对列名进行调整 ‘Surname_tfidf_{idx}’
        cols=[(column+"_tfidf_"+str(f)) for f in tfidf_df_train.columns]
        tfidf_df_train.columns=cols
        tfidf_df_test.columns=cols
        #按列拼接在一起.
        train=pd.concat([train,tfidf_df_train], axis=1)
        test=pd.concat([test,tfidf_df_test], axis=1)
        train.drop([column],axis=1,inplace=True)
        test.drop([column],axis=1,inplace=True)
        return train, test
    #对类别型字符串进行独热编码
    def one_hot_encoder(self,total_df):
        print(f"one hot encoder with {self.cat_cols}")
        for col in self.cat_cols:
            if total_df[col].nunique()==2:#如果类别数量等于2的话,有一列onehot就行了.
                value=total_df[col].values[0]
                total_df[col+"_"+str(value)]=(total_df[col]==value)
                total_df.drop([col],axis=1,inplace=True)
            else:#如果类别数量很多的话,一个一个来
                values=total_df[col].unique()
                for value in values:
                    total_df[col+"_"+str(value)]=(total_df[col]==value)
                total_df.drop([col],axis=1,inplace=True)
        return total_df          
    #这个可能每次比赛都要具体情况具体分析.
    def make_feats(self):
        total_df=pd.concat((self.train_df,self.test_df),axis=0)
        print(f"len(total_df):{len(total_df)}")
        print("feature engineer")
        total_df['X_gap']=total_df['X_Maximum']-total_df['X_Minimum']
        total_df['Y_gap']=total_df['Y_Maximum']-total_df['Y_Minimum']
        total_df['X_Y']=total_df['X_gap']*total_df['Y_gap']
        total_df['Log_X_Y_Index']=total_df['Log_X_Index']*total_df['Log_Y_Index']
        total_df['gap_of_Luminosity']=total_df['Maximum_of_Luminosity']-total_df['Minimum_of_Luminosity']
        #对字符串特征列进行独热编码的转换
        print("----------string one hot encoder ****")
        total_df=self.one_hot_encoder(total_df)
        print("-"*50)
        train_feats=total_df[:len(self.train_df)].reset_index(drop=True)
        test_feats=total_df[len(self.train_df):].reset_index(drop=True)
        for col in self.str_cols:
            train_feats,test_feats=self.tf_idf(train_feats, test_feats, column=col,n=int(0.5*train_feats[col].nunique()),p=3)
            print(f"len(train_feats):{len(train_feats)}")
        #如果是字符串的列或者一列只有唯一值,去掉
        print("----------drop other string or unique value full null value ****")
        drop_cols=[]
        for col in test_feats.columns:
            if (train_feats[col].dtype=='object') or (train_feats[col].nunique()==1) or train_feats[col].isna().mean()>0.95:
                drop_cols+=[col]
        print(f"drop_cols:{drop_cols}")
        train_feats=train_feats.drop(drop_cols,axis=1)
        test_feats=test_feats.drop(drop_cols,axis=1)
        
        print("-"*50)
        print(f"Done!! total_features:{len(train_feats.keys().values)-1}")
        return train_feats,test_feats

    
    def submission(self,submission_path=None,test_pred=None):
        if submission_path!=None:#一般都有训练数据的路径
            self.submission_df=pd.read_csv(submission_path)
        else:#没有提交文件的路径就报错
            assert 0#assert 0==1?
        self.submission_df[self.TARGET_NAME]=test_pred
        self.submission_df.to_csv("submission.csv",index=None)
        return self.submission_df

In [5]:
print("Import data.")
preprocessor=Preprocessor(Config.train_path,Config.origin_path,Config.test_path,Config.TARGET_NAME,Config.seed)
print("-"*50)
print("make features.")
train_feats,test_feats=preprocessor.make_feats()
print("-"*50)

train_feats['target']=0
for idx in range(len(Config.TARGET_NAME)):
    train_feats['target']+=idx*train_feats[Config.TARGET_NAME[idx]]
train_feats.drop(Config.TARGET_NAME,axis=1,inplace=True)
test_feats.drop(Config.TARGET_NAME,axis=1,inplace=True)
train_feats['target']=train_feats['target'].astype(int)
test_feats['target']=0
train_feats.head()

Import data.
len(self.train_df):19219
num_cols:['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas'].
cat_cols:['TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Outside_Global_Index'].
str_cols:[] 
--------------------------------------------------
make features.
len(total_df):32033
feature engineer
----------string one hot encoder ****
one hot encoder with ['TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Outside_Global_Index']
--------------------------------------------------
----------drop other string or unique value full null value ****
drop_cols:[]
--------------------------------------------------
Done!! total_

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Steel_Plate_Thickness_211,Steel_Plate_Thickness_290,Steel_Plate_Thickness_159,Steel_Plate_Thickness_81,Steel_Plate_Thickness_86,Outside_Global_Index_0.0,Outside_Global_Index_1.0,Outside_Global_Index_0.5,Outside_Global_Index_0.7,target
0,584,590,909972,909977,16,8,5,2274,113,140,...,False,False,False,False,False,True,False,False,False,3
1,808,816,728350,728372,433,20,54,44478,70,111,...,False,False,False,False,False,False,True,False,False,6
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,False,False,False,False,False,True,False,False,False,2
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,False,False,False,False,False,False,True,False,False,2
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,False,False,False,False,False,False,True,False,False,6


## fusion model

In [6]:
#model
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import StratifiedKFold

def fit_and_predict(model,target=Config.TARGET_NAME):
    X=train_feats.drop([target],axis=1).copy()
    y=train_feats[target].copy()
    test_X=test_feats.drop([target],axis=1).copy()
    oof_pred_pro=np.zeros((len(X),len(y.unique())))
    test_pred_pro=np.zeros((Config.num_folds,len(test_X),len(y.unique()) ))
    #10折交叉验证
    skf = StratifiedKFold(n_splits=Config.num_folds,random_state=Config.seed, shuffle=True)

    for fold, (train_index, valid_index) in (enumerate(skf.split(X, y.astype(str)))):
        print(f"fold:{fold}")

        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        model.fit(X_train,y_train)
        
        oof_pred_pro[valid_index]=model.predict_proba(X_valid)
        test_pred_pro[fold]=model.predict_proba(test_X)
    
    return oof_pred_pro,test_pred_pro

#选择的参数都来自这个notebook https://www.kaggle.com/code/ksevta/ps4e2-xgb-lgbm-0-92/notebook
lgb_params={
    "objective": "multiclass",          # Objective function for the model
    "metric": "multi_logloss",          # Evaluation metric
    "verbosity": -1,                    # Verbosity level (-1 for silent)
    "boosting_type": "gbdt",            # Gradient boosting type
    "random_state": Config.seed,       # Random state for reproducibility
    "num_class": 7,                     # Number of classes in the dataset
    'learning_rate': 0.030962211546832760,  # Learning rate for gradient boosting
    'n_estimators': 500,                # Number of boosting iterations
    'lambda_l1': 0.009667446568254372,  # L1 regularization term
    'lambda_l2': 0.04018641437301800,   # L2 regularization term
    'max_depth': 10,                    # Maximum depth of the trees
    'colsample_bytree': 0.40977129346872643,  # Fraction of features to consider for each tree
    'subsample': 0.9535797422450176,    # Fraction of samples to consider for each boosting iteration
    'min_child_samples': 26             # Minimum number of data needed in a leaf
}

xgb_params = {'grow_policy': 'depthwise', 'n_estimators': 982, 
               'learning_rate': 0.050053726931263504, 'gamma': 0.5354391952653927, 
               'subsample': 0.7060590452456204, 'colsample_bytree': 0.37939433412123275, 
               'max_depth': 23, 'min_child_weight': 21, 'reg_lambda': 9.150224029846654e-08,
               'reg_alpha': 5.671063656994295e-08,
               'booster':'gbtree','objective':'multi:softmax',"verbosity":0
              }
cat_params={'learning_rate': 0.13762007048684638, 'depth': 5, 
          'l2_leaf_reg': 5.285199432056192, 'bagging_temperature': 0.6029582154263095,
         'random_seed': Config.seed,'verbose': False,'iterations':1000}

print("random forest model")
rf_oof_pred_pro,rf_test_pred_pro=fit_and_predict(model=RandomForestClassifier(random_state=Config.seed),target='target')
print("lgb model")
lgb_oof_pred_pro,lgb_test_pred_pro=fit_and_predict(model= LGBMClassifier(**lgb_params,verbose=-1),target='target')
print("xgb model")
xgb_oof_pred_pro,xgb_test_pred_pro=fit_and_predict(model= XGBClassifier(**xgb_params,seed=Config.seed),target='target')
print("cat model")
cat_oof_pred_pro,cat_test_pred_pro=fit_and_predict(model= CatBoostClassifier(**cat_params),target='target')

random forest model
fold:0
fold:1
fold:2
fold:3
fold:4
fold:5
fold:6
fold:7
fold:8
fold:9
lgb model
fold:0
fold:1
fold:2
fold:3
fold:4
fold:5
fold:6
fold:7
fold:8
fold:9
xgb model
fold:0
fold:1
fold:2
fold:3
fold:4
fold:5
fold:6
fold:7
fold:8
fold:9
cat model
fold:0
fold:1
fold:2
fold:3
fold:4
fold:5
fold:6
fold:7
fold:8
fold:9


## blending

In [7]:
#评估指标是log_loss  best_w1:23,best_w2:43,best_w3:8
#best_accuracy:0.5805192777980124,best_log_loss:0.9906327298849478
def log_loss(y_true,y_pred):
    eps=10**(-15)
    y_true=np.clip(y_true,eps,1-eps)
    y_pred=np.clip(y_pred,eps,1-eps)
    return -np.mean(np.sum(y_true*np.log(y_pred),axis=-1))
def accuracy(y_true,y_pred):
    return np.mean(y_true==y_pred)
oof_target=train_feats['target'].values#Config.TARGET_NAME
oof_one_hot=np.eye(np.max(oof_target)+1)[oof_target]
step=100
best_w1=1
best_w2=1
best_w3=1
best_accuracy=0
best_log_loss=10
for w1 in range(1,step-1,1):
    for w2 in range(1,step-w1-1,1):
        for w3 in range(1,step-w1-w2-1,1):
            blend_oof_pred_pro=(w1*rf_oof_pred_pro+w2*lgb_oof_pred_pro+w3*xgb_oof_pred_pro+(step-w1-w2-w3)*cat_oof_pred_pro)/step
            current_log_loss=log_loss(oof_one_hot,blend_oof_pred_pro)
            current_accuracy=accuracy(oof_target,np.argmax(blend_oof_pred_pro,axis=1))
            if current_accuracy>best_accuracy:
                best_w1=w1
                best_w2=w2
                best_w3=w3
                best_accuracy=current_accuracy
                best_log_loss=current_log_loss
                print(f"best_w1:{best_w1},best_w2:{best_w2},best_w3:{best_w3},best_accuracy:{best_accuracy},best_log_loss:{best_log_loss}")
            elif (current_accuracy==best_accuracy) and current_log_loss<best_log_loss:
                best_w1=w1
                best_w2=w2
                best_w3=w3
                best_log_loss=current_log_loss
                print(f"best_w1:{best_w1},best_w2:{best_w2},best_w3:{best_w3},best_accuracy:{best_accuracy},best_log_loss:{best_log_loss}")

best_w1:1,best_w2:1,best_w3:1,best_accuracy:0.5716218325615277,best_log_loss:1.006195002516344
best_w1:1,best_w2:1,best_w3:2,best_accuracy:0.5718819917789687,best_log_loss:1.005920123778237
best_w1:1,best_w2:1,best_w3:9,best_accuracy:0.5719340236224569,best_log_loss:1.0045677681680114
best_w1:1,best_w2:1,best_w3:10,best_accuracy:0.5723502783703627,best_log_loss:1.0044336972314516
best_w1:1,best_w2:1,best_w3:11,best_accuracy:0.5726104375878037,best_log_loss:1.004312707307843
best_w1:1,best_w2:1,best_w3:15,best_accuracy:0.573182787866174,best_log_loss:1.0039549683963007
best_w1:1,best_w2:1,best_w3:16,best_accuracy:0.573182787866174,best_log_loss:1.0038962255688566
best_w1:1,best_w2:1,best_w3:17,best_accuracy:0.573390915240127,best_log_loss:1.0038494922738124
best_w1:1,best_w2:1,best_w3:19,best_accuracy:0.573390915240127,best_log_loss:1.003791645780852
best_w1:1,best_w2:1,best_w3:20,best_accuracy:0.5734949789271034,best_log_loss:1.0037803577474709
best_w1:1,best_w2:1,best_w3:21,best_accur

## Submission

In [8]:
blend_test_pred_pro=(best_w1*rf_test_pred_pro+best_w2*lgb_test_pred_pro+best_w3*xgb_test_pred_pro+(step-best_w1-best_w2-best_w3)*cat_test_pred_pro)/step
blend_test_pred_pro=blend_test_pred_pro.mean(axis=0)
print("submission")
submission=pd.read_csv(Config.submission_path)
for idx in range(len(Config.TARGET_NAME)):
    submission[Config.TARGET_NAME[idx]]=blend_test_pred_pro[:,idx]
submission.to_csv("baseline.csv",index=None)
submission.head()

submission


Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.492274,0.003693,0.001988,1.3e-05,0.017478,0.125169,0.359013
1,19220,0.303282,0.018033,0.003649,3.5e-05,0.163102,0.14758,0.364048
2,19221,0.098015,0.030024,0.037122,0.000963,0.003747,0.295728,0.534159
3,19222,0.179042,0.007938,0.000136,0.000952,0.005118,0.374053,0.432555
4,19223,0.043998,0.003655,0.000308,0.002418,0.002201,0.646379,0.300853


There are multiple faults or no faults here, and models will be trained separately for each category and predicted in the future.