# Tabular Playground Series - Aug 2022

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [32]:
train_df = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv")
test_df = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv")

In [33]:
print(train_df.shape)
print(test_df.shape)

(26570, 26)
(20775, 25)


In [34]:
display(train_df.head())
display(test_df.head())

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,26570,F,119.57,material_5,material_6,6,4,6,9,6,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,26571,F,113.51,material_5,material_6,6,4,11,8,0,...,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
2,26572,F,112.16,material_5,material_6,6,4,8,12,4,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,26573,F,112.72,material_5,material_6,6,4,8,11,10,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,26574,F,208.0,material_5,material_6,6,4,14,16,8,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [35]:
measurement_cols = [i for i in train_df.columns if "measurement" in i]

In [36]:
import pickle
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC
import sklearn


In [37]:
#remove the features we don't need
train_df = train_df.drop(columns=["id"])
test_df = test_df.drop(columns=["id"])

In [38]:
X = train_df.drop(columns="failure")
y = train_df["failure"]

X_test = test_df

In [39]:
def impute_and_scale(train_data, val_data):
    """
    impute and scale the data
    """
    scaler = PowerTransformer()
    imputer = SimpleImputer(strategy="mean")
    
    #generate new data
    train_data['m_3_missing'] = train_data.measurement_3.isna()
    train_data['m_5_missing'] = train_data.measurement_5.isna()
    val_data['m_3_missing'] = val_data.measurement_3.isna()
    val_data['m_5_missing'] = val_data.measurement_5.isna()
    
    imputer.fit(train_data[measurement_cols + ["loading"] + ["m_3_missing"] + ["m_5_missing"]])
    
    filled_train = imputer.transform(train_data[measurement_cols + ["loading"] + ["m_3_missing"] + ["m_5_missing"]])
    filled_val = imputer.transform(val_data[measurement_cols + ["loading"] + ["m_3_missing"] + ["m_5_missing"]])
    
    scaled_train = scaler.fit_transform(filled_train)
    scaled_val = scaler.transform(filled_val)
    
    #back to dataframe
    new_train = train_data.copy()
    new_val = val_data.copy()
    
    new_train[measurement_cols + ["loading"] + ["m_3_missing"] + ["m_5_missing"]] = filled_train
    new_val[measurement_cols + ["loading"] + ["m_3_missing"] + ["m_5_missing"]] = filled_val
    
    
    return new_train, new_val
    

In [40]:
def _ohe(train_data, val_data):
    """
    one-hot-encodes the data
    """
    
    new_train = pd.get_dummies(train_data, columns=["product_code","attribute_0", "attribute_1", "attribute_2", "attribute_3"])
    new_val = pd.get_dummies(val_data, columns=["product_code","attribute_0", "attribute_1", "attribute_2", "attribute_3"])
    
    #columns are not currently the same, concat so that they are
    train_val = pd.concat([new_train, new_val]).fillna(0) #creates some empty columns, fill these with 0's
    
    #extract train and val again
    new_train = train_val.iloc[0:len(train_data)]
    new_val = train_val.iloc[len(train_data):]
    
    assert len(train_data) == len(new_train)
    assert len(val_data) == len(val_data)
    
    return new_train, new_val
    

In [41]:
def k_fold_cv(model,X,y):
    kfold = GroupKFold(n_splits=5)

    feature_imp, y_pred_list, y_true_list, roc_list  = [],[],[],[]
    for fold, (train_index, val_index) in enumerate(kfold.split(X, y, train_df["product_code"])):
        print("===== fold", fold, "=====")
        X_train = X.loc[train_index]
        X_val = X.loc[val_index]

        y_train = y.loc[train_index]
        y_val = y.loc[val_index]
            
        #impute and cale
        X_train, X_val = impute_and_scale(X_train, X_val)
            
        #encode categorical variables
        X_train, X_val = _ohe(X_train, X_val)
        pca = PCA(n_components=0.96, iterated_power=1)
        X_train = pca.fit_transform(X_train)
        X_val = pca.transform(X_val)

        
        model.fit(X_train,y_train)
            
        y_pred = model.predict_proba(X_val)[:,1]
            
        y_pred_list = np.append(y_pred_list, y_pred)
        y_true_list = np.append(y_true_list, y_val)
        
        roc_list.append(roc_auc_score(y_val,y_pred))
        print("roc auc", roc_auc_score(y_val,y_pred))
            
        
    return feature_imp, y_pred_list, y_true_list, roc_list, X_val, y_val, model

In [42]:
adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=4),n_estimators=10,learning_rate=0.6)
rf = RandomForestClassifier(n_estimators=10, max_depth=10)
# evc=VotingClassifier(estimators=[('mnb',mnb),('lr',lr),('rf',rf),('svm',svm)],voting='hard')
# mnb = MultinomialNB().fit(x_train, y_train)
clf=sklearn.svm.SVC(kernel='linear',probability=True)
model = LogisticRegression(penalty='elasticnet', l1_ratio=0.85, C=0.005, tol = 1e-2, solver='saga', max_iter=3000, random_state=5, class_weight='balanced')
bg=BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=0.6,n_estimators=30)

In [43]:
%%time
feature_imp, y_pred_list, y_true_list, roc_list, X_val, y_val, m = k_fold_cv(model=model,X=X,y=y)
with open('/kaggle/working/model.pickle', 'wb') as f:
    pickle.dump(m, f)

===== fold 0 =====
roc auc 0.5878371115800105
===== fold 1 =====
roc auc 0.5792870949582619
===== fold 2 =====
roc auc 0.5889815809447555
===== fold 3 =====
roc auc 0.5985753147482015
===== fold 4 =====
roc auc 0.595864059589909
CPU times: user 5.46 s, sys: 1.91 s, total: 7.37 s
Wall time: 4.36 s
