In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from joblib import dump

In [4]:
# Load the training table, using the `id` column as the row index.
train_data = pd.read_csv('train.csv', index_col='id')

# Columns passed to the imputer: `loading` plus every column whose name
# starts with `measurement`.
missing_list = [
    name
    for name in train_data.columns
    if name == 'loading' or name.startswith('measurement')
]

# Columns summarised row-wise (mean / std) during preprocessing:
# every float64 column of the freshly loaded frame except `loading`.
aggregate_list = [
    name
    for name, dtype in train_data.dtypes.items()
    if dtype == 'float64' and name != 'loading'
]

# Model inputs. `m3_missing`, `m5_missing`, `area`, `m3_17_avg` and
# `m3_17_stdev` are engineered columns that the preprocessing step adds.
chosen_features = [
    'loading',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    'measurement_17',
    'attribute_0',
    'm3_missing',
    'm5_missing',
    'area',
    'm3_17_avg',
    'm3_17_stdev',
]

In [5]:
# preprocess data features
#
# NOTE: this cell mutates train_data in place and is therefore not idempotent.
# Re-run the data-loading cell before executing it again; otherwise the
# missing-value indicators below would be computed on already-imputed columns.
#
# NOTE(review): the imputer and the scaler are fitted on every row of
# train_data, including rows that later act as validation folds, so
# cross-validation scores computed on this frame are slightly optimistic.

# add indicators for missing values in measurement 3 & 5
# (must run before imputation, which fills the NaNs these flags record)
train_data['m3_missing'] = train_data.measurement_3.isna()
train_data['m5_missing'] = train_data.measurement_5.isna()

# fill in missing values; `imputer` stays available for transforming new data
imputer = KNNImputer(n_neighbors=15)
train_data[missing_list] = imputer.fit_transform(train_data[missing_list])

# label encode string features: product_code, attribute_0, attribute_1
# The fitted encoders are kept (keyed by column name) so the same
# string -> integer mapping can be applied to data outside train_data.
label_encoders = {}
for column in ['product_code', 'attribute_0', 'attribute_1']:
    encoder = LabelEncoder()
    train_data[column] = encoder.fit_transform(train_data[column])
    label_encoders[column] = encoder

# clip measurement_2 values that are below 11
train_data['measurement_2'] = train_data['measurement_2'].clip(lower=11)

# add area feature by multiplying attribute_2 and attribute_3
train_data['area'] = train_data['attribute_2'] * train_data['attribute_3']

# add row-wise average and stdev over the columns in aggregate_list
# (expected to be measurement_3 ~ measurement_17 -- TODO confirm against the data)
train_data['m3_17_avg'] = train_data[aggregate_list].mean(axis=1)
train_data['m3_17_stdev'] = train_data[aggregate_list].std(axis=1)

# standardize the model inputs (only the columns in chosen_features)
# The fitted scaler is kept so the identical shift/scale can be reused on
# any data the trained model is later applied to.
scaler = StandardScaler()
train_data[chosen_features] = scaler.fit_transform(train_data[chosen_features])

In [6]:
# Estimate performance with 5-fold GroupKFold cross validation, using
# product_code as the group so no product appears in both train and validation.
#
# NOTE(review): train_data was imputed and standardized on all rows before this
# split, so each validation fold has influenced the preprocessing statistics.
GKF = GroupKFold(n_splits=5)
AUC = []

fold_splits = GKF.split(train_data, train_data.failure, train_data.product_code)
for fold, (train_index, val_index) in enumerate(fold_splits):

    # rows used to fit this fold's model
    x_train = train_data.iloc[train_index]
    y_train = x_train.failure

    # held-out rows used to score it
    x_val = train_data.iloc[val_index]
    y_val = x_val.failure

    model = LogisticRegression(
        penalty='l1',
        C=0.01,
        solver='liblinear',
        random_state=1,
    )
    model.fit(x_train[chosen_features], y_train)

    # score on the probability predict_proba reports in column 1
    y_pred = model.predict_proba(x_val[chosen_features])[:, 1]
    score = roc_auc_score(y_val, y_pred)
    AUC.append(score)
    print(f"Fold {fold}: {score:.5f}")

print(f"Average AUC = {sum(AUC) / len(AUC):.5f}")

Fold 0: 0.58763
Fold 1: 0.58258
Fold 2: 0.59069
Fold 3: 0.59800
Fold 4: 0.59583
Average AUC = 0.59094


In [7]:
# Train the final classifier on the whole training frame and write it to disk.
#
# NOTE(review): only the classifier is persisted here; the imputation, encoding
# and scaling applied to train_data in the preprocessing cell are not part of
# 'weights.joblib' and must be reproduced separately before predicting.
model = LogisticRegression(
    penalty='l1',
    C=0.01,
    solver='liblinear',
    random_state=1,
).fit(train_data[chosen_features], train_data.failure)  # fit() returns the estimator itself

dump(model, 'weights.joblib')

['weights.joblib']