In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from joblib import load

In [3]:
# Load the test set; the 'id' column becomes the frame's index.
test_data = pd.read_csv('test.csv', index_col='id')

# Columns handed to the imputer: 'loading' plus every 'measurement*' column.
missing_list = [
    column for column in test_data.columns
    if column.startswith('measurement') or column == 'loading'
]

# Columns used for the row-wise mean/stdev features: every float64 column
# except 'loading'.
aggregate_list = [
    column for column, dtype in test_data.dtypes.items()
    if dtype == 'float64' and column != 'loading'
]

# Feature columns that get standardized and passed to the model, in this order.
chosen_features = [
    'loading',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    'measurement_17',
    'attribute_0',
    'm3_missing',
    'm5_missing',
    'area',
    'm3_17_avg',
    'm3_17_stdev',
]

In [4]:
# preprocess data features
# NOTE(review): the imputer, label encoders and scaler below are all fitted on
# the test set itself -- confirm this matches how the training data was prepared.

# flag rows where measurement_3 / measurement_5 are missing; this has to run
# before the imputer fills those gaps
for measured, flag in (('measurement_3', 'm3_missing'), ('measurement_5', 'm5_missing')):
    test_data[flag] = test_data[measured].isna()

# fill in missing values with a 15-nearest-neighbour imputer
imputer = KNNImputer(n_neighbors=15)
test_data[missing_list] = imputer.fit_transform(test_data[missing_list])

# label encode product_code, attribute_0 and attribute_1 (a fresh encoder per column)
for categorical in ('product_code', 'attribute_0', 'attribute_1'):
    test_data[categorical] = LabelEncoder().fit_transform(test_data[categorical])

# raise measurement_2 values that are below 11 up to 11
test_data['measurement_2'] = test_data['measurement_2'].clip(lower=11)

# area feature: attribute_2 times attribute_3
test_data['area'] = test_data['attribute_2'].mul(test_data['attribute_3'])

# row-wise mean and standard deviation over the aggregate_list columns
aggregate_values = test_data[aggregate_list]
test_data['m3_17_avg'] = aggregate_values.mean(axis='columns')
test_data['m3_17_stdev'] = aggregate_values.std(axis='columns')

# standardize the chosen feature columns; all other columns are left untouched
scaler = StandardScaler()
test_data[chosen_features] = scaler.fit_transform(test_data[chosen_features])

In [5]:
# load the trained model & make predictions (no fitting happens in this notebook)
# NOTE(review): the saved estimator is presumably a
# LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=1)
# -- confirm against the training notebook.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code, so
# only load 'weights.joblib' from a trusted source.
model = load('weights.joblib')

# keep column 1 of predict_proba as the failure score
# (assumes model.classes_[1] is the "failure" class -- TODO confirm)
predictions = model.predict_proba(test_data[chosen_features])[:, 1]

In [6]:
# Build the two-column submission table (id, failure) and write it to
# submission.csv without the positional row index.
failure_scores = pd.Series(predictions, index=test_data.index, name='failure')
submission = failure_scores.rename_axis('id').reset_index()
submission.to_csv('submission.csv', index=False)