In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from sklearn.metrics import precision_recall_curve, roc_curve, auc, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Load the training features, the training labels and the unlabeled
# test set from the CSV files in the working directory.
train_x, train_y_data, test = map(
    pd.read_csv,
    ("train_x.csv", "train_y.csv", "public_private_X.csv"),
)

# Show which feature columns are available.
train_x.columns

Index(['ID', 'DIVISION_NUMBER', 'PRODUCT_NUMBER', 'PURCHASE_ORDER_DUE_DATE',
       'COMPANY_VENDOR_NUMBER', 'SHIP_FROM_VENDOR', 'ORDER_DATE',
       'ORDER_DAY_OF_WEEK', 'PRODUCT_CLASSIFICATION', 'PURCHASE_ORDER_TYPE',
       'DISTANCE_IN_MILES', 'DIVISION_CODE', 'PURCHASE_FROM_VENDOR',
       'AVERAGE_PRODUCT_ORDER_QUANTITY_MARKET', 'ORDER_QUANTITY_DEVIATION',
       'TRANSIT_LEAD_TIME', 'PURCHASING_LEAD_TIME',
       'DAYS_BETWEEN_ORDER_AND_DUE_DATE', 'GIVEN_TIME_TO_LEAD_TIME_RATIO',
       'DUE_DATE_WEEKDAY', 'PRODUCT_MARKET', 'RESERVABLE_INDICATOR',
       'PRODUCT_STATUS', 'AVERAGE_DAILY_DEMAND_CASES',
       'AVERAGE_VENDOR_ORDER_CYCLE_DAYS', 'AVERAGE_ORDER_CYCLE_DAYS',
       'AVERAGE_ORDER_CYCLE_CASES', 'LEAD_TIME_TO_DISTANCE_RATIO'],
      dtype='object')

In [2]:
# Join features and labels on the shared ID key (inner join, the default),
# then report how many rows survived the join.
train = train_x.merge(train_y_data, on="ID")
len(train)

20413

In [3]:
# Number of missing values in each column of the merged training frame.
missing_per_column = train.isna().sum()
missing_per_column

ID                                         0
DIVISION_NUMBER                            0
PRODUCT_NUMBER                             0
PURCHASE_ORDER_DUE_DATE                    0
COMPANY_VENDOR_NUMBER                      0
SHIP_FROM_VENDOR                           0
ORDER_DATE                                 0
ORDER_DAY_OF_WEEK                          0
PRODUCT_CLASSIFICATION                     0
PURCHASE_ORDER_TYPE                        0
DISTANCE_IN_MILES                          0
DIVISION_CODE                              0
PURCHASE_FROM_VENDOR                       0
AVERAGE_PRODUCT_ORDER_QUANTITY_MARKET      0
ORDER_QUANTITY_DEVIATION                   0
TRANSIT_LEAD_TIME                          0
PURCHASING_LEAD_TIME                       0
DAYS_BETWEEN_ORDER_AND_DUE_DATE            0
GIVEN_TIME_TO_LEAD_TIME_RATIO              0
DUE_DATE_WEEKDAY                           0
PRODUCT_MARKET                             0
RESERVABLE_INDICATOR                       0
PRODUCT_ST

In [4]:
# Fill the gaps in the demand / order-cycle columns with each column's own
# median. `DataFrame.fillna` with a Series matches the fill values to
# columns by label, so every column gets its own median.
median_imputed_cols = [
    "AVERAGE_DAILY_DEMAND_CASES",
    "AVERAGE_VENDOR_ORDER_CYCLE_DAYS",
    "AVERAGE_ORDER_CYCLE_DAYS",
    "AVERAGE_ORDER_CYCLE_CASES",
]
train[median_imputed_cols] = train[median_imputed_cols].fillna(
    train[median_imputed_cols].median()
)

In [5]:
# Build the feature matrix from the merged training frame.
# `drop` returns a new frame, so `train` itself is left untouched and this
# cell can be re-run safely.
x_train_data = train.drop(
    columns=[
        # target
        "ON_TIME_AND_COMPLETE",
        # identifier columns (ID is dropped as well)
        "ID", "DIVISION_NUMBER", "PRODUCT_NUMBER",
        # columns with 0 variance
        "RESERVABLE_INDICATOR", "PRODUCT_STATUS",
        # non-numeric column
        "DIVISION_CODE",
    ]
)

# Convert the date columns to float seconds since the Unix epoch.
# Subtracting the epoch and dividing by a one-second Timedelta avoids
# `.astype(int)`, which depends on the platform's default integer width,
# and does not assume the parsed timestamps are stored in nanoseconds.
# NOTE(review): assumes the date strings are timezone-naive — confirm.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
for date_col in ("PURCHASE_ORDER_DUE_DATE", "ORDER_DATE"):
    parsed_dates = pd.to_datetime(x_train_data[date_col])
    x_train_data[date_col] = (parsed_dates - unix_epoch) / one_second

x_train_data.head()

Unnamed: 0,PURCHASE_ORDER_DUE_DATE,COMPANY_VENDOR_NUMBER,SHIP_FROM_VENDOR,ORDER_DATE,ORDER_DAY_OF_WEEK,PRODUCT_CLASSIFICATION,PURCHASE_ORDER_TYPE,DISTANCE_IN_MILES,PURCHASE_FROM_VENDOR,AVERAGE_PRODUCT_ORDER_QUANTITY_MARKET,...,PURCHASING_LEAD_TIME,DAYS_BETWEEN_ORDER_AND_DUE_DATE,GIVEN_TIME_TO_LEAD_TIME_RATIO,DUE_DATE_WEEKDAY,PRODUCT_MARKET,AVERAGE_DAILY_DEMAND_CASES,AVERAGE_VENDOR_ORDER_CYCLE_DAYS,AVERAGE_ORDER_CYCLE_DAYS,AVERAGE_ORDER_CYCLE_CASES,LEAD_TIME_TO_DISTANCE_RATIO
0,1406765000.0,1295,1371,1406333000.0,0,36,0,339.02,2196,5.13,...,6.0,5,0.71,6,5327,1.48,7.0,7.020016,9.313729,0.021
1,1409616000.0,1295,1371,1408493000.0,5,36,0,339.02,1481,61.65,...,14.0,13,0.65,1,6135,8.49,7.0,7.0,52.623575,0.059
2,1404778000.0,1632,29,1403914000.0,0,9,1,2112.66,2399,20.0,...,8.0,10,1.25,1,3625,0.87,22.343,32.055321,23.822102,0.004
3,1406851000.0,365,407,1406160000.0,6,26,1,35.92,254,61.42,...,8.0,8,1.0,4,1709,2.93,10.511,16.152909,62.79,0.223
4,1410394000.0,41,42,1409702000.0,5,0,0,257.34,1895,162.43,...,15.0,8,0.5,6,193,13.04,9.146,9.146893,161.962938,0.062


In [6]:
# Drop the same columns in test data as were dropped from the training
# features: identifiers, zero-variance columns and the non-numeric column.
test = test.drop(
    columns=[
        "ID", "DIVISION_NUMBER", "PRODUCT_NUMBER",
        "RESERVABLE_INDICATOR", "PRODUCT_STATUS",
        "DIVISION_CODE",
    ]
)

# Convert the date columns to float seconds since the Unix epoch.
# Subtracting the epoch and dividing by a one-second Timedelta avoids
# `.astype(int)`, which depends on the platform's default integer width,
# and does not assume the parsed timestamps are stored in nanoseconds.
# NOTE(review): assumes the date strings are timezone-naive — confirm.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
for date_col in ("PURCHASE_ORDER_DUE_DATE", "ORDER_DATE"):
    parsed_dates = pd.to_datetime(test[date_col])
    test[date_col] = (parsed_dates - unix_epoch) / one_second

In [7]:
# Impute NaN values in test with the medians of the TRAINING data.
# Preprocessing statistics should come from the training set only, as the
# scaler already does (fit on train, applied to test); filling test with
# its own medians would preprocess train and test rows differently.
# `train` has already been median-filled at this point, which leaves its
# column medians unchanged.
median_imputed_cols = [
    "AVERAGE_DAILY_DEMAND_CASES",
    "AVERAGE_VENDOR_ORDER_CYCLE_DAYS",
    "AVERAGE_ORDER_CYCLE_DAYS",
    "AVERAGE_ORDER_CYCLE_CASES",
]
train_medians = train[median_imputed_cols].median()
test[median_imputed_cols] = test[median_imputed_cols].fillna(train_medians)

In [8]:
# Take the labels from the merged frame, not from the raw label file:
# the feature matrix is derived from `train`, so reading the target from
# the same frame guarantees that row i of the features and element i of
# the labels describe the same ID even if the merge reordered or dropped rows.
train_y = train["ON_TIME_AND_COMPLETE"].to_numpy()


In [9]:
# Standardize the predictors: learn the scaling statistics from the
# training features only, then apply that same transform to both sets.
scaler = StandardScaler().fit(x_train_data)
x_train_scaled = scaler.transform(x_train_data)
x_test_scaled = scaler.transform(test)

In [11]:
# Fit an unpenalized logistic regression on the scaled training features
# (`fit` returns the estimator itself, so construction and fitting chain).
model = LogisticRegression(penalty=None, max_iter=1000).fit(x_train_scaled, train_y)

# Score the test set: hard class labels plus the probability of the
# second class (column 1 of `predict_proba`).
test_class_probs = model.predict_proba(x_test_scaled)
y_pred_test = model.predict(x_test_scaled)
y_pred_probs = test_class_probs[:, 1]

# Show a preview of both prediction arrays.
print("Predicted labels:", y_pred_test)
print("Predicted probabilities:", y_pred_probs)

Predicted labels: [1 0 0 ... 1 0 1]
Predicted probabilities: [0.75673636 0.36282668 0.41221916 ... 0.80857366 0.29183208 0.52397767]


In [15]:
# Build the submission file. The ID column was dropped from `test` during
# preprocessing, so the original file is read again to recover the IDs,
# which are then paired positionally with the predicted labels.
original_test = pd.read_csv("public_private_X.csv")
submission = original_test[["ID"]].assign(ON_TIME_AND_COMPLETE=y_pred_test)
submission.to_csv("first_submission.csv", index=False)