In [61]:
import pandas as pd
import numpy as np
# run the code in the same folder as the preprocessed test & training data
train_data_withid = pd.read_csv(r"70000data_with_na_no_str.csv")
train_data = train_data_withid.drop(columns=['unique_id'],inplace=False)

# split the responses vector from training data
label_values = train_data["nEvent"]
train_data.drop(columns=["nEvent"],inplace=True)

# get all column names
col_names = train_data.columns

test_data_withid = pd.read_csv(r"test_data_with_na_no_str.csv")

# drop the column with lowest correlation with nEvent
test_data_withid.drop(columns=["eatSpicy"],inplace=True)
test_data = test_data_withid.drop(columns=['unique_id'],inplace=False)


In [9]:
# knn imputation transform for both trainig and test dataset. It is used to automatically fill the NaN values existing
from sklearn.impute import KNNImputer

# define imputer
imputer = KNNImputer(n_neighbors=10, weights='uniform', metric='nan_euclidean')
# fit on the training dataset
imputer.fit(train_data)
# transform the training dataset
train_trans = imputer.transform(train_data)

In [39]:
# define imputer
imputer = KNNImputer(n_neighbors=10, weights='uniform', metric='nan_euclidean')
# fit on the testing dataset
imputer.fit(test_data)

# fit on the testing dataset
test_trans = imputer.transform(test_data)

In [43]:
# import logistic linear regresion model from sklearn
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=0,solver='liblinear',max_iter=200).fit(train_trans,label_values)


In [48]:
# make probablity prediction, the first column is the probability of not having a claim, the second column is the probablity of having a claim
prob_pred = model.predict_proba(test_trans)
prob_pred

array([[0.99584487, 0.00415513],
       [0.96360964, 0.03639036],
       [0.97716647, 0.02283353],
       ...,
       [0.98313388, 0.01686612],
       [0.98946384, 0.01053616],
       [0.99090039, 0.00909961]])

In [66]:
pred_vector = prob_pred[:,1:]

In [50]:
# matching unique id with the probability of having a claim
unique_id = test_data_withid["unique_id"].to_numpy()

unique_id = unique_id[:,None]
result = np.concatenate((unique_id,pred_vector),axis=1)

In [52]:
# change the np array back to dataframe
result_df = pd.DataFrame(result,columns=["unique_id","prob"])

In [53]:
result_df

Unnamed: 0,unique_id,prob
0,14800.0,0.004155
1,278323.0,0.036390
2,250032.0,0.022834
3,64646.0,0.019106
4,106064.0,0.053520
...,...,...
29995,62939.0,0.053384
29996,357914.0,0.013951
29997,381410.0,0.016866
29998,346498.0,0.010536


In [54]:
# output the result csv file
compression_opts = dict(method='zip',
                        archive_name='out.csv')  
result_df.to_csv('out.zip', index=False,
          compression=compression_opts) 