In [1]:
import numpy
import pandas
import joblib

# Instructions

- Read **the train data** from the CSV file and properly set the index
- Use `joblib` to load the trained model and print out the model parameters

In [2]:
# Load the training split and use its `id` column as the row index.
data_train = pandas.read_csv('./data/features.train.csv', index_col='id')
data_train

Unnamed: 0_level_0,feature_1,feature_2,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
253,2.2189,0.3114,1
667,-0.2929,0.3086,1
85,0.6741,-0.4251,1
969,0.9695,-0.5618,1
75,-1.5488,0.6378,0
...,...,...,...
835,0.6986,-0.2398,1
192,-0.4636,0.5754,0
629,0.6531,0.2976,0
559,1.1244,1.4332,0


In [3]:
# NOTE: joblib files are pickle-based, so only load model files from a trusted source.
model_path = 'model/model.joblib'
model = joblib.load(model_path)
model

# Instructions
- Manually calculate the prediction probability and the predicted class for the input `sample` with `feature_1 = 0.5` and `feature_2 = 0`
  - Hint: define a function `euclidean_distance(x, y)` 
- Verify that the manual calculations match the outputs of `.predict` and `.predict_proba`.


In [4]:
def euclidean_distance(x, y):
    """Return the Euclidean distance from every row of `x` to the point `y`.

    Parameters
    ----------
    x : pandas.DataFrame
        One point per row, one coordinate per column.
    y : array-like
        A single point with one coordinate per column of `x`
        (matched by position when `y` is a numpy array).

    Returns
    -------
    pandas.Series
        Distance of each row of `x` to `y`, indexed like `x`.
    """
    # NumPy ufuncs work directly on pandas objects, so no `.apply` is needed.
    return numpy.sqrt(numpy.square(x - y).sum(axis='columns'))


# NOTE(review): the instructions for this exercise ask for feature_1 = 0.5, while this
# point and the `sample` passed to the model further down use feature_1 = 1.0.
# Confirm which value is intended before changing either cell.
selected_data_point = numpy.array([1., 0.])

# Number of nearest training points to keep.
# NOTE(review): presumably the neighbour count of the loaded model; confirm.
number_of_neighbors = 5

distances = euclidean_distance(
    data_train[['feature_1', 'feature_2']],
    selected_data_point,
)

# Closest rows first; keep the full training rows (features and label).
neighbors = data_train.loc[
    distances
    .sort_values(axis='index', ascending=True)
    .iloc[:number_of_neighbors]
    .index
]

neighbors

Unnamed: 0_level_0,feature_1,feature_2,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
838,1.0008,0.0344,0
61,1.0548,0.0661,1
465,0.9724,-0.0864,0
43,0.9078,0.0356,1
727,1.116,0.001,0


In [5]:
# Share of the selected neighbours whose label is 1: the manual estimate of P(y=1 | x).
neighbor_labels = neighbors['label']
neighbor_labels.mean()

0.4

In [6]:
# Single-row frame holding the point to classify, with the training feature column names.
sample = pandas.DataFrame(
    data=[[1., 0.]],
    columns=['feature_1', 'feature_2'],
)
sample

Unnamed: 0,feature_1,feature_2
0,1.0,0.0


In [7]:
# Class probabilities the loaded model assigns to the hand-built `sample`.
probabilities_from_sklearn = model.predict_proba(sample)

# Show only column 1, to compare with the mean neighbour label computed manually.
# NOTE(review): assumes column 1 corresponds to label 1; confirm via `model.classes_`.
positive_class_column = 1
probabilities_from_sklearn[:, positive_class_column]

array([0.4])

In [8]:
# Class predicted by the loaded model for the same `sample`, shown for comparison
# with the manually derived result.
prediction_from_sklearn = model.predict(sample)
prediction_from_sklearn

array([0])

# Instructions
- Use `.predict_proba` to calculate prediction probabilities for all training samples
- Properly put the prediction probabilities $\mathrm{P}(y=1| \mathbf{x})$ into `pandas.DataFrame`, with a column named `probability`
- The prediction probabilities $\mathrm{P}(y=0| \mathbf{x})$ can be ignored

In [9]:
# Score every training row using only the two feature columns (the label is excluded).
feature_columns = ['feature_1', 'feature_2']
probabilities_from_sklearn = model.predict_proba(data_train[feature_columns])
probabilities_from_sklearn


array([[0. , 1. ],
       [0. , 1. ],
       [0.2, 0.8],
       ...,
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ]])

In [10]:
# Build the frame straight from the model rather than slicing the previous value of
# `probabilities_from_sklearn`. This cell rebinds that name to a DataFrame, so slicing
# it with `[:, 1]` would fail if the cell were run a second time.
# Column 1 of `predict_proba` is kept; column 0 is ignored, as the instructions allow.
probabilities_from_sklearn = pandas.DataFrame(
    data=model.predict_proba(data_train[['feature_1', 'feature_2']])[:, 1],
    index=data_train.index,  # keep the training `id` values as row labels
    columns=['probability'],
)
probabilities_from_sklearn

Unnamed: 0_level_0,probability
id,Unnamed: 1_level_1
253,1.0
667,1.0
85,0.8
969,1.0
75,0.0
...,...
835,0.8
192,0.2
629,0.0
559,0.0


# Instructions
- Use `.predict` to produce predictions for all training samples
- Properly put the predictions into `pandas.DataFrame`, with a column named `prediction`

In [11]:
# Predict a class for every training row from the two feature columns only.
feature_columns = ['feature_1', 'feature_2']
predictions_from_sklearn = model.predict(data_train[feature_columns])
predictions_from_sklearn

array([1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,

In [12]:
# Wrap the predictions in a one-column frame labelled by the training `id` index.
train_ids = data_train.index
predictions_from_sklearn = pandas.DataFrame(
    predictions_from_sklearn, index=train_ids, columns=['prediction']
)
predictions_from_sklearn

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
253,1
667,1
85,1
969,1
75,0
...,...
835,1
192,0
629,0
559,0
