In [1]:
import numpy
import pandas
import joblib

# Keep printed arrays short: arrays with more than 10 elements are summarized with "...".
numpy.set_printoptions(threshold=10)

# Instructions

- Read **the train data** from the CSV file and properly set the index
- Use `joblib` to load the trained model and print out the model parameters
- Again, print out those model parameters (`coef_` and `intercept_`)

In [2]:
# Load the training split, using the `id` column as the row index.
data_train = pandas.read_csv('./data/features.train.csv', index_col='id')
data_train

Unnamed: 0_level_0,feature_1,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
253,1.4920,1.0676
667,-0.9317,0.8359
85,-1.4077,-1.6642
969,2.2750,0.7641
75,-2.7649,-0.0689
...,...,...
835,-0.1237,-0.8241
192,-1.8828,-1.0807
629,-2.1139,-2.9819
559,2.3218,1.2918


In [3]:
# Deserialize the trained model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code,
# so only load model files that come from a trusted source.
model = joblib.load('model/model.joblib')
# Show the loaded model as the cell output.
model

In [4]:
# Look up the fitted regression step once, then report its learned parameters.
linear_regression = model.named_steps['linear_regression']
print(f"model.coef_     : {linear_regression.coef_}")
print(f"model.intercept_: {linear_regression.intercept_}")

model.coef_     : [ 5.76221252e-01  7.09304861e-01 -1.94038979e+00 ...  4.58183765e-03
  1.34040223e-04 -1.35736586e-04]
model.intercept_: 0.012684897234401957


# Instructions

- Use `.predict` to produce predictions for all training samples
- Properly put the predictions into a `pandas.DataFrame`, with a column named `prediction`


In [5]:
# The double brackets select `feature_1` as a single-column DataFrame (not a Series)
# before it is handed to the model's `.predict`.
predictions_from_sklearn = model.predict(data_train[['feature_1']])
predictions_from_sklearn

array([ 0.6279037 , -0.0630131 , -0.5699558 , ..., -1.01171359,
        1.2799529 , -1.70838039])

In [6]:
# Wrap the raw predictions in a one-column frame that shares the training index.
predictions_from_sklearn = pandas.DataFrame(
    predictions_from_sklearn,
    columns=['prediction'],
    index=data_train.index,
)
predictions_from_sklearn



Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
253,0.627904
667,-0.063013
85,-0.569956
969,1.169160
75,-1.643021
...,...
835,-0.044471
192,-0.563736
629,-1.011714
559,1.279953


# Instructions
- Now, **manually calculate** the predictions for all training samples
  - Hint: add columns `feature_1^2`, `feature_1^3`, ..., `feature_1^15`
  - Hint: use `.coef_`, `.intercept_`, and `.dot`
- Also, properly put the data into a `pandas.DataFrame`, with a column named `prediction`
- Verify that the predictions from `.predict` and manual calculations are equal
  - Hint: use `numpy.allclose`



In [7]:
# Append the polynomial terms feature_1^2 ... feature_1^15 as extra columns,
# built in a single `assign` call instead of one column insertion per loop pass.
data_train = data_train.assign(**{
    f'feature_1^{degree}': data_train['feature_1'] ** degree
    for degree in range(2, 16)
})
data_train

Unnamed: 0_level_0,feature_1,label,feature_1^2,feature_1^3,feature_1^4,feature_1^5,feature_1^6,feature_1^7,feature_1^8,feature_1^9,feature_1^10,feature_1^11,feature_1^12,feature_1^13,feature_1^14,feature_1^15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
253,1.4920,1.0676,2.226064,3.321287,4.955361,7.393399,11.030951,1.645818e+01,2.455560e+01,3.663696e+01,5.466234e+01,8.155621e+01,1.216819e+02,1.815494e+02,2.708716e+02,4.041405e+02
667,-0.9317,0.8359,0.868065,-0.808776,0.753537,-0.702070,0.654119,-6.094424e-01,5.678175e-01,-5.290356e-01,4.929024e-01,-4.592372e-01,4.278713e-01,-3.986477e-01,3.714200e-01,-3.460521e-01
85,-1.4077,-1.6642,1.981619,-2.789525,3.926815,-5.527777,7.781452,-1.095395e+01,1.541988e+01,-2.170656e+01,3.055632e+01,-4.301414e+01,6.055100e+01,-8.523764e+01,1.199890e+02,-1.689086e+02
969,2.2750,0.7641,5.175625,11.774547,26.787094,60.940639,138.639954,3.154059e+02,7.175484e+02,1.632423e+03,3.713762e+03,8.448807e+03,1.922104e+04,4.372786e+04,9.948088e+04,2.263190e+05
75,-2.7649,-0.0689,7.644672,-21.136754,58.441010,-161.583549,446.762354,-1.235253e+03,3.415352e+03,-9.443106e+03,2.610924e+04,-7.218945e+04,1.995966e+05,-5.518646e+05,1.525851e+06,-4.218824e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,-0.1237,-0.8241,0.015302,-0.001893,0.000234,-0.000029,0.000004,-4.431879e-07,5.482234e-08,-6.781524e-09,8.388745e-10,-1.037688e-10,1.283620e-11,-1.587838e-12,1.964155e-13,-2.429660e-14
192,-1.8828,-1.0807,3.544936,-6.674405,12.566570,-23.660338,44.547685,-8.387438e+01,1.579187e+02,-2.973293e+02,5.598116e+02,-1.054013e+03,1.984496e+03,-3.736409e+03,7.034912e+03,-1.324533e+04
629,-2.1139,-2.9819,4.468573,-9.446117,19.968147,-42.210665,89.229125,-1.886214e+02,3.987269e+02,-8.428687e+02,1.781740e+03,-3.766421e+03,7.961837e+03,-1.683053e+04,3.557805e+04,-7.520844e+04
559,2.3218,1.2918,5.390755,12.516256,29.060242,67.472070,156.656652,3.637254e+02,8.444977e+02,1.960755e+03,4.552480e+03,1.056995e+04,2.454131e+04,5.698001e+04,1.322962e+05,3.071653e+05


In [8]:
# Reproduce the model's output by hand: multiply every non-label column by the
# coefficient at the same position, sum the products, and add the intercept.
linear_regression = model.named_steps['linear_regression']
feature_columns = data_train.drop(columns='label')
weighted_sum = feature_columns.dot(linear_regression.coef_.T)

predictions_from_calculation = pandas.DataFrame(
    weighted_sum + linear_regression.intercept_,
    index=data_train.index,
    columns=['prediction'],
)

predictions_from_calculation


Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
253,0.627904
667,-0.063013
85,-0.569956
969,1.169160
75,-1.643021
...,...
835,-0.044471
192,-0.563736
629,-1.011714
559,1.279953


In [9]:
# Evaluates to True when the sklearn predictions and the manual calculation
# agree element-wise within numpy's default tolerances.
numpy.allclose(predictions_from_sklearn, predictions_from_calculation)

True