## Imports

In [39]:
import pandas as pd
import pickle
from scipy.stats import uniform

from sklearn import set_config; set_config(display='diagram')

from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

# Tuning Pipeline

👇 Consider the following dataset.

In [2]:
data = pd.read_csv("https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/08-Workflow/tuning_pipeline_data.csv")
data.head()

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers,target_5y
0,36.0,27.4,7.4,2.6,7.6,,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,35.0,26.9,,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,58.0,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,48.0,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


- Each observation represents a player
- Each column represents a characteristic of a player's performance

The target defines whether the player lasted less than 5 years (`0`) vs. 5 years or more (`1`) as a professional.

In [3]:
X = data.drop(columns="target_5y")
y = data['target_5y']

## Pipeline

👇 We are giving you the simple pipeline below

In [7]:
# Preprocessing pipe
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaling', MinMaxScaler())
])
preprocessor.fit_transform(X)
# Final pipe
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model_svm', SVC())
])

pipe.fit(X,y)

## Fine-Tuning

Our task is to assist in the recruitment process of promising young players.  
The model should **limit false alarms as much as possible** to avoid recruiting players that will flop.

❓ **Fine-tune this pipeline to maximize your objective**

- Use the `scoring` metric appropriate for the task
- Grid Search for the optimal
    - imputing `strategy`
    - `kernel`
    - regularization factor `C`
- Store your random search results in a `search` variable

In [9]:
set_config(display='text')

pipe.get_params()

{'memory': None,
 'steps': [('preprocessing',
   Pipeline(steps=[('imputer', SimpleImputer()), ('scaling', MinMaxScaler())])),
  ('model_svm', SVC())],
 'verbose': False,
 'preprocessing': Pipeline(steps=[('imputer', SimpleImputer()), ('scaling', MinMaxScaler())]),
 'model_svm': SVC(),
 'preprocessing__memory': None,
 'preprocessing__steps': [('imputer', SimpleImputer()),
  ('scaling', MinMaxScaler())],
 'preprocessing__verbose': False,
 'preprocessing__imputer': SimpleImputer(),
 'preprocessing__scaling': MinMaxScaler(),
 'preprocessing__imputer__add_indicator': False,
 'preprocessing__imputer__copy': True,
 'preprocessing__imputer__fill_value': None,
 'preprocessing__imputer__keep_empty_features': False,
 'preprocessing__imputer__missing_values': nan,
 'preprocessing__imputer__strategy': 'mean',
 'preprocessing__scaling__clip': False,
 'preprocessing__scaling__copy': True,
 'preprocessing__scaling__feature_range': (0, 1),
 'model_svm__C': 1.0,
 'model_svm__break_ties': False,
 'model

In [35]:
grid_search = GridSearchCV(
    pipe,
    param_grid={
        'preprocessing__imputer__strategy': ['mean', 'median'],
        'model_svm__kernel': ['rbf', 'poly', 'linear'],
        'model_svm__C' : [1.0, 2.0, 3.0, 4.0, 5.0]
    },
    cv=5,
    scoring="precision")
grid_search.fit(X, y)

In [36]:
search = grid_search.fit(X, y)
#search = grid_search.best_params_
search.cv_results_

{'mean_fit_time': array([0.01909842, 0.01916666, 0.0205596 , 0.02137265, 0.01367874,
        0.01467772, 0.01859407, 0.01959863, 0.02574587, 0.02624102,
        0.01440482, 0.01521869, 0.01886716, 0.01994224, 0.02971444,
        0.02963796, 0.01465683, 0.01565504, 0.01929379, 0.02009726,
        0.03277378, 0.03416657, 0.01463304, 0.01560903, 0.01977544,
        0.02041411, 0.03916998, 0.03618259, 0.01484413, 0.015763  ]),
 'std_fit_time': array([1.34601261e-03, 5.22104254e-04, 4.39672107e-04, 8.01249578e-04,
        2.79683688e-04, 5.57996228e-04, 3.11145486e-04, 3.09806045e-04,
        1.18289982e-03, 1.52286722e-03, 3.68783262e-04, 4.08105070e-04,
        3.14045133e-04, 3.67951275e-04, 2.07535609e-03, 1.27296543e-03,
        6.78824883e-05, 1.22820979e-04, 2.65894935e-04, 3.13176322e-04,
        2.01495378e-03, 2.72627402e-03, 1.21409389e-04, 1.18723835e-04,
        3.88056261e-04, 1.99678371e-04, 5.04228139e-03, 2.34391073e-03,
        2.02883166e-04, 1.99100293e-04]),
 'mean_scor

In [38]:
search.best_params_

{'model_svm__C': 1.0,
 'model_svm__kernel': 'poly',
 'preprocessing__imputer__strategy': 'median'}

In [37]:
from nbresult import ChallengeResult

result = ChallengeResult(
    'solution',
    scoring = search.scoring,
    cv = search.cv,
    mean_test_score = search.cv_results_['mean_test_score']
)

result.write()
print(result.check())


platform darwin -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /Users/victoriadorosenco/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/victoriadorosenco/code/victometer/05-ML/08-Workflow/data-tuning-pipeline/tests
plugins: asyncio-0.19.0, typeguard-2.13.3, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 1 item

test_solution.py::TestSolution::test_cv_results [32mPASSED[0m[32m                   [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/solution.pickle

[32mgit[39m commit -m [33m'Completed solution step'[39m

[32mgit[39m push origin master



## Export

Once you have built your optimal pipeline, export it as a pickle file

In [41]:
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaling', MinMaxScaler())
])
preprocessor.fit_transform(X)
# Final pipe
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model_svm', SVC(kernel='poly', C=1.0))
])

pipe.fit(X,y)

In [42]:
# YOUR CODE HERE
# pipe.fit()
with open("pipe.pkl", "wb") as file:
    pickle.dump(pipeline_tuned, file)

NameError: name 'pipeline_tuned' is not defined

🏁 Congratulation. Don't forget to add, commit and push your notebook.