In [1]:
# Install MLflow from PyPI, pinned to version 1.0.0, through the Databricks library utility
dbutils.library.installPyPI("mlflow", "1.0.0")

In [2]:
import mlflow.sklearn
import tempfile

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

## Load Data

In [4]:
# Every model set is a CSV in the same mounted directory, named spark_pred_<stem>.csv
MODELSET_DIR = '/mnt/xql2001-gr5069/processed/final_project/modelsets/'

def _load_modelset(stem):
  """Read spark_pred_<stem>.csv (with header row) through Spark and return it as a pandas DataFrame."""
  csv_path = MODELSET_DIR + 'spark_pred_' + stem + '.csv'
  return spark.read.csv(csv_path, header=True).toPandas()

# Train sets that include the q1 data, one per resampling strategy
pd_train_w_q1_wo_resamp = _load_modelset('train_w_q1_wo_resamp')
pd_train_w_q1_undersamp = _load_modelset('train_w_q1_undersamp')
pd_train_w_q1_oversamp = _load_modelset('train_w_q1_oversamp')
pd_train_w_q1_smote = _load_modelset('train_w_q1_smote')

# Train sets without the q1 data, one per resampling strategy
pd_train_wo_q1_wo_resamp = _load_modelset('train_wo_q1_wo_resamp')
pd_train_wo_q1_undersamp = _load_modelset('train_wo_q1_undersamp')
pd_train_wo_q1_oversamp = _load_modelset('train_wo_q1_oversamp')
pd_train_wo_q1_smote = _load_modelset('train_wo_q1_smote')

# Test sets (with and without the q1 data)
pd_test_w_q1 = _load_modelset('test_w_q1')
pd_test_wo_q1 = _load_modelset('test_wo_q1')

In [5]:
# Identifier and label columns; everything else in the test sets is a model feature
NON_FEATURE_COLS = ['raceId', 'driverId', 'constructorId', 'Second']

# Working copies of the test sets (Ids kept) with the label cast to int for scoring
pd_test_w_q1_working = pd_test_w_q1.assign(Second = pd_test_w_q1['Second'].astype(int))
pd_test_wo_q1_working = pd_test_wo_q1.assign(Second = pd_test_wo_q1['Second'].astype(int))

# Feature-only test sets to hand to sklearn
pd_test_w_q1_4_predict = pd_test_w_q1.drop(columns = NON_FEATURE_COLS)
pd_test_wo_q1_4_predict = pd_test_wo_q1.drop(columns = NON_FEATURE_COLS)

## ML Flow Setup

In [7]:
def log_rf(run_name, params, train, test_4_model, test_w_Ids):
  """Fit a Random Forest on the F1 data, score it per race, and log the run in MLflow.

  Within each race, the driver(s) whose predicted probability of the positive
  class equals the race maximum are labelled as the predicted second-place
  finisher (ties all receive 1). Precision of that labelling against the
  `Second` column is printed and logged.

  Logged to MLflow: the parameters, the precision metric, and a feature
  importance CSV. The fitted model itself is NOT logged (see the disabled
  call below).

  Parameters
  ----------
  run_name : str
      Name of the run registered in MLflow.
  params : dict
      Parameters passed to RandomForestClassifier, in the form
      {'parameter': value}. Each entry is also logged as an MLflow param.
  train : pandas.DataFrame
      Training data. `Second` is the target; every other column is a feature.
  test_4_model : pandas.DataFrame
      Test features only (no reference Ids), passed to model.predict_proba().
      Its rows are matched to `test_w_Ids` by position.
  test_w_Ids : pandas.DataFrame
      Test data with `raceId` and `Second`, used to group probabilities by
      race. It is copied internally and left unmodified.

  Returns
  -------
  str
      The UUID of the MLflow run.
  """
  with mlflow.start_run(run_name = run_name) as run:
    # Separate features from the target once; the feature names are reused below
    train_features = train.drop(['Second'], axis = 1)

    # Create model, train it
    model = RandomForestClassifier(**params)
    model.fit(train_features, train[['Second']].values.ravel())

    # Model logging is intentionally disabled:
    # mlflow.sklearn.log_model(model, "random-forest-model")

    # Log params
    for param, value in params.items():
      mlflow.log_param(param, value)

    # Score a copy so the caller's DataFrame does not gain a column on every call
    scored = test_w_Ids.copy()
    scored['predict_proba'] = model.predict_proba(test_4_model)[:, 1]

    # Broadcast each race's highest probability back onto its rows, then
    # classify the driver(s) matching that maximum as 1
    race_max_proba = scored.groupby('raceId')['predict_proba'].transform('max')
    scored['pred'] = scored['predict_proba'].eq(race_max_proba).astype(int)

    # Create, print and log the metric
    precision = precision_score(scored['Second'], scored['pred'])
    print("precision: {}".format(precision))
    mlflow.log_metric("precision", precision)

    # Create feature importance table, most important feature first
    importance = pd.DataFrame({"Feature": train_features.columns,
                               "Importance": model.feature_importances_},
                              columns=["Feature", "Importance"]
                             ).sort_values("Importance", ascending=False)

    # Log importances using a temporary file, deleted when the block exits.
    # NOTE: the second argument of log_artifact is the artifact directory, so
    # the CSV is stored inside a folder named "feature-importance.csv".
    with tempfile.NamedTemporaryFile(prefix="feature-importance-", suffix=".csv") as temp:
      importance.to_csv(temp.name, index=False)
      mlflow.log_artifact(temp.name, "feature-importance.csv")

    return run.info.run_uuid

In [8]:
def log_gb(run_name, params, train, test_4_model, test_w_Ids):
  """Fit a Gradient Boosting model on the F1 data, score it per race, and log the run in MLflow.

  Within each race, the driver(s) whose predicted probability of the positive
  class equals the race maximum are labelled as the predicted second-place
  finisher (ties all receive 1). Precision of that labelling against the
  `Second` column is printed and logged.

  Logged to MLflow: the parameters, the precision metric, and a feature
  importance CSV. The fitted model itself is NOT logged (see the disabled
  call below).

  Parameters
  ----------
  run_name : str
      Name of the run registered in MLflow.
  params : dict
      Parameters passed to GradientBoostingClassifier, in the form
      {'parameter': value}. Each entry is also logged as an MLflow param.
  train : pandas.DataFrame
      Training data. `Second` is the target; every other column is a feature.
  test_4_model : pandas.DataFrame
      Test features only (no reference Ids), passed to model.predict_proba().
      Its rows are matched to `test_w_Ids` by position.
  test_w_Ids : pandas.DataFrame
      Test data with `raceId` and `Second`, used to group probabilities by
      race. It is copied internally and left unmodified.

  Returns
  -------
  str
      The UUID of the MLflow run.
  """
  with mlflow.start_run(run_name = run_name) as run:
    # Separate features from the target once; the feature names are reused below
    train_features = train.drop(['Second'], axis = 1)

    # Create model, train it
    model = GradientBoostingClassifier(**params)
    model.fit(train_features, train[['Second']].values.ravel())

    # Model logging is intentionally disabled:
    # mlflow.sklearn.log_model(model, "gradient-boosting-model")

    # Log params
    for param, value in params.items():
      mlflow.log_param(param, value)

    # Score a copy so the caller's DataFrame does not gain a column on every call
    scored = test_w_Ids.copy()
    scored['predict_proba'] = model.predict_proba(test_4_model)[:, 1]

    # Broadcast each race's highest probability back onto its rows, then
    # classify the driver(s) matching that maximum as 1
    race_max_proba = scored.groupby('raceId')['predict_proba'].transform('max')
    scored['pred'] = scored['predict_proba'].eq(race_max_proba).astype(int)

    # Create, print and log the metric
    precision = precision_score(scored['Second'], scored['pred'])
    print("precision: {}".format(precision))
    mlflow.log_metric("precision", precision)

    # Create feature importance table, most important feature first
    importance = pd.DataFrame({"Feature": train_features.columns,
                               "Importance": model.feature_importances_},
                              columns=["Feature", "Importance"]
                             ).sort_values("Importance", ascending=False)

    # Log importances using a temporary file, deleted when the block exits.
    # NOTE: the second argument of log_artifact is the artifact directory, so
    # the CSV is stored inside a folder named "feature-importance.csv".
    with tempfile.NamedTemporaryFile(prefix="feature-importance-", suffix=".csv") as temp:
      importance.to_csv(temp.name, index=False)
      mlflow.log_artifact(temp.name, "feature-importance.csv")

    return run.info.run_uuid

### Testing MLFlow implementation

In [10]:
# Single Random Forest run to check that the MLflow logging works end to end
params_test_rf = dict(n_estimators = 1000,
                      max_depth = 5,
                      max_features = 'sqrt',
                      random_state = 123)

log_rf(run_name = 'test run rf',
       params = params_test_rf,
       train = pd_train_w_q1_wo_resamp,
       test_4_model = pd_test_w_q1_4_predict,
       test_w_Ids = pd_test_w_q1_working)

In [11]:
# Single Gradient Boosting run to check that the MLflow logging works end to end
params_test_gb = dict(loss = 'deviance',
                      learning_rate = 0.1,
                      n_estimators = 1000,
                      max_depth = 5,
                      max_features = 'sqrt',
                      random_state = 123)

log_gb(run_name = 'test run gb',
       params = params_test_gb,
       train = pd_train_w_q1_wo_resamp,
       test_4_model = pd_test_w_q1_4_predict,
       test_w_Ids = pd_test_w_q1_working)

#### It works!

## Model Runs

### No Resampling, w/ q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [15]:
# RF grid, no resampling, with q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_w_q1_wo_resamp = dict(n_estimators = 1000,
                                    max_depth = depth,
                                    max_features = feature_no,
                                    random_state = 123)
    log_rf(run_name = 'rf_w_q1_wo_resamp',
           params = params_rf_w_q1_wo_resamp,
           train = pd_train_w_q1_wo_resamp,
           test_4_model = pd_test_w_q1_4_predict,
           test_w_Ids = pd_test_w_q1_working)

In [16]:
# GB grid, no resampling, with q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_w_q1_wo_resamp = dict(loss = 'deviance',
                                      learning_rate = learning_rate,
                                      n_estimators = 1000,
                                      max_depth = depth,
                                      max_features = feature_no,
                                      random_state = 123)
      log_gb(run_name = 'gb_w_q1_wo_resamp',
             params = params_gb_w_q1_wo_resamp,
             train = pd_train_w_q1_wo_resamp,
             test_4_model = pd_test_w_q1_4_predict,
             test_w_Ids = pd_test_w_q1_working)

### Undersampling, w/ q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [18]:
# RF grid, undersampled train set, with q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_w_q1_undersamp = dict(n_estimators = 1000,
                                    max_depth = depth,
                                    max_features = feature_no,
                                    random_state = 123)
    log_rf(run_name = 'rf_w_q1_undersamp',
           params = params_rf_w_q1_undersamp,
           train = pd_train_w_q1_undersamp,
           test_4_model = pd_test_w_q1_4_predict,
           test_w_Ids = pd_test_w_q1_working)

In [19]:
# GB grid, undersampled train set, with q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_w_q1_undersamp = dict(loss = 'deviance',
                                      learning_rate = learning_rate,
                                      n_estimators = 1000,
                                      max_depth = depth,
                                      max_features = feature_no,
                                      random_state = 123)
      log_gb(run_name = 'gb_w_q1_undersamp',
             params = params_gb_w_q1_undersamp,
             train = pd_train_w_q1_undersamp,
             test_4_model = pd_test_w_q1_4_predict,
             test_w_Ids = pd_test_w_q1_working)

### Oversampling, w/ q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [21]:
# RF grid, oversampled train set, with q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_w_q1_oversamp = dict(n_estimators = 1000,
                                   max_depth = depth,
                                   max_features = feature_no,
                                   random_state = 123)
    log_rf(run_name = 'rf_w_q1_oversamp',
           params = params_rf_w_q1_oversamp,
           train = pd_train_w_q1_oversamp,
           test_4_model = pd_test_w_q1_4_predict,
           test_w_Ids = pd_test_w_q1_working)

In [22]:
# GB grid, oversampled train set, with q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_w_q1_oversamp = dict(loss = 'deviance',
                                     learning_rate = learning_rate,
                                     n_estimators = 1000,
                                     max_depth = depth,
                                     max_features = feature_no,
                                     random_state = 123)
      log_gb(run_name = 'gb_w_q1_oversamp',
             params = params_gb_w_q1_oversamp,
             train = pd_train_w_q1_oversamp,
             test_4_model = pd_test_w_q1_4_predict,
             test_w_Ids = pd_test_w_q1_working)

### SMOTE, w/ q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [24]:
# RF grid, SMOTE train set, with q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_w_q1_smote = dict(n_estimators = 1000,
                                max_depth = depth,
                                max_features = feature_no,
                                random_state = 123)
    log_rf(run_name = 'rf_w_q1_smote',
           params = params_rf_w_q1_smote,
           train = pd_train_w_q1_smote,
           test_4_model = pd_test_w_q1_4_predict,
           test_w_Ids = pd_test_w_q1_working)

In [25]:
# GB grid, SMOTE train set, with q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_w_q1_smote = dict(loss = 'deviance',
                                  learning_rate = learning_rate,
                                  n_estimators = 1000,
                                  max_depth = depth,
                                  max_features = feature_no,
                                  random_state = 123)
      log_gb(run_name = 'gb_w_q1_smote',
             params = params_gb_w_q1_smote,
             train = pd_train_w_q1_smote,
             test_4_model = pd_test_w_q1_4_predict,
             test_w_Ids = pd_test_w_q1_working)

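I ran the cell above twice; each time the cluster timed out after 2 hours and shut down before the grid finished.
However, MLflow had already logged the runs that completed before the shutdown, so the cell below
resumes the search from where it stopped (`max_depth` 9 onward) instead of starting over.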

In [27]:
# Remainder of the GB / SMOTE / with-q1 grid, depths 9 and 10 only:
# 2 depths x 4 learning rates x 2 max_features = 16 runs
for depth in np.arange(9, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_w_q1_smote = dict(loss = 'deviance',
                                  learning_rate = learning_rate,
                                  n_estimators = 1000,
                                  max_depth = depth,
                                  max_features = feature_no,
                                  random_state = 123)
      log_gb(run_name = 'gb_w_q1_smote',
             params = params_gb_w_q1_smote,
             train = pd_train_w_q1_smote,
             test_4_model = pd_test_w_q1_4_predict,
             test_w_Ids = pd_test_w_q1_working)

### No Resampling, w/o q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [29]:
# RF grid, no resampling, without q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_wo_q1_wo_resamp = dict(n_estimators = 1000,
                                     max_depth = depth,
                                     max_features = feature_no,
                                     random_state = 123)
    log_rf(run_name = 'rf_wo_q1_wo_resamp',
           params = params_rf_wo_q1_wo_resamp,
           train = pd_train_wo_q1_wo_resamp,
           test_4_model = pd_test_wo_q1_4_predict,
           test_w_Ids = pd_test_wo_q1_working)

In [30]:
# GB grid, no resampling, without q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_wo_q1_wo_resamp = dict(loss = 'deviance',
                                       learning_rate = learning_rate,
                                       n_estimators = 1000,
                                       max_depth = depth,
                                       max_features = feature_no,
                                       random_state = 123)
      log_gb(run_name = 'gb_wo_q1_wo_resamp',
             params = params_gb_wo_q1_wo_resamp,
             train = pd_train_wo_q1_wo_resamp,
             test_4_model = pd_test_wo_q1_4_predict,
             test_w_Ids = pd_test_wo_q1_working)

### Undersampling, w/o q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [32]:
# RF grid, undersampled train set, without q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_wo_q1_undersamp = dict(n_estimators = 1000,
                                     max_depth = depth,
                                     max_features = feature_no,
                                     random_state = 123)
    log_rf(run_name = 'rf_wo_q1_undersamp',
           params = params_rf_wo_q1_undersamp,
           train = pd_train_wo_q1_undersamp,
           test_4_model = pd_test_wo_q1_4_predict,
           test_w_Ids = pd_test_wo_q1_working)

In [33]:
# GB grid, undersampled train set, without q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_wo_q1_undersamp = dict(loss = 'deviance',
                                       learning_rate = learning_rate,
                                       n_estimators = 1000,
                                       max_depth = depth,
                                       max_features = feature_no,
                                       random_state = 123)
      log_gb(run_name = 'gb_wo_q1_undersamp',
             params = params_gb_wo_q1_undersamp,
             train = pd_train_wo_q1_undersamp,
             test_4_model = pd_test_wo_q1_4_predict,
             test_w_Ids = pd_test_wo_q1_working)

### Oversampling, w/o q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [35]:
# RF grid, oversampled train set, without q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_wo_q1_oversamp = dict(n_estimators = 1000,
                                    max_depth = depth,
                                    max_features = feature_no,
                                    random_state = 123)
    log_rf(run_name = 'rf_wo_q1_oversamp',
           params = params_rf_wo_q1_oversamp,
           train = pd_train_wo_q1_oversamp,
           test_4_model = pd_test_wo_q1_4_predict,
           test_w_Ids = pd_test_wo_q1_working)

In [36]:
# GB grid, oversampled train set, without q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_wo_q1_oversamp = dict(loss = 'deviance',
                                      learning_rate = learning_rate,
                                      n_estimators = 1000,
                                      max_depth = depth,
                                      max_features = feature_no,
                                      random_state = 123)
      log_gb(run_name = 'gb_wo_q1_oversamp',
             params = params_gb_wo_q1_oversamp,
             train = pd_train_wo_q1_oversamp,
             test_4_model = pd_test_wo_q1_4_predict,
             test_w_Ids = pd_test_wo_q1_working)

### SMOTE, w/o q1 data 

**RF**

* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

**GB**
* loss - 'deviance'
* learning_rate - 0.3, 0.5, 0.7, 0.9
* n_estimators - 1000
* max_depth - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
* max_features - 'sqrt', None
* random_state - 123

In [38]:
# RF grid, SMOTE train set, without q1 data: 10 depths x 2 max_features settings = 20 runs
for depth in np.arange(1, 11):
  for feature_no in ['sqrt', None]:
    params_rf_wo_q1_smote = dict(n_estimators = 1000,
                                 max_depth = depth,
                                 max_features = feature_no,
                                 random_state = 123)
    log_rf(run_name = 'rf_wo_q1_smote',
           params = params_rf_wo_q1_smote,
           train = pd_train_wo_q1_smote,
           test_4_model = pd_test_wo_q1_4_predict,
           test_w_Ids = pd_test_wo_q1_working)

In [39]:
# GB grid, SMOTE train set, without q1 data: 10 depths x 4 learning rates x 2 max_features = 80 runs
for depth in np.arange(1, 11):
  for learning_rate in [0.3, 0.5, 0.7, 0.9]:
    for feature_no in ['sqrt', None]:
      params_gb_wo_q1_smote = dict(loss = 'deviance',
                                   learning_rate = learning_rate,
                                   n_estimators = 1000,
                                   max_depth = depth,
                                   max_features = feature_no,
                                   random_state = 123)
      log_gb(run_name = 'gb_wo_q1_smote',
             params = params_gb_wo_q1_smote,
             train = pd_train_wo_q1_smote,
             test_4_model = pd_test_wo_q1_4_predict,
             test_w_Ids = pd_test_wo_q1_working)