# `ml`
> Utility functions that can be used for ML jobs and Kaggle.

In [None]:
#|default_exp ml

Reference for kaggle API: https://github.com/Kaggle/kaggle-api

In [None]:
#| hide
from nbdev import show_doc, nbdev_export
from fastcore.test import test_fail

In [None]:
#| export
from __future__ import annotations
from IPython.display import Image, display
from pathlib import Path
from pprint import pprint
from typing import Any, List
from zipfile import ZipFile

import configparser
import datetime as dt
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import shutil
import subprocess

# Working with datasets

In [None]:
#| export
def are_features_consistent(
    df1:pd.DataFrame,                     # Reference set, typically the training set
    df2:pd.DataFrame,                     # Set to compare against, typically the test or inference set
    dependent_variables:list[str] = None, # Column name(s) of the dependent variables, dropped from `df1` before comparing
    raise_error:bool = False,             # True to raise a `ValueError` instead of returning False
)-> bool :                                # True when both sets expose exactly the same columns, False otherwise
    """Check that two datasets share the same feature columns.

    Columns listed in `dependent_variables` are removed from `df1` before the
    comparison; `df2` is always compared with all of its columns.
    """
    # Only df1 is expected to carry the dependent variable(s)
    reference = df1 if dependent_variables is None else df1.drop(dependent_variables, axis=1)

    # Columns present in exactly one of the two sets
    mismatched = set(reference.columns).symmetric_difference(df2.columns)

    if not mismatched:
        return True
    if raise_error:
        raise ValueError(f"Discrepancy between training and test feature set: {mismatched}")
    return False

Training set and test set should have the same features/columns, except for the dependent variable(s). This function tests whether this is the case.

In [None]:
# Demo data: a train and a test frame sharing the same ten feature columns,
# plus a variant of the test frame that is missing the first two columns.
feats = [f"Feature_{idx:02d}" for idx in range(10)]
X_train, X_test = (
    pd.DataFrame(np.random.normal(size=(n_rows, 10)), columns=feats)
    for n_rows in (500, 100)
)
X_test_not_consistant = X_test.iloc[:, 2:]
display(X_train.head(3), X_test.head(3), X_test_not_consistant.head(3))

Unnamed: 0,Feature_00,Feature_01,Feature_02,Feature_03,Feature_04,Feature_05,Feature_06,Feature_07,Feature_08,Feature_09
0,1.49547,-2.135659,0.149497,-0.315122,0.424538,1.121869,1.634374,0.228302,1.68095,-0.614686
1,0.862787,-1.811697,-0.823006,-0.379604,0.013245,0.252675,1.13497,-0.158523,0.948921,-1.772254
2,0.061188,0.239281,0.871334,0.043956,-0.032998,-0.10642,-0.324127,-0.285553,1.883193,0.495051


Unnamed: 0,Feature_00,Feature_01,Feature_02,Feature_03,Feature_04,Feature_05,Feature_06,Feature_07,Feature_08,Feature_09
0,-0.683395,0.004053,0.113029,-0.298751,0.743429,-0.450048,0.180438,-0.996492,-1.555506,0.852494
1,1.642463,-0.532377,-0.052468,-1.155821,-1.127104,-1.024159,-0.446747,-0.344672,0.898792,1.444979
2,0.47992,0.263056,1.67434,-0.183054,-0.290559,0.47266,-0.646435,-0.232078,-0.626567,-1.175488


Unnamed: 0,Feature_02,Feature_03,Feature_04,Feature_05,Feature_06,Feature_07,Feature_08,Feature_09
0,0.113029,-0.298751,0.743429,-0.450048,0.180438,-0.996492,-1.555506,0.852494
1,-0.052468,-1.155821,-1.127104,-1.024159,-0.446747,-0.344672,0.898792,1.444979
2,1.67434,-0.183054,-0.290559,0.47266,-0.646435,-0.232078,-0.626567,-1.175488


Compare all the features/columns

In [None]:
are_features_consistent(X_train, X_test)

True

In [None]:
are_features_consistent(X_train, X_test_not_consistant)

False

`are_features_consistent(X_train, X_test_not_consistant, raise_error=True)` should raise an error instead of returning False

In [None]:
# With raise_error=True the mismatch must surface as an exception whose
# message starts with the expected prefix.
test_fail(
    are_features_consistent,
    msg="Should raise a ValueError",
    contains="Discrepancy between training and test feature set:",
    args=(X_train, X_test_not_consistant),
    kwargs={'raise_error': True},
)

When comparing a training set and an inference set, the training set will have more features because it includes the dependent variables. To test the consistency of the datasets, specify which columns are dependent variables.

For instance, `X_train` has all features, including the two dependent variables `Feature_08` and `Feature_09`.

In [None]:
# Inference set: the training frame without its last two columns
X_inference = X_train.iloc[:, :-2]
display(X_train.head(3), X_inference.head(3))

Unnamed: 0,Feature_00,Feature_01,Feature_02,Feature_03,Feature_04,Feature_05,Feature_06,Feature_07,Feature_08,Feature_09
0,1.49547,-2.135659,0.149497,-0.315122,0.424538,1.121869,1.634374,0.228302,1.68095,-0.614686
1,0.862787,-1.811697,-0.823006,-0.379604,0.013245,0.252675,1.13497,-0.158523,0.948921,-1.772254
2,0.061188,0.239281,0.871334,0.043956,-0.032998,-0.10642,-0.324127,-0.285553,1.883193,0.495051


Unnamed: 0,Feature_00,Feature_01,Feature_02,Feature_03,Feature_04,Feature_05,Feature_06,Feature_07
0,1.49547,-2.135659,0.149497,-0.315122,0.424538,1.121869,1.634374,0.228302
1,0.862787,-1.811697,-0.823006,-0.379604,0.013245,0.252675,1.13497,-0.158523
2,0.061188,0.239281,0.871334,0.043956,-0.032998,-0.10642,-0.324127,-0.285553


In [None]:
are_features_consistent(X_train, X_inference, dependent_variables=['Feature_08', 'Feature_09'])

True

# Kaggle

In [None]:
#| export
def kaggle_setup_colab(path_to_config_file:Path|str = None      # path to the configuration file (e.g. config.cfg)
                      ):
    """Update kaggle API and create security key json file from config file on Google Drive

    Reads `kaggle_username` and `kaggle_key` from the `[kaggle]` section of the
    configuration file, writes them to `/root/.kaggle/kaggle.json` with
    owner-only read/write permissions, then upgrades the `kaggle` package.

    Raises a `ValueError` when no file exists at `path_to_config_file`.
    """
    # NOTE(review): `get_config_value` and `run_cli` are not defined or imported in the
    # visible part of this module -- confirm they are in scope where this is exported.

    # Create API security key file
    path_to_kaggle = Path('/root/.kaggle')
    os.makedirs(path_to_kaggle, exist_ok=True)

    # Validate path_to_config
    if path_to_config_file is None:
        path_to_config_file = Path('/content/gdrive/MyDrive/private-across-accounts/config-api-keys.cfg')
    path_to_config_file = Path(path_to_config_file)
    if not path_to_config_file.is_file():
        raise ValueError(f"No file at {path_to_config_file.absolute()}. Check the path")

    # retrieve configuration, create token and save it
    username = get_config_value('kaggle', 'kaggle_username', path_to_config_file=path_to_config_file)
    key = get_config_value('kaggle', 'kaggle_key', path_to_config_file=path_to_config_file)

    api_token = {"username": username, "key": key}
    with open(path_to_kaggle / 'kaggle.json', 'w') as file:
        # Mode must be octal: decimal 600 is 0o1130, not rw-------.
        # Restrict access before the secret is written to the file.
        os.fchmod(file.fileno(), 0o600)
        json.dump(api_token, file)

    # Update kaggle API software
    run_cli('pip install -Uqq kaggle --upgrade')

#### Technical Background
References: 
[Kaggle API documentation](https://github.com/Kaggle/kaggle-api)

The Kaggle API token must be placed as a JSON file at the following location (Linux/macOS first, then Windows):
```
    ~/.kaggle/kaggle.json
    %HOMEPATH%\.kaggle\kaggle.json
```
To access Kaggle through its API, a security key needs to be placed in the correct location on Colab.

`config.cfg` file must include the following lines:
```
    [kaggle]
    kaggle_username = kaggle_user_name
    kaggle_key = API key provided by kaggle
```

Info on how to get an api key (kaggle.json) [here](https://github.com/Kaggle/kaggle-api#api-credentials)

In [None]:
#| export
def kaggle_list_files(code:str = None,          # code for the kaggle competition or dataset
                      mode:str ='competitions'  # mode: `competitions` or `datasets`
                     ):
    """List all files available in the competition or dataset for the passed code"""
    if code is None:
        print(f"code is None, please provide the code of the kaggle competition or dataset")
        return 'Failed'
    elif mode not in ['competitions', 'datasets']:
        print(f"mode must be either 'competitions' or 'datasets', not {mode}")
        return 'Failed'
    else:
        print(f"Listing the files available for {mode}: <{code}>")
        run_cli(f"kaggle {mode} files {code}")

    if mode == 'competitions':
        print(f"{'=' * 140}")
        print(f"Make sure to set the parameters for <{code}> in next cell:")
        print(f" - kaggle_project_folder_name: string with name of the project folder")
        print(f" - train_files: list of files to place into the <train> folder")
        print(f" - test_files: list of files to place into the <test> folder")
        print(f" - submit_files: list of files to place into the <submit> folder")
        print(f"{'=' * 140}")

In [None]:
#| export
def kaggle_download_competition_files(
    competition_code:str = None,  # code of the kaggle competition
    train_files:list = [],        # names of files to be moved into the train folder
    test_files:list = [],         # names of files to be moved into the test folder
    submit_files:list = [],       # names of files to be moved into the submit folder
    project_folder:str = 'ds'     # name of the project folder, located under `/content`
    ):
    """Download all files for passed competition, unzip them if required, move them to train, test and submit folders

    Everything happens under `/content/{project_folder}`: the competition
    archive is downloaded there, every `*.zip` is extracted and deleted, then
    the listed files are moved into the `train`, `test` and `submit` subfolders.

    Returns the string 'Failed' when `competition_code` is None, None otherwise.
    Raises a `RuntimeError` when a listed file does not have a `.csv` extension.
    """
    if competition_code is None:
        print(f"competition_code is None, please provide the code of the kaggle competition")
        return 'Failed'

    # The default lists are only read here, never mutated
    list_of_datasets = {'train': train_files,
                        'test': test_files,
                        'submit': submit_files}

    # Create the project directory at the absolute path used for the download and
    # the moves, so the function does not depend on the current working directory
    path2datasets = Path(f"/content/{project_folder}")
    path2datasets.mkdir(parents=True, exist_ok=True)

    # download all files from kaggle
    run_cli(f"kaggle competitions download -c {competition_code} -p {path2datasets}")

    print(f"{'=' * 140}")
    print('Downloaded files:')
    for f in [item for item in path2datasets.iterdir() if item.is_file()]:
        print(f" - {f}")
    print(f"{'=' * 140}")

    # Unzip all zipped files; the context manager closes each archive before it is removed
    for f in sorted(path2datasets.glob('*.zip')):
        print(f"Unzipping {f.name}")
        with ZipFile(f) as zip_f:
            zip_f.extractall(path=path2datasets)
        os.remove(f)
    print(f"{'=' * 140}")

    # Move all data files to the correct data folder
    for dataset_folder, files in list_of_datasets.items():
        path2folder = path2datasets / dataset_folder
        path2folder.mkdir(parents=True, exist_ok=True)

        for f in files:
            print(f"Moving {f} to {dataset_folder}")
            p2f = path2datasets / f
            if p2f.suffix == '.csv':
                shutil.move(p2f, path2folder / f)
            else:
                msg = f"Does not support {p2f.name}'s extension {p2f.suffix}"
                raise RuntimeError(msg)

    print(f"{'=' * 140}")
    print('Done loading Kaggle files and moving them to corresponding folders')

# Others

In [None]:
#| export
def fastbook_on_colab():
    """Set up environment to run fastbook notebooks for colab

    Installs `fastbook` and downloads the `utils.py` and `fastbook_utils.py`
    helper modules into the current working directory.
    """
    instructions = ['pip install -Uqq fastbook',
                    'wget -O utils.py https://raw.githubusercontent.com/vtecftwy/fastbook/walk-thru/utils.py',
                    'wget -O fastbook_utils.py https://raw.githubusercontent.com/vtecftwy/fastbook/walk-thru/fastbook_utils.py'
                    ]
    # The list used to be built and then discarded, so nothing was set up.
    # Run each command the same way the Kaggle helpers in this module do.
    # NOTE(review): `run_cli` is not defined in the visible part of this module -- confirm it is in scope.
    for instruction in instructions:
        run_cli(instruction)

Code extracted from fastbook notebook:
```python
# Install fastbook and dependencies
!pip install -Uqq fastbook

# Load utilities and install them
!wget -O utils.py https://raw.githubusercontent.com/vtecftwy/fastbook/walk-thru/utils.py
!wget -O fastbook_utils.py https://raw.githubusercontent.com/vtecftwy/fastbook/walk-thru/fastbook_utils.py

from fastbook_utils import *
from utils import *

# Setup My Drive
setup_book()

# Download images and code required for this notebook
import os
os.makedirs('images', exist_ok=True)
!wget -O images/chapter1_cat_example.jpg https://raw.githubusercontent.com/vtecftwy/fastai-course-v4/master/nbs/images/chapter1_cat_example.jpg
!wget -O images/cat-01.jpg https://raw.githubusercontent.com/vtecftwy/fastai-course-v4/walk-thru/nbs/images/cat-01.jpg
!wget -O images/cat-02.jpg https://raw.githubusercontent.com/vtecftwy/fastai-course-v4/walk-thru/nbs/images/cat-02.jpg
!wget -O images/dog-01.jpg https://raw.githubusercontent.com/vtecftwy/fastai-course-v4/walk-thru/nbs/images/dog-01.jpg
!wget -O images/dog-02.jpg https://raw.githubusercontent.com/vtecftwy/fastai-course-v4/walk-thru/nbs/images/dog-01.jpg
```

In [None]:
#| hide
nbdev_export()