In [None]:
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
from typing import NamedTuple

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from kfp.components import InputPath, OutputPath

import kfp 
from kfp import compiler
import kfp.dsl as dsl
import kfp.notebook
import kfp.components as comp
from kfp.components import func_to_container_op


In [None]:
def get_token(login: str, password: str) -> str:
    """Request an API access token from service-public.nc.

    Authenticates against the ``login-token`` endpoint with HTTP basic auth.

    Args:
        login: Account login sent as the basic-auth user name.
        password: Account password sent as the basic-auth password.

    Returns:
        The value of the ``access_token`` field of the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status
            (for instance when the credentials are rejected).
    """
    # Imports are kept local so the function stays self-contained when it is
    # packaged with func_to_container_op.
    import requests
    from requests.auth import HTTPBasicAuth

    resp = requests.get(
        "https://service-public.nc/api/login-token",
        auth=HTTPBasicAuth(login, password),
        timeout=30,  # without a timeout a stalled connection would block forever
    )
    # Fail with the HTTP status instead of a KeyError/JSON error further down.
    resp.raise_for_status()
    return resp.json()['access_token']

In [None]:
# Turn get_token into a container op; the `requests` package is requested
# for it because the function imports it.
get_token_op = comp.func_to_container_op(
    get_token,
    packages_to_install=["requests"],
)

In [None]:
def download_dossier(token: str, data_path: OutputPath(str)):
    """Download every dossier from the service-public.nc API into a JSON file.

    The listing is paginated: pages are fetched one after another by
    following the ``next`` entry of each response's ``links`` object until
    there is none left.

    Args:
        token: Access token appended to the first request URL.
        data_path: Path of the output file. The collected dossiers are written
            to exactly this path as a JSON list.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Imports are kept local so the function stays self-contained when it is
    # packaged with func_to_container_op.
    import json
    import requests

    dossiers = []
    url = "https://service-public.nc/api/dossiers?access_token=%s" % token
    while url:
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        payload = res.json()  # parse the body once per page
        dossiers.extend(payload['data'])
        # A missing, empty or null "next" link ends the pagination.
        url = (payload.get('links') or {}).get('next')

    # Write to the output path itself: appending a file name to it would
    # create a sibling file and leave the declared output missing.
    with open(data_path, 'w') as f:
        json.dump(dossiers, f)
    
    

In [None]:
# download_dossier only imports `requests` and the stdlib `json` module, so
# nothing else is installed. In particular the PyPI name "sklearn" is a
# deprecated placeholder (the real package is "scikit-learn") and must not be
# listed here.
download_dossier_op = comp.func_to_container_op(download_dossier, packages_to_install=["requests"])

In [None]:
def download_themes(token: str) -> dict:
    """Download all themes from the service-public.nc API.

    The listing is paginated: pages are fetched one after another by
    following the ``next`` entry of each response's ``links`` object until
    there is none left.

    Args:
        token: Access token appended to the first request URL.

    Returns:
        A dict mapping each theme ``id`` to its ``attributes.nom`` value.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Import is kept local so the function stays self-contained when it is
    # packaged with func_to_container_op.
    import requests

    themes = {}
    url = "https://service-public.nc/api/themes?access_token=%s" % token
    while url:
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        payload = res.json()  # parse the body once per page
        for theme in payload['data']:
            themes[theme['id']] = theme['attributes']['nom']
        # A missing, empty or null "next" link ends the pagination.
        url = (payload.get('links') or {}).get('next')
    print(themes)
    return themes

In [None]:
# Turn download_themes into a container op; the `requests` package is
# requested for it because the function imports it.
download_themes_op = comp.func_to_container_op(
    download_themes,
    packages_to_install=["requests"],
)


In [None]:
def merge_dossier_et_themes(themes, source_path: InputPath(str), output_text_path: OutputPath(str)):
    """Flatten the dossiers, one-hot encode their themes and write a train/test split.

    Each dossier becomes one row (id, nom, description, url) followed by one
    indicator column per theme id found in the dossiers. Indicator columns
    whose id is present in ``themes`` are renamed to the theme name.

    Args:
        themes: Mapping of theme id -> theme name, given either as a dict or
            as its text form (JSON, or a Python dict literal). The text form
            is accepted because this parameter is un-annotated and may
            therefore be delivered as a string - TODO confirm against the
            KFP version in use.
        source_path: Path of the JSON file holding the list of dossiers.
        output_text_path: Output location, used as a directory that receives
            ``train.csv`` and ``test.csv``.
    """
    # Imports are kept local so the function stays self-contained when it is
    # packaged with func_to_container_op.
    import ast
    import json
    import os

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MultiLabelBinarizer

    if isinstance(themes, str):
        try:
            themes = json.loads(themes)
        except ValueError:
            # Not JSON: fall back to a Python literal such as "{'12': 'Famille'}".
            themes = ast.literal_eval(themes)
    # Compare ids through str() so that an int/str difference introduced by
    # serialisation does not prevent the lookup.
    theme_names = {str(theme_id): nom for theme_id, nom in themes.items()}

    # json.load needs an open file object, and the input path is the file itself.
    with open(source_path) as f:
        dossiers = json.load(f)

    rows = [
        [
            d['id'],
            d['attributes']['nom'],
            d['attributes']['description'],
            d['attributes']['url'],
            [t['id'] for t in d['relationships']['themes']['data']],
        ]
        for d in dossiers
    ]
    df = pd.DataFrame(rows, columns=["id", "nom", "description", "url", "themes"])

    # One sparse 0/1 column per distinct theme id.
    mlb = MultiLabelBinarizer(sparse_output=True)
    df = df.join(
        pd.DataFrame.sparse.from_spmatrix(
            mlb.fit_transform(df.pop('themes')),
            index=df.index,
            columns=mlb.classes_,
        )
    )

    # Rename theme-id columns to theme names; ids without a known name keep
    # their id as the column label.
    df = df.rename(
        columns={c: theme_names[str(c)] for c in mlb.classes_ if str(c) in theme_names}
    )

    # NOTE: no random_state is given, so the split differs from run to run.
    df_train, df_test = train_test_split(df, test_size=0.2)

    # Both files go inside the output path (used as a directory) rather than
    # next to it, so they are part of the declared output.
    os.makedirs(output_text_path, exist_ok=True)
    df_train.to_csv(os.path.join(output_text_path, 'train.csv'))
    df_test.to_csv(os.path.join(output_text_path, 'test.csv'))


# Rebind the name to the container op (what the bare decorator used to do),
# this time requesting the third-party packages the function body imports.
merge_dossier_et_themes = func_to_container_op(
    merge_dossier_et_themes,
    packages_to_install=["pandas", "scikit-learn"],
)

In [None]:
from configparser import ConfigParser

# The API credentials are not written in the notebook: they are read from the
# [spnc] section of notebook.cfg.
parser = ConfigParser()
_ = parser.read("notebook.cfg")  # return value is deliberately discarded

login, password = (parser.get("spnc", option) for option in ("login", "password"))

In [None]:
@dsl.pipeline(
    name='SPNC recommander',
    description='service-public.nc recommandation system',
)
def spnc_recommander_pipeline():
    # Step 1: obtain an access token with the credentials read from the config.
    token_task = get_token_op(login, password)
    # get_token has a single return value, so it is reachable as `.output`.
    token = token_task.output

    # Steps 2 and 3: both downloads take the token as their only argument.
    themes_task = download_themes_op(token)
    dossiers_task = download_dossier_op(token)

    # Step 4: combine the themes with the dossier file; the latter is
    # referenced by output name through `.outputs['data']`.
    merge_dossier_et_themes(themes_task.output, dossiers_task.outputs['data'])
    
    

In [None]:
# Kubeflow Pipelines client created with its default settings.
client = kfp.Client()

# Submit the pipeline function as a run; the pipeline declares no parameters,
# hence the empty arguments mapping.
client.create_run_from_pipeline_func(
    spnc_recommander_pipeline,
    arguments={},
)