In [2]:
import digitalhub as dh

# Create the 'Document Classification' AIPC project

In [3]:
# Get (or create, if missing) the project that owns every entity defined below.
PROJECT_NAME = "aipc_text_classification"
project = dh.get_or_create_project(PROJECT_NAME)

## Create the raw data artifact

In [31]:
# Register the raw zip archive stored on S3 as a project artifact named "data".
RAW_DATA_PATH = "s3://datalake/text_classification/it.zip"
dataset = project.new_artifact(name="data", kind="artifact", path=RAW_DATA_PATH)

In [32]:
# Display the artifact's store key; the preprocessing run passes this key as its input.
dataset.key

'store://aipc_text_classification/artifact/artifact/data:f40cb68b-a316-4a6f-8d95-06419e8b7647'

In [33]:
# List the project's artifacts to check that the "data" artifact was registered.
project.list_artifacts()

[{'kind': 'artifact', 'metadata': {'project': 'aipc_text_classification', 'name': 'data', 'version': 'f40cb68b-a316-4a6f-8d95-06419e8b7647', 'created': '2024-10-07T14:39:33.081Z', 'updated': '2024-10-07T14:39:33.081Z', 'created_by': 'tenant1userid', 'updated_by': 'tenant1userid', 'embedded': True}, 'spec': {'path': 's3://datalake/text_classification/it.zip'}, 'status': {'state': 'CREATED', 'files': [{'path': 's3://datalake/text_classification/it.zip', 'name': 'it.zip', 'size': 994345776, 'content_type': 'application/zip', 'last_modified': '2024-10-07T14:39:21.000+00:00'}]}, 'user': 'tenant1userid', 'project': 'aipc_text_classification', 'name': 'data', 'id': 'f40cb68b-a316-4a6f-8d95-06419e8b7647', 'key': 'store://aipc_text_classification/artifact/artifact/data:f40cb68b-a316-4a6f-8d95-06419e8b7647'}]

In [34]:
# Download the artifact locally (presumably returns the local path - TODO confirm).
# NOTE(review): the name `zip` shadows Python's builtin `zip()` for the rest of the
# kernel session; consider renaming it once all later usages have been checked.
zip = dataset.download()

## Define the function for pre-processing the dataset

In [38]:
# Define the pre-processing function: the code is pulled from the Git repository
# and `handler` points at the `preprocess_data` entry point inside it.
preprocess_source = "git+https://github.com/tn-aixpa/azionifamiglia-classification.git"
preprocess_handler = "aipc_project.implementation.src.preprocessing.preprocess:preprocess_data"

preprocess_function = project.new_function(
    "preprocess1",
    kind="python",
    python_version="PYTHON3_10",
    code_src=preprocess_source,
    handler=preprocess_handler,
)

In [39]:
# Launch the pre-processing function as a "job" run on the platform.
#
# Fix: the handler arguments must be passed with the `parameters` keyword
# (plural). The original cell used `parameter`, which is not the run-spec key,
# so the values below would not reach the handler as intended.
preprocess_parameters = {
    "langs": "it",
    "data_path": "data/",
    "years": "1968",  # alternative value noted by the author: "all"
    "seeds": "110",
    "add_title": False,
    "title_only": False,
    "max_length": 512,
    "limit_tokenizer": False,
    "add_mt_do": False,
    "get_doc_ids": False,
}

# Pinned package versions requested for the run.
# NOTE(review): `requirements` is supplied at run time here; confirm the SDK
# honours it for "job" runs rather than expecting it on the function definition.
preprocess_requirements = [
    "transformers==4.26.1",
    "scikit-learn==1.2.2",
    "scikit-multilearn==0.2.0",
    "numpy==1.23.4",
    "lsg-converter==0.0.5",
    "sentence-transformers==2.2.2",
    "uvicorn==0.22.0",
    "python-dotenv==1.0.0",
    "compress_fasttext==0.1.3",
    "scipy==1.10.0",
    "nltk==3.8.1",
    "gensim==4.3.0",
    "ufal.udpipe==1.3.0.1",
    "pyyaml==6.0",
    "stop-words==2018.7.23",
    "spacy==3.5.1",
    "PageRange==0.4",
]

preprocess_run = preprocess_function.run(
    action="job",
    # The raw zip artifact is handed over by store key under the name `raw_data_zip`.
    inputs={"raw_data_zip": dataset.key},
    parameters=preprocess_parameters,
    requirements=preprocess_requirements,
)