In [None]:
# instalação do Weights & Biases
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.39.1-py2.py3-none-any.whl (254 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.1/254.1 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [None]:
# importação das bibliotecas
import logging
import tempfile
import pandas as pd
import os
import wandb
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import random

In [None]:
# Authenticate with W&B from the command line; --relogin forces a new login
# (an API-key prompt) even when credentials are already stored.
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Segregação dos Dados

In [None]:
# Start a new W&B run that tracks this data-splitting step
run = wandb.init(
    entity="flamigos",
    project="cnn_animation",
    job_type="split_data",
)

[34m[1mwandb[0m: Currently logged in as: [33mclaudio-henrique[0m ([33mflamigos[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Fetch the image artifact from W&B; .file() yields the local path of its single file,
# which is then loaded as a NumPy array
artifact = run.use_artifact(
    'flamigos/cnn_animation/img_data:latest',
    type='clean_data',
).file()
data_img = np.load(artifact)

# Fetch the label artifact from W&B and keep only the "animation Type" column
# as a plain array of values
artifact2 = run.use_artifact(
    'flamigos/cnn_animation/label_data:latest',
    type='clean_data',
).file()
labels = pd.read_csv(artifact2)["animation Type"].values

In [None]:
# Split the images and their labels together into train and test sets.
# 20% of the samples are held out for testing, the seed is fixed so the split is
# repeatable, and stratify=labels keeps the class proportions in both splits.
(
    data_img_train,
    data_img_test,
    labels_train,
    labels_test,
) = train_test_split(
    data_img,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Sanity check: report whether the training images and training labels have the same count
if len(data_img_train) != len(labels_train):
  print("quantidade de valores do data_img_train não é igual a quantidade de valores do labels_train")
else:
  print("quantidade de valores do data_img_train é igual a quantidade de valores do labels_train")

valores de treino iguais


In [None]:
# Sanity check: report whether the test images and test labels have the same count
if len(data_img_test) != len(labels_test):
  print("quantidade de valores do data_img_test não é igual a quantidade de valores do labels_test")
else:
  print("quantidade de valores do data_img_test é igual a quantidade de valores do labels_test")

valores de teste iguais


### Verificando os resultados da nossa divisão

In [None]:
# Preview the training split: show up to the first 25 images, one figure each,
# titled with the class name of the matching label.
labels_value = {0:'Anime', 1:'Cartoon'}
# Slicing + zip (instead of indexing over range(25)) stops at the end of the data,
# so a split with fewer than 25 samples no longer raises IndexError.
for image, label in zip(data_img_train[:25], labels_train[:25]):
    plt.figure()
    plt.title(labels_value[label])
    plt.imshow(image)

In [None]:
# Preview the test split: show up to the first 25 images, one figure each,
# titled with the class name of the matching label.
labels_value = {0:'Anime', 1:'Cartoon'}
# Slicing + zip (instead of indexing over range(25)) stops at the end of the data,
# so a test split with fewer than 25 samples no longer raises IndexError.
for image, label in zip(data_img_test[:25], labels_test[:25]):
    plt.figure()
    plt.title(labels_value[label])
    plt.imshow(image)

### Salvando os dados das imagens em um arquivo numpy

In [None]:
# Write both image splits to .npy files in the working directory
np.save('img_data_train.npy', data_img_train)
np.save('img_data_test.npy', data_img_test)

# Read each file back into memory
# NOTE(review): the reloaded arrays are not used by any later cell visible here
loaded_data_train = np.load('img_data_train.npy')
loaded_data_test = np.load('img_data_test.npy')

### Salvando os rótulos em um arquivo CSV

In [None]:
# Write each label split to its own single-column CSV ("animation Type"), without the index
label_files = {
    "label_data_train.csv": labels_train,
    "label_data_test.csv": labels_test,
}
for csv_name, split_labels in label_files.items():
    pd.DataFrame({"animation Type": split_labels}).to_csv(csv_name, index=False)

### Enviando artefatos de treino e teste para o wandb

In [None]:
def _log_split_artifact(name, file_path, description):
    """Create a "split_data" artifact containing one file and log it to the current run.

    Args:
        name: Artifact name on W&B.
        file_path: Path of the local file to attach to the artifact.
        description: Free-text description stored with the artifact.

    Returns:
        Whatever ``run.log_artifact`` returns for the logged artifact.
    """
    split_artifact = wandb.Artifact(name=name,
                                    type="split_data",
                                    description=description)
    # Attach the file to the artifact
    split_artifact.add_file(file_path)
    # Upload through the same `run` handle the rest of the notebook uses
    return run.log_artifact(split_artifact)


# (artifact name, local file, description) for every split file produced above;
# logged in the same order as before: image train/test, then label train/test.
split_artifacts = [
    ("img_data_train", 'img_data_train.npy', "train data of the images"),
    ("img_data_test", 'img_data_test.npy', "test data of the images"),
    ("label_data_train", 'label_data_train.csv', "labels of the train imagens"),
    ("label_data_test", 'label_data_test.csv', "labels of the test imagens"),
]

for artifact_name, artifact_file, artifact_description in split_artifacts:
    _log_split_artifact(artifact_name, artifact_file, artifact_description)

<Artifact label_data_test>

In [None]:
# End the W&B run started at the top of this section
run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))