Ensure you have the Google Cloud SDK (gcloud CLI) installed: https://cloud.google.com/sdk/docs/install?hl=pt-br

Run gcloud init

In [3]:
# Install the packages
! pip3 install -r requirements.txt

In [9]:
# Set the project id
# Created via GCP UI
! gcloud config set project {"topic-modelling-fiap-417601"} 

Updated property [core/project].


In [10]:
# Notebook-wide settings. The id and the bucket below were created via the GCP UI.
REGION = "us-central1"
SAC_UNIQUEID = "115532493627132210234"  # Created via GCP UI
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
BUCKET_URI = "gs://tm-fiap-bucket"  # Created via GCP UI

In [4]:
# Imported new csv dataset from here just for 'playing around' purposes since it's been previously
# uploaded to storage bucket via GCP UI (this dataset is not bein published to the git repo)
# to prevent having duplicate data on the repo (same as preprocess_dataset.csv)

IMPORT_FILE = "pre_processed.csv"
!gsutil cp {IMPORT_FILE} {BUCKET_URI}/data/

gcs_source = f"{BUCKET_URI}/data/{IMPORT_FILE}"

Copying file://pre_processed.csv [Content-Type=application/vnd.ms-excel]...
/ [0 files][    0.0 B/ 17.8 MiB]                                                
/ [0 files][320.0 KiB/ 17.8 MiB]                                                
-
\
\ [0 files][  4.4 MiB/ 17.8 MiB]                                                
|
/
/ [0 files][ 15.3 MiB/ 17.8 MiB]                                                
/ [1 files][ 17.8 MiB/ 17.8 MiB]                                                
-

Operation completed over 1 objects/17.8 MiB.                                     


In [11]:
! gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=ECciS04e6RDV5rfs2QAPPBLaeR9zdI&access_type=offline&code_challenge=UY33LF6g5VXmxUt_j0E67_iak8L-bziAQIEOd6FtraQ&code_challenge_method=S256


You are now logged in as [RM349721@fiap.com.br].
Your current project is [topic-modelling-fiap-417601].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [12]:
# Authenticate with a service-account key that is stored base64-encoded on disk.
# NOTE: base64 is a reversible encoding, not encryption. Anyone who can read
# encoded-auth.txt can recover the key, so that file must be kept out of
# version control just like the raw JSON key would be.
import base64
import json

from google.cloud import aiplatform
from google.oauth2 import service_account

# Load the encoded string from the text file (explicit encoding so the read
# does not depend on the platform's default locale)
with open('encoded-auth.txt', 'r', encoding='utf-8') as encoded_file:
    encoded_string = encoded_file.read()

# Decode the string back to JSON
decoded_bytes = base64.b64decode(encoded_string)
decoded_json = json.loads(decoded_bytes.decode('utf-8'))

# Create credentials from the decoded JSON
my_credentials = service_account.Credentials.from_service_account_info(decoded_json)

aiplatform.init(
    # your Google Cloud Project ID or number
    # environment default used if not set
    project='topic-modelling-fiap-417601',

    # the Vertex AI region you will use (REGION is set in the constants cell
    # to "us-central1", the same value that was previously repeated here)
    location=REGION,

    # Google Cloud Storage bucket in same region as location
    # used to stage artifacts (BUCKET_URI is "gs://tm-fiap-bucket")
    staging_bucket=BUCKET_URI,

    # custom google.auth.credentials.Credentials
    # environment default credentials used if not set
    credentials=my_credentials,
)

In [12]:
# Register the uploaded CSV as a tabular dataset in GCP Vertex AI.

dataset = aiplatform.TabularDataset.create(
    gcs_source=gcs_source,
    display_name="my-dataset",
)
ds = dataset  # short alias; the training cells pass `ds` to job.run

# Last expression of the cell: displays the dataset's resource name
ds.resource_name

Creating TabularDataset
Create TabularDataset backing LRO: projects/663156742437/locations/us-central1/datasets/8412333777300226048/operations/1912496057271451648
TabularDataset created. Resource name: projects/663156742437/locations/us-central1/datasets/8412333777300226048
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/663156742437/locations/us-central1/datasets/8412333777300226048')


'projects/663156742437/locations/us-central1/datasets/8412333777300226048'

In [13]:
# First training attempt, using the professor's recommended data split:
# 75% training, 25% test and no validation share.
# Vertex AI rejects this request -- see the 400 InvalidArgument in the cell output.

column_specs = [
    {"categorical": {"column_name": "descricao_reclamacao_processed"}},
]

job = aiplatform.AutoMLTabularTrainingJob(
    display_name="train-ce-tm_fiap-automl-1",
    optimization_prediction_type="classification",
    column_transformations=column_specs,
)

# Submit the training run with the 75/0/25 split
model = job.run(
    dataset=ds,
    target_column="categoria",
    model_display_name="topicmodeling-prediction-model",
    training_fraction_split=0.75,
    validation_fraction_split=0,
    test_fraction_split=0.25,
    disable_early_stopping=False,
)

InvalidArgument: 400 Model training requires each of the training, validation, and test split fractions to be more than 0. The actual splits within the request are 0.75 Training, 0.00 Validation and 0.25 Test.

In [14]:
# Vertex AI refuses to train when the validation split fraction is zero, so the
# split is adjusted to 80/10/10 (train/validation/test) to accommodate that
# requirement.
# NOTE(review): the prediction cell sends a free-text sentence for this column,
# yet it is declared "categorical" here -- confirm this transformation is intended.

column_specs = [
    {"categorical": {"column_name": "descricao_reclamacao_processed"}},
]

job = aiplatform.AutoMLTabularTrainingJob(
    display_name="train-ce-tm_fiap-automl-1",
    optimization_prediction_type="classification",
    column_transformations=column_specs,
)

# This takes about an hour to run
model = job.run(
    dataset=ds,
    target_column="categoria",
    model_display_name="topicmodeling-prediction-model",
    training_fraction_split=0.80,
    validation_fraction_split=0.10,
    test_fraction_split=0.10,
    disable_early_stopping=False,
)

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3169504926065754112?project=663156742437
AutoMLTabularTrainingJob projects/663156742437/locations/us-central1/trainingPipelines/3169504926065754112 current state:
3
AutoMLTabularTrainingJob projects/663156742437/locations/us-central1/trainingPipelines/3169504926065754112 current state:
3
AutoMLTabularTrainingJob projects/663156742437/locations/us-central1/trainingPipelines/3169504926065754112 current state:
3
AutoMLTabularTrainingJob projects/663156742437/locations/us-central1/trainingPipelines/3169504926065754112 current state:
3
AutoMLTabularTrainingJob projects/663156742437/locations/us-central1/trainingPipelines/3169504926065754112 current state:
3
AutoMLTabularTrainingJob projects/663156742437/locations/us-central1/trainingPipelines/3169504926065754112 current state:
3
AutoMLTabularTrainingJob projects/663156742437/locations/us-central1/trainingPipelines/3169504926065754112 current state:
3


In [15]:
# Attempt to deploy the freshly trained model to a Vertex AI endpoint.
# In the recorded run the endpoint is created, but the deployment then stops
# with a 429 ResourceExhausted quota error (see the cell output).

endpoint = model.deploy(machine_type="n1-standard-4")

Creating Endpoint
Create Endpoint backing LRO: projects/663156742437/locations/us-central1/endpoints/8798632894068162560/operations/35972756285685760
Endpoint created. Resource name: projects/663156742437/locations/us-central1/endpoints/8798632894068162560
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/663156742437/locations/us-central1/endpoints/8798632894068162560')
Deploying model to Endpoint : projects/663156742437/locations/us-central1/endpoints/8798632894068162560
Deploy Endpoint model backing LRO: projects/663156742437/locations/us-central1/endpoints/8798632894068162560/operations/1947856351220727808


ResourceExhausted: 429 The following quotas are exceeded: CustomModelServingCPUsPerProjectPerRegion 8: The following quotas are exceeded: CustomModelServingCPUsPerProjectPerRegion

In [15]:
# The trial license of GCP Vertex AI does not let me deploy multiple models,
# and I am not willing to spend money on it. Predictions in this notebook
# therefore go through a model that I created, trained and deployed earlier.

ENDPOINT_ID = "6120385296526737408"  # endpoint of the previously deployed model
endpoint = aiplatform.Endpoint(
    endpoint_name=ENDPOINT_ID,
)

In [17]:
# Send one complaint description to the deployed model and print the raw response.
sample_complaint = {
    "descricao_reclamacao_processed": "nao consigo acessar minha conta corrente"
}
prediction = endpoint.predict(instances=[sample_complaint])

print(prediction)

Prediction(predictions=[{'classes': ['Serviços de conta bancária', 'Cartão de crédito / Cartão pré-pago', 'Roubo / Relatório de disputa', 'Hipotecas / Empréstimos', 'Outros'], 'scores': [0.7445319294929504, 0.1022438183426857, 0.05848507583141327, 0.05643460154533386, 0.03830454126000404]}], deployed_model_id='8464988289642790912', metadata=None, model_version_id='1', model_resource_name='projects/663156742437/locations/us-central1/models/3057030452821884928', explanations=None)
