# Environemnt

In [53]:
import google.cloud.bigquery as bigquery
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as snsn
import numpy as np
import importlib
import subprocess

In [54]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    TfidfTransformer,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import LabelPowerset
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    cohen_kappa_score,
)

In [55]:
import src.utils.evaluation as evaluation
import src.model.sklearn_raphael.trainer.model as sklearn_v1

In [57]:
import importlib

importlib.reload(evaluation)
importlib.reload(sklearn_v1)

<module 'src.model.sklearn_raphael.trainer.model' from '/home/raphaelprinz92/proj_NLP_text_classification_with_GCP/src/model/sklearn_raphael/trainer/model.py'>

# Local Training

In [13]:
%%bash
export PYTHONPATH=${PYTHONPATH}:${PYTHONPATH}/src/model/sklearn_raphael/
    
python -m trainer.task \
--job-dir gs://${BUCKET_NAME}/stackoverflow/sklearn \
--project_id $PROJECT_ID \
--frac 0.05 \
--WE_min_df 1. \
--WE_max_df 1.

Process is terminated.


# Local Training (Fixed)

In [32]:
bucket_name = ! echo $BUCKET_NAME
project_id = ! echo $PROJECT_ID

In [39]:
command = '''python -m trainer.task \
--job-dir gs://${{{bucket_name}}}/stackoverflow/sklearn \
--project_id ${project_id} \
--frac 0.05 \
--WE_min_df 1. \
--WE_max_df 1.'''.format(bucket_name=bucket_name[0], project_id=project_id[0])

In [51]:
process = subprocess.call(command, shell=True)

In [52]:
process

1

# Training on the AI Platform (No Hyperparameter Tuning)

In [58]:
%%bash

RUNTIME_VERSION="1.14"
PYTHON_VERSION="3.5"
JOB_DIR="gs://${BUCKET_NAME}/stackoverflow/sklearn/${JOBNAME}"
JOB_NAME=stackoverflow_sklearn_$(date +"%Y%m%d_%H%M%S")
CONFIG="$DIR_PROJ/deployment/training/sklearn/custom.yaml"
gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir $JOB_DIR \
  --package-path $DIR_PROJ/src/model/sklearn/trainer \
  --module-name trainer.task \
  --region us-central1 \
  --runtime-version=$RUNTIME_VERSION \
  --python-version=$PYTHON_VERSION \
  --config $CONFIG \
  -- \
  --project_id $PROJECT_ID \
  --max_nb_label 1000 \
  --frac 0.05

jobId: stackoverflow_sklearn_20191217_091908
state: QUEUED


Job [stackoverflow_sklearn_20191217_091908] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe stackoverflow_sklearn_20191217_091908

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs stackoverflow_sklearn_20191217_091908


In [62]:
%%bash
gcloud ai-platform jobs stream-logs $JOB_NAME

ERROR: (gcloud.ai-platform.jobs.stream-logs) argument JOB: Must be specified.
Usage: gcloud ai-platform jobs stream-logs JOB [optional flags]
  optional flags may be  --allow-multiline-logs | --help | --polling-interval |
                         --task-name

For detailed information on this command and its flags, run:
  gcloud ai-platform jobs stream-logs --help


CalledProcessError: Command 'b'gcloud ai-platform jobs stream-logs $JOB_NAME\n'' returned non-zero exit status 2.

# Training on the AI Platform (Hyperparameter Tuning)

In [None]:
%%bash

RUNTIME_VERSION="1.14"
PYTHON_VERSION="3.5"
REGION="us-central1"
JOB_NAME=stackoverflow_sklearn_$(date +"%Y%m%d_%H%M%S")
JOB_DIR="gs://${BUCKET_NAME}/stackoverflow/sklearn/${JOBNAME}"
HPTUNING_CONFIG="$DIR_PROJ/deployment/hp-tuning/sklearn/hyperparam.yaml"
TRAINER_PACKAGE_PATH="$DIR_PROJ/src/model/sklearn/trainer"
MAIN_TRAINER_MODULE="trainer.task"

gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir $JOB_DIR \
  --package-path $TRAINER_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE \
  --region $REGION \
  --runtime-version=$RUNTIME_VERSION \
  --python-version=$PYTHON_VERSION \
  --config $HPTUNING_CONFIG \
  -- \
  --project_id $PROJECT_ID \
  --max_nb_label 1000 \
  --frac 0.01