In [1]:
import google.cloud.bigquery as bigquery
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # conventional seaborn alias
import numpy as np
import importlib

In [2]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
    TfidfTransformer,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import LabelPowerset
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    cohen_kappa_score,
)

In [3]:
import src.utils.evaluation as evaluation
import src.model.sklearn_raphael.trainer.model as sklearn_v1

In [6]:
import importlib

# Re-import the two project modules so that edits made to their source files
# are picked up without restarting the kernel.
importlib.reload(evaluation)
importlib.reload(sklearn_v1)

<module 'src.model.sklearn_raphael.trainer.model' from '/home/raphaelprinz92/proj_NLP_text_classification_with_GCP/src/model/sklearn_raphael/trainer/model.py'>

# Training locally

In [8]:
%%bash
# Extend PYTHONPATH, then run the trainer module locally.
# NOTE(review): the appended entry is derived from ${PYTHONPATH} itself, so it
# only names a real directory when PYTHONPATH holds a single path (presumably
# the project root) — confirm.
export PYTHONPATH=${PYTHONPATH}:${PYTHONPATH}/src/model/sklearn_raphael/

# BUCKET_NAME and PROJECT_ID are read from the environment; this cell does not set them.
python -m trainer.task \
--job-dir gs://${BUCKET_NAME}/stackoverflow/sklearn \
--project_id $PROJECT_ID \
--max_nb_label 1000 \
--frac 0.2

Process is terminated.


In [7]:
%%bash
# Local run of the trainer module, overriding --WE_min_df, --WE_max_df and --FT_norm.
# BUCKET_NAME and PROJECT_ID are read from the environment; this cell does not set them.
export PYTHONPATH=${PYTHONPATH}:${PYTHONPATH}/src/model/sklearn_raphael/

# The final argument carries no trailing backslash: a dangling continuation
# would silently absorb whatever line is added after the command.
python -m trainer.task \
--job-dir gs://${BUCKET_NAME}/stackoverflow/sklearn \
--project_id $PROJECT_ID \
--frac 0.2 \
--WE_min_df 1. \
--WE_max_df 1. \
--FT_norm 'l2'

Process is terminated.


## Training using AI Platform training

In [None]:
%%bash
# Submit the trainer package as an AI Platform training job.
# BUCKET_NAME, DIR_PROJ and PROJECT_ID are read from the environment; this cell does not set them.

RUNTIME_VERSION="1.14"
PYTHON_VERSION="3.5"
# JOB_NAME has to be assigned before JOB_DIR, which embeds it so that every
# submission writes to its own output directory.
JOB_NAME=stackoverflow_sklearn_$(date +"%Y%m%d_%H%M%S")
JOB_DIR="gs://${BUCKET_NAME}/stackoverflow/sklearn/${JOB_NAME}"
CONFIG="$DIR_PROJ/deployment/training/sklearn/custom.yaml"
# NOTE(review): the local runs use src/model/sklearn_raphael/ while this job
# packages src/model/sklearn/trainer — confirm the package path.
gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir $JOB_DIR \
  --package-path $DIR_PROJ/src/model/sklearn/trainer \
  --module-name trainer.task \
  --region us-central1 \
  --runtime-version=$RUNTIME_VERSION \
  --python-version=$PYTHON_VERSION \
  --config $CONFIG \
  -- \
  --project_id $PROJECT_ID \
  --max_nb_label 1000 \
  --frac 0.01

In [None]:
%%bash
# A %%bash cell runs in its own shell process, so a JOB_NAME assigned inside
# another cell is not visible here; it must come from the kernel's environment.
# Abort with a clear message rather than calling gcloud with an empty job name.
gcloud ai-platform jobs stream-logs "${JOB_NAME:?JOB_NAME is not set; export the name of the submitted job first}"

## Training and hyper-parameter tuning using AI Platform training

In [None]:
%%bash
# Submit the trainer package as an AI Platform job driven by the hyper-parameter
# tuning configuration file.
# BUCKET_NAME, DIR_PROJ and PROJECT_ID are read from the environment; this cell does not set them.

RUNTIME_VERSION="1.14"
PYTHON_VERSION="3.5"
REGION="us-central1"
JOB_NAME=stackoverflow_sklearn_$(date +"%Y%m%d_%H%M%S")
# Embed the job name (JOB_NAME, with the underscore) so that every submission
# writes to its own output directory.
JOB_DIR="gs://${BUCKET_NAME}/stackoverflow/sklearn/${JOB_NAME}"
HPTUNING_CONFIG="$DIR_PROJ/deployment/hp-tuning/sklearn/hyperparam.yaml"
TRAINER_PACKAGE_PATH="$DIR_PROJ/src/model/sklearn/trainer"
MAIN_TRAINER_MODULE="trainer.task"

gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir $JOB_DIR \
  --package-path $TRAINER_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE \
  --region $REGION \
  --runtime-version=$RUNTIME_VERSION \
  --python-version=$PYTHON_VERSION \
  --config $HPTUNING_CONFIG \
  -- \
  --project_id $PROJECT_ID \
  --max_nb_label 1000 \
  --frac 0.01

## Testing local CPU and memory usage

In [None]:
import psutil

# psutil exposes three different CPU counts; label each one so the lines can
# be told apart in the output.
print("### CPU (logical)  {}".format(psutil.cpu_count()))
print("### CPU (physical) {}".format(psutil.cpu_count(logical=False)))
# Process.cpu_affinity() is not provided on every platform, so probe for it
# instead of letting the cell die with AttributeError.
current_process = psutil.Process()
if hasattr(current_process, "cpu_affinity"):
    print("### CPU (affinity) {}".format(len(current_process.cpu_affinity())))

mem = psutil.virtual_memory()
print("----> memory  ...")
print(mem)
print("### Memory total     {:.2f} Gb".format(mem.total / 1024 ** 3))
print("### Memory percent   {:.2f} %".format(mem.percent))
# Several of these fields are only reported on some operating systems; print
# the ones the current platform provides and skip the rest.
for field in (
    "available",
    "used",
    "free",
    "active",
    "inactive",
    "buffers",
    "cached",
    "shared",
    "slab",
):
    value = getattr(mem, field, None)
    if value is not None:
        # Left-pad the field name to 10 characters to keep the columns aligned.
        print("### Memory {:<10}{:.2f} Gb".format(field, value / 1024 ** 3))
print(" ")

In [None]:
import pandas as pd

# Five-row toy frame with one integer, one string and one float column.
int_values = list(range(1, 6))
text_values = ["alpha", "beta", "gamma", "delta", "epsilon"]
float_values = [0.25 * step for step in range(5)]
df = pd.DataFrame(
    dict(int_col=int_values, text_col=text_values, float_col=float_values)
)

In [None]:
# Print a concise summary of the frame.
df.info()

In [None]:
# Add up the per-entry byte counts reported by memory_usage().
df.memory_usage().sum()

In [None]:
# NOTE(review): no visible cell in this notebook assigns `log`, so on a fresh
# kernel this raises NameError — confirm where it was meant to come from.
type(log)

In [None]:
import numpy as np

# Back-of-the-envelope estimate: allocate one dense row to measure its size in
# bytes, then scale that figure up to the full number of rows.
row = 1
feature = 15000
test = np.zeros((row, feature), dtype=int)
# Keep the row count an int so "{:,}" renders "31,000,000" rather than "31,000,000.0".
total_row = int(31e6)
print(
    "memory needed for {:,} entries and {:,} features: {:.2f} Gb".format(
        total_row, feature, (total_row * test.nbytes / 1024 ** 3) / row
    )
)

In [None]:
# SQL text that selects every column from the table named in the FROM clause.
query = """
SELECT
  *
FROM
  `nlp-text-classification.stackoverflow.posts_preprocessed_selection_subset`
"""

In [None]:
# Run the query with a default-configured BigQuery client and load the result
# into a DataFrame. This rebinds `df`, replacing the toy frame built earlier.
client = bigquery.Client()
df = client.query(query).to_dataframe()

In [None]:
# Show the dtype of every column in the loaded frame.
df.dtypes

In [None]:
# Look at the "tags" value of one arbitrarily chosen row (index label 54000).
df["tags"][54000]

In [None]:
# Check which Python type a single "tags" value has (parentheses balanced so
# the cell parses).
type(df["tags"][0])

In [None]:
# Store each row's tags as a NumPy array in a new "label" column.
df["label"] = df["tags"].apply(np.asarray)

In [None]:
# Re-check the column dtypes now that "label" has been added.
df.dtypes

In [None]:
# Check which Python type a single "label" value has.
type(df["label"][0])

In [None]:
# First ten entries of the "tags" column.
df["tags"].head(10)

In [None]:
from collections import Counter
from itertools import chain
import operator

# Flatten the per-row tag lists in a single pass. Summing the column instead
# concatenates lists row by row, copying the growing result every time.
tags = list(chain.from_iterable(df["tags"]))
# (tag, count) pairs sorted by ascending count and then reversed, so the most
# frequent tag comes first.
unique_tags = sorted(Counter(tags).items(), key=operator.itemgetter(1))
unique_tags.reverse()
# Names of the ten most frequent tags.
keep_tags = [tag for tag, _ in unique_tags[:10]]

In [None]:
# Imported here so the cell does not depend on another cell having run first.
from collections import Counter
from itertools import chain
import operator

nb_label = 100
# Flatten the per-row tag lists in a single pass. Summing the column instead
# concatenates lists row by row, copying the growing result every time.
tags = list(chain.from_iterable(df["tags"]))
# (tag, count) pairs sorted by ascending count and then reversed, so the most
# frequent tag comes first.
unique_tags = sorted(Counter(tags).items(), key=operator.itemgetter(1))
unique_tags.reverse()
# There cannot be more labels than distinct tags, so cap nb_label at that
# number (not one past it) to keep it equal to the number of tags kept.
max_nb_label = len(unique_tags)
nb_label = min(nb_label, max_nb_label)
keep_tags = [tag for tag, _ in unique_tags[:nb_label]]

In [None]:
# Number of distinct tags (unique_tags holds one (tag, count) pair per tag).
len(unique_tags)

In [None]:
# Tag names selected to be kept.
keep_tags

In [None]:
def build_tag(row, list_tags):
    """Return the tags from ``row`` that also appear in ``list_tags``.

    Args:
        row: iterable of tags belonging to one entry.
        list_tags: collection of tags to keep. Tags must be hashable
            (e.g. strings) because they are looked up in a set.

    Returns:
        A new list holding the kept tags in their original order. ``row``
        itself is left untouched.
    """
    # Build a fresh list instead of deleting while enumerating: removing an
    # element during iteration shifts the remaining items left, so the element
    # right after every deletion would never be examined.
    allowed = set(list_tags)
    return [tag for tag in row if tag in allowed]

In [None]:
# "tags" value of the row with index label 0.
df["tags"][0]

In [None]:
# Try build_tag on a single row (index label 54000) using the kept tags.
build_tag(df["tags"][54000], keep_tags)

In [None]:
# Filter every row's tags down to the kept tags; keep_tags is forwarded to
# build_tag as its second positional argument.
df["tags"] = df["tags"].apply(build_tag, args=(keep_tags,))

In [None]:
# Print the "tags" column as a one-column frame (plain-text rendering).
print(df[["tags"]])

In [None]:
def _first_tag_or_default(tag_list):
    """Return the first tag of ``tag_list``, or "other-tag" when it is empty."""
    if len(tag_list) > 0:
        return tag_list[0]
    return "other-tag"


# One label per row: the first tag, or the fallback label when no tag is left.
df["label"] = df["tags"].apply(_first_tag_or_default)

In [None]:
# Number of distinct labels (missing values, if any, count as one).
df["label"].nunique(dropna=False)

In [None]:
# Number of distinct labels (missing values, if any, count as one).
df["label"].nunique(dropna=False)

In [None]:
# Underlying array of the "tags" column.
df["tags"].values

In [None]:
# Number of rows in the "tags" column.
len(df["tags"])

In [None]:
# Convert the one-column frame holding "tags" to a NumPy array.
df[["tags"]].to_numpy()

In [None]:
from itertools import chain

# Flatten the per-row tag lists in a single pass. Summing the column instead
# concatenates lists row by row, copying the growing result every time.
tags = list(chain.from_iterable(df["tags"]))

In [None]:
# Number of rows in the "tags" column.
print(len(df["tags"]))

In [None]:
# Length of the flattened `tags` collection.
print(len(tags))

In [None]:
from collections import Counter, OrderedDict

# Plain dict mapping each element of `tags` to its number of occurrences.
a = dict(Counter(tags))

In [None]:
import operator

# (key, count) pairs sorted by count, smallest first.
sorted_x = sorted(a.items(), key=operator.itemgetter(1))

In [None]:
# Reverse the list in place; re-running this cell flips the order back again.
sorted_x.reverse()

In [None]:
# Keep only the first element of each pair, in the list's current order.
[name for name, _ in sorted_x]

In [None]:
# NOTE(review): no visible cell in this notebook assigns `list_items`, so on a
# fresh kernel this raises NameError — confirm where it was meant to come from.
list_items

In [None]:
# Show the (key, count) pairs in their current order.
sorted_x

In [None]:
# (key, count) pairs sorted by count, smallest first, using a lambda as the key.
sorted(a.items(), key=lambda x: x[1])

In [None]:
# Keys of `a` ordered by their associated count, smallest first.
sorted(a, key=a.get)

In [None]:
# Four-element scratch list: 1, 2, 3, 4.
row = list(range(1, 5))

In [None]:
# NOTE(review): when `row` is a plain list (as assigned in the previous cell),
# the subscript (1, 4) is a tuple, and lists accept only integers or slices,
# so this statement raises TypeError. If the goal was to drop several
# positions, a slice or a comprehension is needed — confirm the intent.
del row[1, 4]

In [None]:
%env var "5"

In [None]:
%%bash 
# Print the value of the environment variable `var` as seen by the bash subprocess.
echo $var