In [None]:
# Text features for the "code + title" experiment.
# A single space is inserted between the two columns: plain `+` would glue the
# last token of the code body onto the first token of the title, and the
# whitespace tokenizer used by the pipeline below would then see them as one
# (meaningless) token.
x2 = df["code_body"] + " " + df["title"]
# One space-separated label string per row.
# NOTE(review): assumes every entry of df["tags"] is an iterable of strings.
y2 = df["tags"].apply(" ".join)

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# Stratify on y2 -- the labels actually being split -- rather than on a `y`
# left over from another section of the notebook, which is either undefined on
# a fresh kernel or describes a different target.
# NOTE(review): stratification needs at least two rows per distinct label
# string; confirm this holds for the joined tag labels.
x2_train, x2_test, y2_train, y2_test = train_test_split(
    x2, y2, test_size=0.2, stratify=y2, random_state=30
)

In [None]:
# Pipeline steps: TF-IDF over whitespace-separated tokens, followed by a
# one-vs-rest multinomial Naive Bayes classifier.
estimators2 = [
    ("tfidf", TfidfVectorizer(tokenizer=str.split)),
    ("clf", OneVsRestClassifier(estimator=MultinomialNB(), n_jobs=1)),
]
# Candidate values sampled by the randomized search. Keys follow the
# "<step>__<param>" convention of scikit-learn pipelines.
parameters2 = dict(
    # Absolute document-frequency cut-offs: 10 and 20.
    tfidf__min_df=np.arange(10, 30, 10),
    # Document-frequency proportions from 0.75 upwards in steps of 0.05.
    tfidf__max_df=np.arange(0.75, 0.9, 0.05),
    # Pure unigrams, pure bigrams or pure trigrams (no mixed ranges).
    tfidf__ngram_range=[(1, 1), (2, 2), (3, 3)],
    # Smoothing strength of the wrapped MultinomialNB, 0.2 upwards in steps of 0.2.
    clf__estimator__alpha=np.arange(0.2, 1, 0.2),
)

In [None]:
# Assemble the pipeline and wrap it in a randomized hyper-parameter search:
# 20 sampled configurations, 3-fold cross-validation, all available cores.
p2 = Pipeline(estimators2)
grid2 = RandomizedSearchCV(
    p2,
    param_distributions=parameters2,
    n_iter=20,
    cv=3,
    n_jobs=-1,
    # Seed the sampler so the same 20 configurations are drawn on every run
    # (same seed as the train/test split above).
    random_state=30,
)

In [None]:
# Run the search on the training split. scikit-learn's fit() returns the fitted
# estimator itself, so `score2` refers to the fitted search object, not a score.
score2 = grid2.fit(X=x2_train, y=y2_train)

In [None]:
# Predict the held-out rows with the fitted search object.
y2_pred = grid2.predict(X=x2_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y2_test, y_pred=y2_pred))

In [None]:
# Confusion matrix on the held-out split.
# The class list is derived from the data instead of the hard-coded
# range(1, 6): y2 holds tag strings, so fixed labels 1..5 neither name the
# classes nor necessarily match their number. Passing the same sorted label
# array to confusion_matrix and to the plot keeps rows/columns and tick labels
# aligned.
labels2 = np.unique(np.concatenate([np.asarray(y2_test), np.asarray(y2_pred)]))
cm = confusion_matrix(y2_test, y2_pred, labels=labels2)
evaluation.plot_confusion_matrix(cm=cm, classes=labels2, normalize=True)

In [None]:
# Summary metrics on the held-out split.
print("Accuracy:", accuracy_score(y2_test, y2_pred))
print("Balanced accuracy:", balanced_accuracy_score(y2_test, y2_pred))
# Label spelling fixed ("Kappa", not "Cappa") and colon added to match the
# two lines above.
print("Cohen's Kappa Coefficient:", cohen_kappa_score(y2_test, y2_pred))

In [None]:
# Multi-label experiment: code body only as input text, tags encoded as a
# binary indicator matrix (one column per distinct tag).
# Note: this rebinds x2 / y2 from the previous experiment.
x2 = df["code_body"]
y2 = MultiLabelBinarizer().fit_transform(
    y=df["tags"].tolist(),
)

In [None]:
# 80/20 train/test split with a fixed seed; no stratification here.
(x2_train, x2_test, y2_train, y2_test) = train_test_split(
    x2,
    y2,
    random_state=30,
    test_size=0.2,
)

In [None]:
# Pipeline steps: TF-IDF over whitespace-separated tokens, followed by a
# one-vs-rest linear model trained with SGD (one binary classifier per tag
# column).
estimators = [
    ("tfidf", TfidfVectorizer(tokenizer=lambda string: string.split())),
    (
        "clf",
        OneVsRestClassifier(
            # SGD shuffles the training data; seed it so that repeated runs of
            # the notebook give the same model and the same CV scores.
            SGDClassifier(n_jobs=-1, random_state=30),
            n_jobs=-1,
        ),
    ),
]
# Candidate values for the randomized search ("<step>__<param>" keys).
parameters = {
    # Document-frequency proportions from 0.01 upwards in steps of 0.01.
    "tfidf__min_df": np.arange(0.01, 0.05, 0.01),
    # Document-frequency proportions from 0.75 upwards in steps of 0.05.
    "tfidf__max_df": np.arange(0.75, 0.9, 0.05),
    # Pure unigrams, pure bigrams or pure trigrams (no mixed ranges).
    "tfidf__ngram_range": [(1, 1), (2, 2), (3, 3)],
    # Regularisation strength of the wrapped SGDClassifier.
    "clf__estimator__alpha": (1e-2, 1e-3),
}

In [None]:
# Assemble the multi-label pipeline and wrap it in a randomized search scored
# by ROC AUC: 20 sampled configurations, 3-fold CV, all available cores.
p2 = Pipeline(estimators)
grid2 = RandomizedSearchCV(
    # Search over the pipeline built on the line above; the original passed
    # `p`, which is not defined in this section (stale kernel state at best).
    p2,
    param_distributions=parameters,
    n_iter=20,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    # Seed the sampler so the same configurations are drawn on every run.
    random_state=30,
)

In [None]:
# Fit the search object created for this experiment (`grid2`); the original
# fitted `grid`, a different object that is not defined in this section.
# fit() returns the fitted estimator itself, so `score2` is the fitted search.
score2 = grid2.fit(x2_train, y2_train)

In [None]:
# Predict this experiment's held-out rows with this experiment's search object.
# The original used `grid` and `x_test`, neither of which belongs to this
# section, so y2_pred would not line up with y2_test.
y2_pred = grid2.predict(x2_test)

In [None]:
# Per-label precision / recall / F1 on the held-out split.
print(classification_report(y_true=y2_test, y_pred=y2_pred))
# Overall accuracy; for multi-label indicator targets scikit-learn reports
# subset accuracy (a row counts only if every label matches).
print(accuracy_score(y_true=y2_test, y_pred=y2_pred))

## Training locally

In [None]:
%%bash
# Append the sklearn model directory to PYTHONPATH so that `trainer` can be
# imported as a top-level package.
# NOTE(review): this treats the current PYTHONPATH value as the project root;
# confirm it holds a single directory.
PYTHONPATH="${PYTHONPATH}:${PYTHONPATH}/src/model/sklearn/"
export PYTHONPATH

# Run the trainer locally with --frac 0.001.
python -m trainer.task \
  --job-dir gs://${BUCKET_NAME}/stackoverflow/sklearn \
  --project_id $PROJECT_ID \
  --frac 0.001

In [None]:
%%bash
# Append the sklearn model directory to PYTHONPATH so that `trainer` can be
# imported as a top-level package.
# NOTE(review): this treats the current PYTHONPATH value as the project root;
# confirm it holds a single directory.
PYTHONPATH="${PYTHONPATH}:${PYTHONPATH}/src/model/sklearn/"
export PYTHONPATH

# Run the trainer locally with explicit hyper-parameter flags.
python -m trainer.task \
  --job-dir gs://${BUCKET_NAME}/stackoverflow/sklearn \
  --project_id $PROJECT_ID \
  --WE_min_df 0.3 \
  --WE_max_df 1.0 \
  --FT_norm 'l2' \
  --M_alpha 0.56218373775482178

## Training using AI Platform training

In [None]:
%%bash

RUNTIME_VERSION="1.14"
PYTHON_VERSION="3.5"
# Timestamped so every submission gets a unique job id. Assigned exactly once,
# before JOB_DIR, so the job id and its output directory always agree.
JOB_NAME="stackoverflow_sklearn_$(date +'%Y%m%d_%H%M%S')"
# Per-job output directory. The original referenced ${JOBNAME} (no
# underscore), which is unset, so every job wrote into the same shared
# ".../sklearn/" prefix.
JOB_DIR="gs://${BUCKET_NAME}/stackoverflow/sklearn/${JOB_NAME}"

# Arguments after the bare "--" are forwarded to trainer.task itself.
# Expansions are quoted so values can never be word-split or glob-expanded.
gcloud ai-platform jobs submit training "$JOB_NAME" \
  --job-dir "$JOB_DIR" \
  --package-path "$DIR_PROJ/src/model/sklearn/trainer" \
  --module-name trainer.task \
  --region us-central1 \
  --runtime-version="$RUNTIME_VERSION" \
  --python-version="$PYTHON_VERSION" \
  -- \
  --project_id "$PROJECT_ID" \
  --frac 0.002

In [None]:
%%bash
# Every %%bash cell runs in its own subprocess, so a JOB_NAME assigned in the
# submission cell is not visible here. Use JOB_NAME when it was exported into
# the kernel environment; otherwise fall back to the most recently created
# AI Platform job of the active project.
JOB_NAME="${JOB_NAME:-$(gcloud ai-platform jobs list --sort-by=~createTime --limit=1 --format='value(jobId)')}"
# Abort with a clear message instead of calling gcloud with an empty job id.
gcloud ai-platform jobs stream-logs "${JOB_NAME:?no AI Platform job found; set JOB_NAME}"

## Training and hyper-parameter tuning using AI Platform Training

In [None]:
%%bash

RUNTIME_VERSION="1.14"
PYTHON_VERSION="3.5"
SCALE_TIER=STANDARD_1
REGION="us-central1"
# Timestamped so every submission gets a unique job id.
JOB_NAME="stackoverflow_sklearn_$(date +'%Y%m%d_%H%M%S')"
# Per-job output directory. The original referenced ${JOBNAME} (no
# underscore), which is unset, so every job wrote into the same shared
# ".../sklearn/" prefix.
JOB_DIR="gs://${BUCKET_NAME}/stackoverflow/sklearn/${JOB_NAME}"
# Hyper-parameter tuning specification passed to gcloud via --config.
HPTUNING_CONFIG="$DIR_PROJ/hp-tuning/sklearn/hyperparam.yaml"
TRAINER_PACKAGE_PATH="$DIR_PROJ/src/model/sklearn/trainer"
MAIN_TRAINER_MODULE="trainer.task"


# Arguments after the bare "--" are forwarded to the trainer module itself.
# Expansions are quoted so values can never be word-split or glob-expanded.
gcloud ai-platform jobs submit training "$JOB_NAME" \
  --job-dir "$JOB_DIR" \
  --package-path "$TRAINER_PACKAGE_PATH" \
  --module-name "$MAIN_TRAINER_MODULE" \
  --region "$REGION" \
  --runtime-version="$RUNTIME_VERSION" \
  --python-version="$PYTHON_VERSION" \
  --scale-tier "$SCALE_TIER" \
  --config "$HPTUNING_CONFIG" \
  -- \
  --project_id "$PROJECT_ID"

# Local Tests

In [3]:
import src.model.sklearn_raphael.trainer.model as model

In [None]:
# Local smoke test: call the trainer module's train_and_evaluate directly.
# NOTE(review): the six arguments are positional and their meaning is not
# visible from this notebook -- presumably the vectorizer / model
# hyper-parameters exposed as CLI flags by trainer.task; confirm against the
# function signature and consider passing them by keyword.
model.train_and_evaluate(0.2, 0.4, 0.75, 20, 'l2', 0.2)