# KaggleとColabでの利用について

https://zenn.dev/currypurin/scraps/e01410c6529e8e0d3af9

# Setup Colab/Kaggle Notebook Environment

### Confirm platform

In [20]:
def confirm_platform():
    """Identify which hosted notebook service is running this code.

    Returns:
        'colab' when the ``google.colab`` module has been loaded,
        'kaggle' when the ``KAGGLE_URL_BASE`` environment variable is set,
        and ``None`` when neither condition holds.
    """
    import os
    import sys

    # Colab is tested first, so it wins if both markers are present.
    if 'google.colab' in sys.modules:
        return 'colab'
    if 'KAGGLE_URL_BASE' in os.environ:
        return 'kaggle'
    return None

# Detect the hosting service once; the environment-setup cell branches on
# this value ("colab" / "kaggle").
# NOTE(review): the name `platform` would shadow the stdlib `platform` module
# if that module were ever imported in this notebook.
platform = confirm_platform()
print(f"**Info :: Platform is {platform}")

**Info :: Platform is colab


In [21]:
!pwd

/content


### Utility for setup env

In [22]:
def setup_drive_service():
    """Authenticate the Colab user and return a Google Drive v3 API client.

    Returns:
        The object produced by ``build("drive", "v3")``.
    """
    from google.colab import auth
    from googleapiclient.discovery import build

    # Authenticate the current user, then construct the Drive client.
    auth.authenticate_user()
    return build("drive", "v3")


def setup_kaggle_env_on_colab(drive_service=None):
    """Download ``kaggle.json`` from Google Drive into ``/root/.kaggle``.

    Searches Drive for files named ``kaggle.json``, downloads the first match
    to ``/root/.kaggle/kaggle.json`` and restricts that file to owner
    read/write.

    Args:
        drive_service: Drive API client exposing ``files().list`` and
            ``files().get_media``. Required despite the ``None`` default.

    Raises:
        ValueError: if ``drive_service`` is ``None``.
        FileNotFoundError: if the Drive search returns no ``kaggle.json``.
    """
    import io
    import os

    from googleapiclient.http import MediaIoBaseDownload

    if drive_service is None:
        raise ValueError("drive_service is required to download kaggle.json")

    results = (
        drive_service.files()
        .list(q="name = 'kaggle.json'", fields="files(id)")
        .execute()
    )
    matches = results.get("files", [])
    if not matches:
        # Fail with a clear message instead of an IndexError further down.
        raise FileNotFoundError("kaggle.json was not found on Google Drive")

    filename = "/root/.kaggle/kaggle.json"
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    request = drive_service.files().get_media(fileId=matches[0]["id"])
    # The context manager closes the handle even if a chunk download raises.
    with io.FileIO(filename, "wb") as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print("Download %d%%." % int(status.progress() * 100))

    # Octal literal on purpose: decimal 600 is mode 0o1130, which does not
    # give the owner read/write access.
    os.chmod(filename, 0o600)


def setup_github_env_on_colab():

    from google.colab import drive
    drive.mount("./drive")

    !mkdir /root/.ssh
    !chmod 600 /root/.ssh
    !cp drive/MyDrive/.ssh/id_rsa_github /root/.ssh/id_rsa
    !cp drive/MyDrive/.ssh/id_rsa_github.pub /root/.ssh/id_rsa.pub
    !ssh-keyscan -t rsa github.com >> /root/.ssh/known_hosts
    !chmod 600 /root/.ssh/id_rsa

    if not os.path.isdir("./baseline_my_utility"):
        !git clone git@github.com:tfukuda675/baseline_my_utility.git
    else:
        !cd ./baseline_my_utility;git pull;cd ../

    if not os.path.isdir("./baseline_my_ml_models"):
        !git clone git@github.com:tfukuda675/baseline_my_ml_models.git
    else:
        !cd ./baseline_my_ml_models;git pull;cd ../



### run setup env

In [23]:
import os
import sys

# On Colab: authenticate, fetch the Kaggle credentials, check out the helper
# repositories, then make them importable. On Kaggle only the import paths
# under ../usr/lib are added.
if platform == "colab":
    drive_service = setup_drive_service()
    setup_kaggle_env_on_colab(drive_service=drive_service)
    setup_github_env_on_colab()
    sys.path.extend([
        './baseline_my_utility',
        './baseline_my_ml_models',
    ])
elif platform == "kaggle":
    sys.path.extend([
        '../usr/lib/baseline_my_utility',
        '../usr/lib/baseline_my_ml_models',
    ])

Download 100%.
Drive already mounted at ./drive; to attempt to forcibly remount, call drive.mount("./drive", force_remount=True).
mkdir: cannot create directory ‘/root/.ssh’: File exists
# github.com:22 SSH-2.0-babeld-01bfc857
Already up to date.
Already up to date.


# Setup Python Environment

### Install library

In [24]:
!pip install ginza > /dev/null 2>&1
!pip install -U ginza ja-ginza  > /dev/null 2>&1
!python -m spacy download ja_core_news_sm > /dev/null 2>&1
!pip install emoji > /dev/null 2>&1
!pip install transformers[ja] > /dev/null 2>&1
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported replacement and likewise returns the reloaded module.
# Presumably the reload lets pkg_resources see the packages installed by the
# pip commands in this cell without restarting the runtime - TODO confirm.
import importlib
import pkg_resources

importlib.reload(pkg_resources)

<module 'pkg_resources' from '/usr/local/lib/python3.7/dist-packages/pkg_resources/__init__.py'>

### Import library

In [25]:
import os
import sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import gc

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import layers
from tensorflow.keras import backend as K


#      ____________________
#____/    NLP                   \___________________
#
import emoji
import ginza
import spacy
import gensim
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


#      ____________________
#____/    My Libs               \___________________
#

import importlib
import baseline_my_utility
import baseline_my_ml_models

importlib.reload(baseline_my_utility)
importlib.reload(baseline_my_ml_models)
from baseline_my_utility import reduce_mem_usage
from baseline_my_utility import tweet_clean_text, tweet_prepare_emoji, tweet_prepare_pair, tweet_transformer_data, plot_acc, plot_err
from baseline_my_utility import bert_train
from baseline_my_utility import bert_valid


**Info :: CUDA available. Use AMP function.


# Setup input data from kaggle and GCS

### Google Cloud Storage

In [26]:
from google.colab import auth
auth.authenticate_user()

In [27]:
project_id = 'kaggle-circle'
#!gsutil config set project {project_id}   # or !gsutil config

In [28]:
!mkdir tweet_db_daily
!mkdir satelite_image
!gsutil -m cp -r gs://kxtweetana_disk/tweet_db_daily/2022-02/*.sqlite3 ./tweet_db_daily
!gsutil -m cp -r gs://kxtweetana_disk/satelite_image/skysat/*.tif ./satelite_image

mkdir: cannot create directory ‘tweet_db_daily’: File exists
mkdir: cannot create directory ‘satelite_image’: File exists
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-02.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-01.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-05.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-04.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-07.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-10.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-03.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-09.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-06.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_daily/2022-02/tweets_db_2022-02-08.sqlite3...
Copying gs://kxtweetana_disk/tweet_db_

### kaggle dataset

In [29]:
!kaggle kernels output tfukuda675/xpace-bert-binary-classification-with-pseudo-data -p ./xpace-bert-binary-classification-with-pseudo-data

Output file downloaded to ./xpace-bert-binary-classification-with-pseudo-data/bert_model/cl-tohoku/bert-base-japanese-whole-word-masking/config.json
Output file downloaded to ./xpace-bert-binary-classification-with-pseudo-data/bert_model/cl-tohoku/bert-base-japanese-whole-word-masking/pytorch_model.bin
Output file downloaded to ./xpace-bert-binary-classification-with-pseudo-data/model_fold0.pth
Output file downloaded to ./xpace-bert-binary-classification-with-pseudo-data/model_fold1.pth
Output file downloaded to ./xpace-bert-binary-classification-with-pseudo-data/model_fold2.pth
Kernel log downloaded to ./xpace-bert-binary-classification-with-pseudo-data/xpace-bert-binary-classification-with-pseudo-data.log 


# Settings

In [30]:
# Notebook-wide switches.
debug = True
# NOTE(review): presumably enables subsampling to `reduce_data_size` rows;
# the consumer is not visible here - confirm.
reduce_data = False
reduce_data_size = 10_000


# Utility

In [31]:
def change_data_type(df):
    """Placeholder for dtype conversions; currently returns ``df`` unchanged.

    Args:
        df: the object to convert (a DataFrame, judging by the name only).

    Returns:
        The same object that was passed in.
    """
    # TODO: add project-specific dtype changes here.
    return df


def data_clean(df):
    """Placeholder for data cleaning; currently returns ``df`` unchanged.

    Args:
        df: the object to clean (a DataFrame, judging by the name only).

    Returns:
        The same object that was passed in.
    """
    # TODO: add project-specific cleaning steps here.
    return df

def set_seed(seed):
    """Seed NumPy's and TensorFlow's global random state and print the seed.

    Args:
        seed: value passed unchanged to ``np.random.seed`` and
            ``tf.random.set_seed``.
    """
    # NOTE(review): the visible part of this function does not seed Python's
    # built-in `random` module - confirm whether that is intended.
    np.random.seed(seed)
    tf.random.set_seed(seed)
    print(f"Seed set to: {seed}")