In [1]:
!nvidia-smi

Fri Aug 19 07:13:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports

In [2]:
%%capture
!pip install transformers
!pip install pytorch-lightning

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import re

import random
import torch

def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed()

import warnings
warnings.simplefilter('ignore')

import gc

In [4]:
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/signate_student_cup_2022
train_path = './data/train.csv'
test_path = './data/test.csv'
submit_path = './data/submit_sample.csv'

/content/drive/MyDrive/signate_student_cup_2022


In [6]:
job_flags = ['Data scientist', 'Machine learning engineer','Software engineer','Consultant']

In [7]:
def text_cleaning(texts):
    clean_texts = []
    num_lines = []
    num_words = []
    words_chunk = []
    for text in texts:
        clean_lines = []
        # print(text, "\n")
        lines = text.split(r"</li>")
        for line in lines:
            line = remove_tag(line)
            # print(line)
            #バックスラッシュをスペースに置き換え
            clean_line = re.sub(r'[\\]', '', line)
            # clean_line = re.sub(r'[/]', ' and ', line)# test
            # print(clean_line)
            clean_line = clean_line.strip()
            # print(clean_line, "\n")
            clean_line = clean_line + ('' if clean_line.endswith('.') else '.')
            if len(clean_line)!=1:
                # print(clean_line, "\n")
                clean_lines.append(clean_line)
        # print(clean_lines)
        clean_texts.append(' '.join(clean_lines))
        
        num_lines.append(len(clean_lines))
        num_word = len(str(clean_lines).split())
        num_words.append(num_word)
        word_chunk = ((num_word-1)//240)+1
        words_chunk.append(word_chunk)

    return clean_texts, num_lines, num_words, words_chunk

def remove_tag(x):
    p = re.compile(r"<[^>]*?>")
    return p.sub('',x)

In [8]:
train_data = pd.read_csv(train_path)
train_data['description'], num_lines, num_words, words_chunk = text_cleaning(train_data['description'])
train_data['num_lines'] = num_lines
train_data['num_words'] = num_words
train_data['words_chunk'] = words_chunk
train_data['jobflag'] = train_data['jobflag']-1
train_data.head(20)

Unnamed: 0,id,description,jobflag,num_lines,num_words,words_chunk
0,0,Develop cutting-edge web applications that per...,2,8,112,1
1,1,"Designs and develops high quality, scalable an...",2,15,221,1
2,2,Functions as a point person for Network Strate...,3,9,190,1
3,3,"Work on the technical design, development, rel...",2,5,79,1
4,4,Quantify the resources required for a task/pro...,3,2,33,1
5,5,Participates in standard business and technica...,2,4,49,1
6,6,"Create project plans, establish timelines, and...",3,5,85,1
7,7,"Facilitate pre-sales initiatives, such as live...",3,11,105,1
8,8,Consolidate dashboards across the team and hel...,0,1,17,1
9,9,Maintain and improve existing predictive model...,0,4,53,1


In [9]:
test_data = pd.read_csv(test_path)
test_data['description'], num_lines, num_words, words_chunk = text_cleaning(test_data['description'])
test_data['num_lines'] = num_lines
test_data['num_words'] = num_words
test_data['words_chunk'] = words_chunk
test_data.head(5)

Unnamed: 0,id,description,num_lines,num_words,words_chunk
0,1516,Building decision-making models and proposing ...,2,30,1
1,1517,Educate homeowners on the benefits of solar en...,5,43,1
2,1518,"Design, develop, document, and implement web a...",8,82,1
3,1519,Apply advanced technical expertise and skills ...,6,81,1
4,1520,Project manage and deliver against our roadmap...,4,35,1


In [10]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 7.8 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [11]:
from transformers import pipeline, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-50-one-to-many-mmt')
mbart_translator = pipeline('translation',
                            model='facebook/mbart-large-50-one-to-many-mmt',
                            src_lang='en_XX', tgt_lang='ja_XX', device=0)

deberta_tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-base')
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Downloading tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [17]:
# MLE descriptionの特徴について 何故F1が低いのか
pd.set_option('display.max_rows', 100)
mle_data = train_data[train_data["jobflag"]==1].reset_index(drop=True)
for r in range(len(mle_data)):
    print(mle_data.loc[r, "description"])
    print(mbart_translator(mle_data.loc[r, "description"])[0]["translation_text"])
    deberta_tokens = deberta_tokenizer.convert_ids_to_tokens(deberta_tokenizer(mle_data.loc[r, "description"]).input_ids)
    roberta_tokens = roberta_tokenizer.convert_ids_to_tokens(roberta_tokenizer(mle_data.loc[r, "description"]).input_ids)
    print(f'deberta_tokens: {deberta_tokens}')
    print(f'roberta_tokens: {roberta_tokens} \n')

Optimize deep learning frameworks like TensorFlow, PyTorch, etc. on AMD GPUs in upstream open-source repositories. Collaborate and interact with internal GPU library teams to analyze and optimize training and inference for deep learning. Work in a distributed computing setting to optimize for both scale-up (multi-GPU) and scale-out (multi-node) systems. Work with cutting-edge compiler technologies. Optimize the entire deep learning pipeline including graph compiler integration.
TensorFlow, PyTorch などの深層学習フレームワークをアップストリームのオープンソースリポジトリの AMD GPU 上で最適化します. 深層学習のためのトレーニングと推論を分析し、最適化するために内部 GPU ライブラリチームと協力し、相互作用します. スケールアップ (マルチGPU) とスケールアウト (マルチノード) システムの両方を最適化するために分散コンピューティング設定で作業します. 最先端のコンパイラテクノロジーで作業します. グラフコンパイラの統合を含む全体の深層学習パイプラインを最適化します.
deberta_tokens: ['[CLS]', 'Opt', 'imize', 'Ġdeep', 'Ġlearning', 'Ġframeworks', 'Ġlike', 'ĠT', 'ensor', 'Flow', ',', 'ĠPy', 'Tor', 'ch', ',', 'Ġetc', '.', 'Ġon', 'ĠAMD', 'ĠGPUs', 'Ġin', 'Ġupstream', 'Ġopen', '-', 'source', 'Ġrepositories', '.', 'ĠColla

Your input_length: 328 is bigger than 0.9 * max_length: 200. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


deberta_tokens: ['[CLS]', 'Coll', 'abor', 'ate', 'Ġwith', 'Ġthe', 'Ġapplication', 'Ġdevelopment', 'Ġteam', 'Ġto', 'Ġintegrate', 'Ġthe', 'Ġcomputer', 'Ġvision', 'Ġmodels', 'Ġwith', 'Ġthe', 'Ġexisting', 'Ġbackend', 'Ġsystems', '.', 'ĠBuild', 'Ġmonitoring', 'Ġand', 'Ġevaluation', 'Ġtools', 'Ġfor', 'Ġreal', '-', 'time', 'Ġoptimization', 'Ġof', 'Ġthe', 'Ġapplication', 'Ġusing', 'Ġhardware', 'Ġacceler', 'ators', '.', 'ĠInsp', 'ire', 'Ġthe', 'Ġentire', 'Ġteam', 'Ġ(', 'including', 'Ġyour', 'Ġcross', '-', 'functional', ')', 'Ġpartners', 'Ġby', 'Ġbringing', 'Ġnew', 'Ġideas', 'Ġto', 'Ġthe', 'Ġtable', '.', '[SEP]']
roberta_tokens: ['<s>', 'Coll', 'abor', 'ate', 'Ġwith', 'Ġthe', 'Ġapplication', 'Ġdevelopment', 'Ġteam', 'Ġto', 'Ġintegrate', 'Ġthe', 'Ġcomputer', 'Ġvision', 'Ġmodels', 'Ġwith', 'Ġthe', 'Ġexisting', 'Ġbackend', 'Ġsystems', '.', 'ĠBuild', 'Ġmonitoring', 'Ġand', 'Ġevaluation', 'Ġtools', 'Ġfor', 'Ġreal', '-', 'time', 'Ġoptimization', 'Ġof', 'Ġthe', 'Ġapplication', 'Ġusing', 'Ġhardware', 'Ġ

Your input_length: 422 is bigger than 0.9 * max_length: 200. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


deberta_tokens: ['[CLS]', 'Development', 'Ġof', 'Ġoptimization', 'Ġalgorithms', 'Ġfor', 'ĠML', 'Ġoperators', '/', 'l', 'ayers', 'Ġfor', 'Ġthe', 'ĠQualcomm', 'ĠAI', 'ĠSW', 'Ġstack', '.', 'ĠDevelopment', 'Ġof', 'ĠAI', 'ĠSW', 'Ġstack', 'Ġframework', 'Ġenhancements', 'Ġfor', 'Ġoptimal', 'Ġresource', 'Ġusage', 'Ġwhile', 'Ġrunning', 'Ġa', 'Ġneural', 'Ġnetwork', 'Ġon', 'ĠQualcomm', 'Ġhardware', '.', 'ĠEval', 'uating', 'Ġand', 'Ġoptimizing', 'Ġneural', 'Ġnetworks', 'Ġruntime', 'Ġperformance', 'Ġand', 'Ġaccuracy', '.', 'ĠWorking', 'Ġwith', 'Ġcustomer', 'Ġteams', 'Ġto', 'Ġenable', 'Ġstate', 'Ġof', 'Ġthe', 'Ġart', 'Ġnetwork', 'Ġmodels', 'Ġand', 'Ġnew', 'ĠAI', 'ĠSW', 'Ġfeatures', 'Ġto', 'Ġmeet', 'Ġcustomer', 'Ġuse', '-', 'cases', '.', 'ĠCollabor', 'ating', 'Ġwith', 'ĠAI', 'ĠHardware', 'Ġand', 'Ġarchitecture', 'Ġteams', 'Ġto', 'Ġcontinuously', 'Ġimprove', 'Ġour', 'ĠAI', 'Ġsolution', '.', '[SEP]']
roberta_tokens: ['<s>', 'Development', 'Ġof', 'Ġoptimization', 'Ġalgorithms', 'Ġfor', 'ĠML', 'Ġoperator

Your input_length: 197 is bigger than 0.9 * max_length: 200. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


deberta_tokens: ['[CLS]', 'You', 'Ġwill', 'Ġbe', 'Ġanalyzing', 'Ġresult', 'Ġsets', 'Ġto', 'Ġidentify', 'Ġanomalies', 'Ġand', 'Ġhistorical', 'Ġnorms', 'Ġand', 'Ġorder', 'Ġto', 'Ġcontinuously', 'Ġimprove', 'Ġthe', 'Ġquality', 'Ġof', 'Ġthe', 'Ġmodel', "'s", 'Ġoutput', 'Ġand', 'Ġtheir', 'Ġusefulness', 'Ġin', 'Ġsolving', 'Ġreal', 'Ġlife', 'Ġbusiness', 'Ġproblems', '.', '[SEP]']
roberta_tokens: ['<s>', 'You', 'Ġwill', 'Ġbe', 'Ġanalyzing', 'Ġresult', 'Ġsets', 'Ġto', 'Ġidentify', 'Ġanomalies', 'Ġand', 'Ġhistorical', 'Ġnorms', 'Ġand', 'Ġorder', 'Ġto', 'Ġcontinuously', 'Ġimprove', 'Ġthe', 'Ġquality', 'Ġof', 'Ġthe', 'Ġmodel', "'s", 'Ġoutput', 'Ġand', 'Ġtheir', 'Ġusefulness', 'Ġin', 'Ġsolving', 'Ġreal', 'Ġlife', 'Ġbusiness', 'Ġproblems', '.', '</s>'] 

You will monitor deployed models for model drift and performance, support model retraining and provide incident management support. You will monitor deployed models for model drift and performance, support model retraining and provide incident manag