In [5]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# Imports

In [6]:
%%capture
!pip install transformers
!pip install pytorch-lightning

In [7]:
import gc
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import re

import random
import torch

def set_seed(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed()

import warnings
warnings.simplefilter('ignore')

In [8]:
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [9]:
%cd /content/drive/MyDrive/signate_student_cup_2022
train_path = './data/train.csv'
test_path = './data/test.csv'
submit_path = './data/submit_sample.csv'

/content/drive/MyDrive/signate_student_cup_2022


In [10]:
job_flags = ['Data scientist', 'Machine learning engineer','Software engineer','Consultant']

In [11]:
def text_cleaning(texts):
    clean_texts = []
    num_lines = []
    num_words = []
    words_chunk = []
    for text in texts:
        clean_lines = []
        # print(text, "\n")
        lines = text.split(r"</li>")
        for line in lines:
            line = remove_tag(line)
            # print(line)
            #バックスラッシュをスペースに置き換え
            clean_line = re.sub(r'[\\]', '', line)
            # clean_line = re.sub(r'[/]', ' and ', line)# test
            # print(clean_line)
            clean_line = clean_line.strip()
            # print(clean_line, "\n")
            clean_line = clean_line + ('' if clean_line.endswith('.') else '.')
            if len(clean_line)!=1:
                # print(clean_line, "\n")
                clean_lines.append(clean_line)
        # print(clean_lines)
        clean_texts.append(' '.join(clean_lines))
        
        num_lines.append(len(clean_lines))
        num_word = len(str(clean_lines).split())
        num_words.append(num_word)
        word_chunk = ((num_word-1)//240)+1
        words_chunk.append(word_chunk)

    return clean_texts, num_lines, num_words, words_chunk

def remove_tag(x):
    p = re.compile(r"<[^>]*?>")
    return p.sub('',x)

In [12]:
train_data = pd.read_csv(train_path)
train_data['description'], num_lines, num_words, words_chunk = text_cleaning(train_data['description'])
train_data['num_lines'] = num_lines
train_data['num_words'] = num_words
train_data['words_chunk'] = words_chunk
train_data['jobflag'] = train_data['jobflag']-1
train_data.head(20)

Unnamed: 0,id,description,jobflag,num_lines,num_words,words_chunk
0,0,Develop cutting-edge web applications that per...,2,8,112,1
1,1,"Designs and develops high quality, scalable an...",2,15,221,1
2,2,Functions as a point person for Network Strate...,3,9,190,1
3,3,"Work on the technical design, development, rel...",2,5,79,1
4,4,Quantify the resources required for a task/pro...,3,2,33,1
5,5,Participates in standard business and technica...,2,4,49,1
6,6,"Create project plans, establish timelines, and...",3,5,85,1
7,7,"Facilitate pre-sales initiatives, such as live...",3,11,105,1
8,8,Consolidate dashboards across the team and hel...,0,1,17,1
9,9,Maintain and improve existing predictive model...,0,4,53,1


In [13]:
test_data = pd.read_csv(test_path)
test_data['description'], num_lines, num_words, words_chunk = text_cleaning(test_data['description'])
test_data['num_lines'] = num_lines
test_data['num_words'] = num_words
test_data['words_chunk'] = words_chunk
test_data.head(5)

Unnamed: 0,id,description,num_lines,num_words,words_chunk
0,1516,Building decision-making models and proposing ...,2,30,1
1,1517,Educate homeowners on the benefits of solar en...,5,43,1
2,1518,"Design, develop, document, and implement web a...",8,82,1
3,1519,Apply advanced technical expertise and skills ...,6,81,1
4,1520,Project manage and deliver against our roadmap...,4,35,1


In [14]:
submit_df = pd.read_csv(submit_path)
submit_df.head(5)

Unnamed: 0,1516,1
0,1517,1
1,1518,1
2,1519,1
3,1520,1
4,1521,1


In [24]:
# kfold=8, epoch=10
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from torchmetrics import F1Score

jobflags = train_data['jobflag']
cv_scores = []
f1_func = F1Score(num_classes=4, average="macro")

stratified_kf = StratifiedKFold(n_splits=4) # n_splitは任意の数で
for i, (tr_idx, va_idx) in enumerate(stratified_kf.split(train_data, jobflags)):
  tra_data, val_data = train_data.iloc[tr_idx].reset_index(drop=True), train_data.iloc[va_idx].reset_index(drop=True)
  print(f'# =================== Start {i+1} Fold =================== # ')
  print(f'training_data shape: {tra_data.shape}')
  print(f'validation_data shape: {val_data.shape}')

  count_vect = CountVectorizer()
  X_train_counts = count_vect.fit_transform(tra_data["description"]).todense()
  X_valid_counts = count_vect.transform(val_data["description"]).todense()

  y_train = np.array(tra_data["jobflag"])# np.array()
  y_valid = np.array(val_data["jobflag"])# np.array()

  classifier = GaussianNB() # 実装
  classifier.fit(X_train_counts, y_train)

  y_valid_pred = classifier.predict(X_valid_counts)
  f1_score = f1_func(torch.tensor(y_valid_pred), torch.tensor(y_valid))
  print(f'Valid F1 score = {f1_score}')  
  cv_scores.append(f1_score)

  # del model.pretrained_model, model.hidden, model.multi_hidden, model.classifier
  # del tra_data, val_data, carrers_data_module, model, trainer
  # gc.collect()

print(f'# =================== End of CV =================== # ')
print(f'cv scores: {cv_scores}')
print(f'Final CV = {np.mean(cv_scores)}')


training_data shape: (1137, 6)
validation_data shape: (379, 6)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[3 3 1 ... 0 2 0]
Valid F1 score = 0.5852930545806885
training_data shape: (1137, 6)
validation_data shape: (379, 6)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[2 2 3 ... 0 2 0]
Valid F1 score = 0.5568780899047852
training_data shape: (1137, 6)
validation_data shape: (379, 6)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[2 2 3 ... 0 2 0]
Valid F1 score = 0.551064133644104
training_data shape: (1137, 6)
validation_data shape: (379, 6)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[2 2 3 ... 0 0 0]
Valid F1 score = 0.507513165473938
cv scores: [tensor(0.5853), tensor(0.5569), tensor(0.5