# How to use PyTorch Lightning DataModule
* https://www.youtube.com/watch?v=e47f__x7KSE

In [1]:
!pip install pytorch-lightning

Successfully installed lightning-utilities-0.10.1 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 pytorch-lightning-2.2.1 torchmetrics-1.3.1


In [None]:
!pip install transformers

In [3]:
import pandas as pd

# preprocess

In [4]:
df = pd.read_csv('/content/drive/MyDrive/study_DeepLearning/data/geoemotions/goemotions_1.csv')
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [6]:
# 時間列の修正
df.created_utc = pd.to_datetime(df.created_utc, unit="s")

In [7]:
df.shape

(70000, 37)

In [9]:
# emotionカテゴリ列名のみ抽出
emotion_categories = df.columns[9:]
emotion_categories

Index(['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
       'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
       'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
       'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
       'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [11]:
from transformers import ElectraTokenizerFast as ElectraTokenizer

MODEL_NAME = "google/electra-base-discriminator"

tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

In [16]:
text = df.iloc[2,0]
encoding = tokenizer(text)

print(f'Text:      {text}')
print(f'Tokens:    {encoding.tokens()}')
print(f'Token IDs: {encoding["input_ids"]}')

Text:      You do right, if you don't care then fuck 'em!
Tokens:    ['[CLS]', 'you', 'do', 'right', ',', 'if', 'you', 'don', "'", 't', 'care', 'then', 'fuck', "'", 'em', '!', '[SEP]']
Token IDs: [101, 2017, 2079, 2157, 1010, 2065, 2017, 2123, 1005, 1056, 2729, 2059, 6616, 1005, 7861, 999, 102]


In [17]:
texts = []
emotions = []
for comment_id, group in df.groupby("id"):
  texts.append(group.iloc[0].text)
  emotions.append(group[emotion_categories].sum(axis=0).argmax())

In [19]:
text_df = pd.DataFrame({"text": texts, "emotion": emotions})
text_df.head()

Unnamed: 0,text,emotion
0,Fast as [NAME] will carry me. Seriously uptown...,4
1,You blew it. They played you like a fiddle.,27
2,TL;DR No more Superbowls for [NAME]. Get ready...,9
3,So much time saved. Not.,7
4,Emotes have a ridiculous amount of effort put ...,25


In [21]:
import pickle
import re

with open('/content/drive/MyDrive/study_DeepLearning/data/Emoji_Dict.p', 'rb') as fp:
  Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

In [22]:
Emoji_Dict

{'🥇': ':1st_place_medal:',
 '🥈': ':2nd_place_medal:',
 '🥉': ':3rd_place_medal:',
 '🆎': ':AB_button_(blood_type):',
 '🏧': ':ATM_sign:',
 '🅰': ':A_button_(blood_type):',
 '🇦 🇫': ':Afghanistan:',
 '🇦 🇱': ':Albania:',
 '🇩 🇿': ':Algeria:',
 '🇦 🇸': ':American_Samoa:',
 '🇦 🇩': ':Andorra:',
 '🇦 🇴': ':Angola:',
 '🇦 🇮': ':Anguilla:',
 '🇦 🇶': ':Antarctica:',
 '🇦 🇬': ':Antigua_&_Barbuda:',
 '♒': ':Aquarius:',
 '🇦 🇷': ':Argentina:',
 '♈': ':Aries:',
 '🇦 🇲': ':Armenia:',
 '🇦 🇼': ':Aruba:',
 '🇦 🇨': ':Ascension_Island:',
 '🇦 🇺': ':Australia:',
 '🇦 🇹': ':Austria:',
 '🇦 🇿': ':Azerbaijan:',
 '🔙': ':BACK_arrow:',
 '🅱': ':B_button_(blood_type):',
 '🇧 🇸': ':Bahamas:',
 '🇧 🇭': ':Bahrain:',
 '🇧 🇩': ':Bangladesh:',
 '🇧 🇧': ':Barbados:',
 '🇧 🇾': ':Belarus:',
 '🇧 🇪': ':Belgium:',
 '🇧 🇿': ':Belize:',
 '🇧 🇯': ':Benin:',
 '🇧 🇲': ':Bermuda:',
 '🇧 🇹': ':Bhutan:',
 '🇧 🇴': ':Bolivia:',
 '🇧 🇦': ':Bosnia_&_Herzegovina:',
 '🇧 🇼': ':Botswana:',
 '🇧 🇻': ':Bouvet_Island:',
 '🇧 🇷': ':Brazil:',
 '🇮 🇴': ':British_Indian_Ocean_T