# Japanese Finance – DimASR using XLM‑RoBERTa (Python 3.13 Compatible)


In [1]:

import os, json, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA available:', torch.cuda.is_available())

TRAIN='jpn_finance_train_task1.jsonl'
DEV='jpn_finance_dev_task1.jsonl'


CUDA available: False


In [2]:

def read_jsonl(path):
    data=[]
    with open(path,'r',encoding='utf8') as f:
        for line in f:
            line=line.strip()
            if line:
                data.append(json.loads(line))
    print(path,'->',len(data),'records')
    return data

train_json=read_jsonl(TRAIN)
dev_json=read_jsonl(DEV)


jpn_finance_train_task1.jsonl -> 1024 records
jpn_finance_dev_task1.jsonl -> 200 records


In [3]:

def build_train_df(data):
    rows=[]
    for ex in data:
        for p in ex['Aspect_VA']:
            v,a=p['VA'].split('#')
            rows.append({
                'ID':ex['ID'],
                'Text':ex['Text'],
                'Aspect':p['Aspect'],
                'valence':float(v),
                'arousal':float(a)
            })
    print('TRAIN rows:',len(rows))
    return pd.DataFrame(rows)

def build_dev_df(data):
    rows=[]
    for ex in data:
        for asp in ex['Aspect']:
            rows.append({'ID':ex['ID'],'Text':ex['Text'],'Aspect':asp})
    print('DEV rows:',len(rows))
    return pd.DataFrame(rows)

train_df=build_train_df(train_json)
dev_df=build_dev_df(dev_json)

train_df.head(), dev_df.head()


TRAIN rows: 1672
DEV rows: 319


(                          ID  \
 0  finance_aspect_va_train_1   
 1  finance_aspect_va_train_2   
 2  finance_aspect_va_train_3   
 3  finance_aspect_va_train_4   
 4  finance_aspect_va_train_5   
 
                                                 Text Aspect  valence  arousal  
 0  平成27年４月の火災により生産を休止していた苫小牧第一きのこセンターが、工場を再建し、平成2...  ブナシメジ     6.00     5.00  
 1  また、改修のため一時生産を休止しておりました広川きのこセンターにおきまして、平成28年９月上...    きのこ     6.00     5.00  
 2      春から夏にかけましては個人消費の低迷などにより、きのこの価格は厳しい状況で推移いたしました    きのこ     3.00     6.00  
 3  反面、秋から冬にかけましては天候不順などによる野菜価格の高騰により、きのこの価格は堅調に推移...    きのこ     5.75     3.17  
 4      海外きのこ事業におきましては、各子会社が稼働率を高めたことにより、生産量は増加いたしました   各子会社     6.00     5.00  ,
                         ID                                               Text  \
 0  finance_aspect_va_dev_1  2025年４月１日から2025年５月31日までの間に、新株予約権の行使により、発行済株式総数...   
 1  finance_aspect_va_dev_1  2025年４月１日から2025年５月31日までの間に、新株予約権の行使により、発行済株式総数...   
 2  finance_aspect_va_dev_1  2025年４月１日から2025年５月31日までの間に、新株予約権の行使により

In [4]:

MODEL='xlm-roberta-base'
tokenizer=AutoTokenizer.from_pretrained(MODEL)

def combine(t,a): return f"{t} [ASP] {a}"

class DS(Dataset):
    def __init__(self,df,train=True):
        self.df=df; self.train=train
    def __len__(self): return len(self.df)
    def __getitem__(self,idx):
        r=self.df.iloc[idx]
        enc=tokenizer(combine(r['Text'],r['Aspect']),truncation=True,
                      max_length=128,return_tensors='pt')
        out={k:v.squeeze(0) for k,v in enc.items()}
        out['ID']=r['ID']; out['Aspect']=r['Aspect']
        if self.train:
            out['labels']=torch.tensor([r['valence'],r['arousal']],
                                       dtype=torch.float32)
        return out

train_ds=DS(train_df,True)
dev_ds=DS(dev_df,False)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [5]:

collator=DataCollatorWithPadding(tokenizer)

def collate(batch):
    IDs=[x['ID'] for x in batch]
    ASP=[x['Aspect'] for x in batch]
    for x in batch:
        x.pop('ID'); x.pop('Aspect')
    pad=collator(batch)
    pad['ID']=IDs; pad['Aspect']=ASP
    return pad

train_loader=DataLoader(train_ds,batch_size=8,shuffle=True,collate_fn=collate)
dev_loader=DataLoader(dev_ds,batch_size=16,shuffle=False,collate_fn=collate)

len(train_loader), len(dev_loader)


(209, 20)

In [6]:

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.base=AutoModel.from_pretrained(MODEL)
        hidden=self.base.config.hidden_size
        self.reg=nn.Linear(hidden,2)
    def forward(self,i,m):
        out=self.base(input_ids=i,attention_mask=m)
        cls=out.last_hidden_state[:,0]
        return self.reg(cls)

model=Model().to(device)
opt=torch.optim.AdamW(model.parameters(),lr=2e-5)
loss_fn=nn.MSELoss()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [7]:

EPOCHS=4
for ep in range(EPOCHS):
    model.train(); tot=0
    for b in tqdm(train_loader,desc=f'Epoch {ep+1}'):
        ids=b['input_ids'].to(device)
        mask=b['attention_mask'].to(device)
        y=b['labels'].to(device)

        pred=model(ids,mask)
        loss=loss_fn(pred,y)

        opt.zero_grad(); loss.backward(); opt.step()
        tot+=loss.item()
    print('Epoch',ep+1,'Loss =', tot/len(train_loader))


Epoch 1:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 1 Loss = 2.5934260933878317


Epoch 2:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 2 Loss = 1.01426491840034


Epoch 3:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 3 Loss = 0.8908519041880466


Epoch 4:   0%|          | 0/209 [00:00<?, ?it/s]

Epoch 4 Loss = 0.6186575140251497


In [8]:

model.eval()
preds=[]
with torch.no_grad():
    for b in tqdm(dev_loader,desc='Inference'):
        ids=b['input_ids'].to(device)
        mask=b['attention_mask'].to(device)
        logits=model(ids,mask).cpu().numpy()

        for i,(ID,A) in enumerate(zip(b['ID'],b['Aspect'])):
            v,a=logits[i]
            preds.append((ID,A,f"{v:.2f}#{a:.2f}"))

len(preds)


Inference:   0%|          | 0/20 [00:00<?, ?it/s]

319

In [9]:

OUT='pred_jpn_finance.jsonl'
sub={}

for ID,A,VA in preds:
    sub.setdefault(ID,[]).append({"Aspect":A,"VA":VA})

with open(OUT,'w',encoding='utf8') as f:
    for ex in dev_json:
        f.write(json.dumps({
            "ID":ex["ID"],
            "Aspect_VA":sub.get(ex["ID"],[])
        }, ensure_ascii=False) + "\n")

os.path.exists(OUT), os.path.getsize(OUT)


(True, 26342)