In [6]:
# *torch
from pickletools import optimize
# from sched import scheduler
import torch
import torch.backends.cudnn as cudnn
from torch.optim import lr_scheduler as scheduler
from torch.nn.utils.rnn import pad_sequence
from torch.nn import functional as F
from torch import nn
from torch.utils.data import DataLoader

# *transformers
from transformers import MBartForConditionalGeneration, MBartTokenizer, MBartConfig


import utils as utils


# *basic
import os
import time
import shutil
import json, datetime
import numpy as np
from collections import OrderedDict
from tqdm import tqdm
import yaml
import random
import wandb
import copy
from pathlib import Path
import math
import sys
from typing import Iterable, Optional
from loguru import logger

from sacrebleu.metrics import BLEU, CHRF, TER

# *timm
from timm.optim import create_optimizer
from timm.scheduler import create_scheduler
from timm.utils import NativeScaler
from timm.loss import SoftTargetCrossEntropy
from timm.optim import AdamW

# visualization
from torchvision.utils import save_image, make_grid
from PIL import Image
import argparse
from hpman.m import _
import hpargparse

# global definition
from definition import *

import gzip
import pickle
import torch


In [46]:

from prep_args import * 
parser = argparse.ArgumentParser('Visual-Language-Pretraining (VLP) V2 scripts', parents=[get_args_parser()])

hpargparse.bind(parser, _)
args = parser.parse_args()
args

Namespace(batch_size=16, clip_grad=None, config='./config_gloss_free_CSL daily.yaml', cooldown_epochs=10, decay_epochs=30, decay_rate=0.1, device='cuda', dist_url='env://', entity=None, epochs=80, eval=False, finetune='c:\\Users\\User\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-37492a5B4THCMXA7j.json', hp_detail=False, hp_exit=False, hp_list=None, hp_load=None, hp_save=None, hp_serial_format='auto', input_size=224, local_rank=0, log_all=False, loss_lambda=1.0, lr=0.001, lr_noise=None, lr_noise_pct=0.67, lr_noise_std=1.0, min_lr=1e-08, momentum=0.9, noise_rate=0.15, noise_type='omit_last', num_workers=10, opt='adamw', opt_betas=None, opt_eps=1e-09, output_dir='', patience_epochs=10, pin_mem=True, project='VLP', random_shuffle=False, resize=256, resume='', sched='cosine', seed=0, start_epoch=0, training_refurbish=True, warmup_epochs=0, warmup_lr=1e-06, weight_decay=0.0, world_size=1)

In [10]:
with open(args.config, 'r+', encoding='utf-8') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

config

{'name': 'GFSLT-VLP CSL-Daily',
 'data': {'train': '../../CSL-Daily/sentence_label/processed/labels_train.pkl',
  'dev': '../../CSL-Daily/sentence_label/processed/labels_dev.pkl',
  'test': '../../CSL-Daily/sentence_label/processed/labels_test.pkl',
  'img_path': '../../CSL-Daily/sentence/frames_512x512',
  'max_length': 300},
 'training': {'wandb': 'disabled', 'scale_embedding': False},
 'model': {'transformer': './pretrain_models/CSL/MBart_trimmed',
  'visual_encoder': './pretrain_models/CSL/mytran',
  'sign_proj': True}}

In [4]:
from transformers import MBart50Tokenizer, MBartForConditionalGeneration

# Load the MBART tokenizer
tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# Example: Chinese sentences from the CSL-daily dataset
texts = ["这是一个例子句子。", "这是另一个例子句子。"]

# Tokenize the Chinese text data
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
print (tokenized_texts)
input_ids = tokenized_texts['input_ids']
print(input_ids)
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
output = model.generate(input_ids, max_length=50)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': tensor([[250004,      6, 100013, 101676,  27683,   1344,     30,      2,      1],
        [250004,      6,   8513,  83757, 101676,  27683,   1344,     30,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[250004,      6, 100013, 101676,  27683,   1344,     30,      2,      1],
        [250004,      6,   8513,  83757, 101676,  27683,   1344,     30,      2]])


Downloading model.safetensors: 100%|██████████| 2.44G/2.44G [01:20<00:00, 30.5MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading generation_config.json: 100%|██████████| 261/261 [00:00<00:00, 65.3kB/s]


这是一个例子句子。


In [60]:
import importlib
import data_classes
importlib.reload(data_classes)
from data_classes import * 

train_dataset = S2T_Dataset(tokenizer, config, args, 'train', training_refurbish = False)
print(type(train_dataset.data_list))

<class 'list'>


In [41]:
train_dataset[0][1].shape

['000000.jpg', '000001.jpg', '000002.jpg', '000003.jpg', '000004.jpg', '000005.jpg', '000006.jpg', '000007.jpg', '000008.jpg', '000009.jpg', '000010.jpg', '000011.jpg', '000012.jpg', '000013.jpg', '000014.jpg', '000015.jpg', '000016.jpg', '000017.jpg', '000018.jpg', '000019.jpg', '000020.jpg', '000021.jpg', '000022.jpg', '000023.jpg', '000024.jpg', '000025.jpg', '000026.jpg', '000027.jpg', '000028.jpg', '000029.jpg', '000030.jpg', '000031.jpg', '000032.jpg', '000033.jpg', '000034.jpg', '000035.jpg', '000036.jpg', '000037.jpg', '000038.jpg', '000039.jpg', '000040.jpg', '000041.jpg', '000042.jpg', '000043.jpg', '000044.jpg', '000045.jpg', '000046.jpg', '000047.jpg', '000048.jpg', '000049.jpg', '000050.jpg', '000051.jpg']


torch.Size([52, 3, 224, 224])

In [63]:
import prep_dataloaders
importlib.reload(prep_dataloaders)
from prep_dataloaders import * 
trainloader, devloader, testloader = create_dataloaders(config, args)

Creating datasets:
#total train set: 18401.
#total dev set: 1077.
#total test set: 1176.
