In [1]:
from PIL import Image
import requests
from transformers import ChineseCLIPProcessor, ChineseCLIPModel, Trainer,TrainingArguments
import torch
from datasets import load_dataset,Dataset
from pathlib import Path
import pandas as pd 
import numpy as np 
from tqdm import tqdm 

In [2]:
model_name_or_path = "OFA-Sys/chinese-clip-vit-large-patch14"
model = ChineseCLIPModel.from_pretrained(model_name_or_path)
processor = ChineseCLIPProcessor.from_pretrained(model_name_or_path)

In [3]:
text_str = ['一个汽车', '广东脆皮烧鸭做得美不美 这几点广式烧鸭做法秘诀很重要', '开心麻花大电影定档 《夏洛特烦恼》逗比老清新',"赢了这一场硬仗,篮网可以安心等欧文回来了"]
processor(text=text_str)

{'input_ids': [[101, 671, 702, 3749, 6756, 102], [101, 2408, 691, 5546, 4649, 4173, 7890, 976, 2533, 5401, 679, 5401, 6821, 1126, 4157, 2408, 2466, 4173, 7890, 976, 3791, 4908, 6394, 2523, 7028, 6206, 102], [101, 2458, 2552, 7937, 5709, 1920, 4510, 2512, 2137, 3440, 517, 1909, 3821, 4294, 4172, 2630, 518, 6856, 3683, 5439, 3926, 3173, 102], [101, 6617, 749, 6821, 671, 1767, 4801, 801, 117, 5074, 5381, 1377, 809, 2128, 2552, 5023, 3616, 3152, 1726, 3341, 749, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [4]:
image_str = "bigdata/image_data/test-31115.jpg"
image_input = Image.open(image_str)
processor(images=image_input)


{'pixel_values': [array([[[ 0.14932826,  0.14932826,  0.14932826, ..., -0.55139625,
         -0.5659947 , -0.5805931 ],
        [ 0.14932826,  0.14932826,  0.14932826, ..., -0.55139625,
         -0.5659947 , -0.5805931 ],
        [ 0.14932826,  0.14932826,  0.14932826, ..., -0.55139625,
         -0.55139625, -0.5659947 ],
        ...,
        [-0.20103405, -0.12804192,  0.07633615, ..., -1.0331444 ,
         -0.7995695 , -0.6973805 ],
        [-0.24482933, -0.47840413, -0.5805931 , ..., -0.9893491 ,
         -0.94555384, -0.78497106],
        [-0.15723877, -0.43460885, -0.60978997, ..., -0.7703726 ,
         -0.8579632 , -0.78497106]],

       [[ 1.0843711 ,  1.0543556 ,  1.0393478 , ...,  0.55909926,
          0.54409146,  0.5290837 ],
        [ 1.0543556 ,  1.0543556 ,  1.0393478 , ...,  0.55909926,
          0.54409146,  0.5290837 ],
        [ 1.0543556 ,  1.0393478 ,  1.0393478 , ...,  0.55909926,
          0.55909926,  0.54409146],
        ...,
        [-0.10124261, -0.04121154,  

In [5]:
def tokenizer_text(examples) :
    res = processor(text=examples['text'],max_length=64, padding="max_length", return_tensors="pt", truncation=True)
    examples['input_ids'] = res['input_ids']
    examples['attention_mask'] = res['attention_mask']
    return examples

def transform_images(examples):
    images = [Image.open(i) for i in examples['image_path']]
    images = [processor(images=i, return_tensors="pt")['pixel_values'] for i in images]
    examples['pixel_values'] = images
    return examples


dataset = Dataset.from_pandas(df=pd.read_csv("bigdata/clean_train_test/train.csv"))
dataset = dataset.train_test_split(test_size=0.0002)
dataset = dataset.map(
    function=tokenizer_text,
    batched=True
)
# dataset = dataset.map(
#     function=transform_images,
#     batched=True
# )

dataset.set_transform(transform=transform_images)

dataset


  0%|          | 0/1346 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['image_path', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1345373
    })
    test: Dataset({
        features: ['image_path', 'text', 'input_ids', 'attention_mask'],
        num_rows: 270
    })
})

In [6]:
# dataset['train'][2]['input_ids']
dataset['train'][3]['pixel_values'].shape

torch.Size([1, 3, 224, 224])

In [7]:
a = set(torch.tensor(dataset['train'][i]['input_ids']).shape for i in tqdm(range(200)))
a

100%|██████████| 200/200 [00:07<00:00, 28.25it/s]


{torch.Size([64])}

In [8]:
b = set(torch.tensor(dataset['train'][i]['attention_mask']).shape for i in tqdm(range(200)))
b

100%|██████████| 200/200 [00:02<00:00, 77.88it/s]


{torch.Size([64])}

In [9]:
torch.stack([torch.tensor(dataset['train'][0]['input_ids']),
             torch.tensor(dataset['train'][0]['input_ids'])]).shape

torch.Size([2, 64])

In [10]:
dataset['train'][0]['pixel_values'].squeeze(0).shape

torch.Size([3, 224, 224])

In [11]:
torch.stack([dataset['train'][0]['pixel_values'].squeeze(0),
             dataset['train'][0]['pixel_values'].squeeze(0),]).shape

torch.Size([2, 3, 224, 224])

In [12]:
# for i in tqdm(range(dataset['train'].__len__())):
#     try:
#         image = dataset['train'][i]['image_path']
#         image = Image.open(image)
#         processor(images=image, return_tensors="pt")
#     except Exception as e:
#         print(dataset['train'][i]['image_path'])
#         break

    

In [14]:
# dataset['train'][5374]#['image_path']

In [15]:


image = dataset['train'][5374]['image_path']
image = Image.open(image)
processor(images=image, return_tensors="pt")

{'pixel_values': tensor([[[[ 0.2515,  0.4705,  0.4559,  ...,  0.0179,  0.0033,  0.0033],
          [ 0.3683,  0.5873,  0.6457,  ...,  0.0033, -0.0113, -0.0113],
          [ 0.3975,  0.4705,  0.6019,  ...,  0.0179,  0.0617,  0.0763],
          ...,
          [ 0.8792,  0.7041,  0.4121,  ...,  1.1858,  1.2296,  1.1712],
          [ 0.5581,  0.4413,  0.2953,  ...,  1.1712,  1.1858,  1.2004],
          [-0.0988, -0.1280, -0.1572,  ...,  1.1128,  1.1566,  1.1712]],

         [[ 0.3340,  0.5591,  0.5441,  ...,  0.0488,  0.0338,  0.0338],
          [ 0.4540,  0.6792,  0.7392,  ...,  0.0338,  0.0188,  0.0188],
          [ 0.4841,  0.5591,  0.6942,  ...,  0.0488,  0.0939,  0.1089],
          ...,
          [ 1.1594,  0.9793,  0.6792,  ...,  1.2344,  1.2795,  1.2194],
          [ 0.7842,  0.6642,  0.5141,  ...,  1.2194,  1.2344,  1.2495],
          [ 0.0789,  0.0488,  0.0338,  ...,  1.1594,  1.2044,  1.2194]],

         [[ 0.5675,  0.7808,  0.7666,  ...,  0.1835,  0.1693,  0.1693],
          [ 0

In [17]:
def collate_fn(examples):
    pixel_values = torch.stack([i['pixel_values'].squeeze(0) for i in examples])
    input_ids = torch.tensor([example["input_ids"] for example in examples], dtype=torch.long)#torch.stack([torch.tensor(i, dtype=torch.long) for i in examples['input_ids']])
    attention_mask = torch.tensor([example["attention_mask"] for example in examples], dtype=torch.long)#torch.stack([torch.tensor(i) for i in examples['attention_mask']])
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "return_loss": True,
    }


train_argument = TrainingArguments(
    output_dir="clip_chinese_02",
    per_device_train_batch_size=28,
    per_device_eval_batch_size=28,
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=1000,
    fp16=True,
    remove_unused_columns=False,
    save_total_limit=4

)



trainer = Trainer(
    model=model,
    args=train_argument,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=collate_fn,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 1345373
  Num Epochs = 1
  Instantaneous batch size per device = 28
  Total train batch size (w. parallel, distributed & accumulation) = 56
  Gradient Accumulation steps = 2
  Total optimization steps = 24025
  Number of trainable parameters = 406233089


  0%|          | 0/24025 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3329, 'learning_rate': 0.0004978699183320599, 'epoch': 0.04}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-1000
Configuration saved in clip_chinese_02\checkpoint-1000\config.json


{'eval_loss': 3.3026041984558105, 'eval_runtime': 9.1508, 'eval_samples_per_second': 29.506, 'eval_steps_per_second': 1.093, 'epoch': 0.04}


Model weights saved in clip_chinese_02\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3321, 'learning_rate': 0.0004915075250885755, 'epoch': 0.08}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-2000
Configuration saved in clip_chinese_02\checkpoint-2000\config.json


{'eval_loss': 3.3026041984558105, 'eval_runtime': 3.4482, 'eval_samples_per_second': 78.302, 'eval_steps_per_second': 2.9, 'epoch': 0.08}


Model weights saved in clip_chinese_02\checkpoint-2000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-20] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3321, 'learning_rate': 0.000481033946311067, 'epoch': 0.12}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-3000
Configuration saved in clip_chinese_02\checkpoint-3000\config.json


{'eval_loss': 3.3026041984558105, 'eval_runtime': 3.3923, 'eval_samples_per_second': 79.592, 'eval_steps_per_second': 2.948, 'epoch': 0.12}


Model weights saved in clip_chinese_02\checkpoint-3000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-40] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3322, 'learning_rate': 0.0004666070773952711, 'epoch': 0.17}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-4000
Configuration saved in clip_chinese_02\checkpoint-4000\config.json


{'eval_loss': 3.3026041984558105, 'eval_runtime': 3.3887, 'eval_samples_per_second': 79.676, 'eval_steps_per_second': 2.951, 'epoch': 0.17}


Model weights saved in clip_chinese_02\checkpoint-4000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-60] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3321, 'learning_rate': 0.00044850157434820096, 'epoch': 0.21}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-5000
Configuration saved in clip_chinese_02\checkpoint-5000\config.json


{'eval_loss': 3.3026041984558105, 'eval_runtime': 3.3772, 'eval_samples_per_second': 79.949, 'eval_steps_per_second': 2.961, 'epoch': 0.21}


Model weights saved in clip_chinese_02\checkpoint-5000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3321, 'learning_rate': 0.000426990388338636, 'epoch': 0.25}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-6000
Configuration saved in clip_chinese_02\checkpoint-6000\config.json


{'eval_loss': 3.3026041984558105, 'eval_runtime': 3.4124, 'eval_samples_per_second': 79.122, 'eval_steps_per_second': 2.93, 'epoch': 0.25}


Model weights saved in clip_chinese_02\checkpoint-6000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-2000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3321, 'learning_rate': 0.0004024830452650919, 'epoch': 0.29}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-7000
Configuration saved in clip_chinese_02\checkpoint-7000\config.json


{'eval_loss': 3.3026041984558105, 'eval_runtime': 3.7669, 'eval_samples_per_second': 71.676, 'eval_steps_per_second': 2.655, 'epoch': 0.29}


Model weights saved in clip_chinese_02\checkpoint-7000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-3000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.4755, 'learning_rate': 0.00037537729147642094, 'epoch': 0.33}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-8000
Configuration saved in clip_chinese_02\checkpoint-8000\config.json


{'eval_loss': 27.78020477294922, 'eval_runtime': 3.3844, 'eval_samples_per_second': 79.777, 'eval_steps_per_second': 2.955, 'epoch': 0.33}


Model weights saved in clip_chinese_02\checkpoint-8000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-4000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3596, 'learning_rate': 0.00034610484869133153, 'epoch': 0.37}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-9000
Configuration saved in clip_chinese_02\checkpoint-9000\config.json


{'eval_loss': 3.301671266555786, 'eval_runtime': 3.3943, 'eval_samples_per_second': 79.546, 'eval_steps_per_second': 2.946, 'epoch': 0.37}


Model weights saved in clip_chinese_02\checkpoint-9000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-5000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 270
  Batch size = 28


{'loss': 3.3321, 'learning_rate': 0.00031519144087211273, 'epoch': 0.42}


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to clip_chinese_02\checkpoint-10000
Configuration saved in clip_chinese_02\checkpoint-10000\config.json


{'eval_loss': 3.3030922412872314, 'eval_runtime': 3.3937, 'eval_samples_per_second': 79.56, 'eval_steps_per_second': 2.947, 'epoch': 0.42}


Model weights saved in clip_chinese_02\checkpoint-10000\pytorch_model.bin
Deleting older checkpoint [clip_chinese_02\checkpoint-6000] due to args.save_total_limit


In [None]:
dataset['train'][5374]#['image_path']