In [1]:
!pip install transformers



In [2]:
from transformers import pipeline
#---------------------------------------------------#
#                     NLP TASKS                     #
#---------------------------------------------------#

'''
1. Text Classification: Assigning a category to a piece of text.
Sentiment Analysis
Topic Classification
Spam Detection '''

classifier = pipeline("text-classification")

'''
2. Token Classification: Assigning labels to individual tokens in a sequence.
Named Entity Recognition (NER)
Part-of-Speech Tagging
'''

token_classifier = pipeline("token-classification")

'''
3. Question Answering: Extracting an answer from a given context based on a question.
'''
question_answerer = pipeline("question-answering")

'''
4. Text Generation: Generating text based on a given prompt.
Language Modeling
Story Generation

'''

text_generator = pipeline("text-generation")

'''
5. Summarization: Condensing long documents into shorter summaries.
'''

summarizer = pipeline("summarization")

'''
Translation: Translating text from one language to another.
'''

translator = pipeline("translation",
                      model="Helsinki-NLP/opus-mt-en-fr")

'''
6. Text2Text Generation: General-purpose text transformation, including summarization and translation.
'''

text2text_generator = pipeline("text2text-generation")

'''
7. Fill-Mask: Predicting the masked token in a sequence.
'''

fill_mask = pipeline("fill-mask")

'''
8. Feature Extraction: Extracting hidden states or features from text.
'''

feature_extractor = pipeline("feature-extraction")

'''
9. Sentence Similarity: Measuring the similarity between two sentences.
'''


#---------------------------------------------------#
#             Computer Vision TASKS                 #
#---------------------------------------------------#

'''
1. Image Classification: Classifying the main content of an image.

'''

image_classifier = pipeline("image-classification")

'''
2. Object Detection: Identifying objects within an image and their bounding boxes.
'''

object_detector = pipeline("object-detection")

'''
3. Image Segmentation: Segmenting different parts of an image into classes.
'''

image_segmenter = pipeline("image-segmentation")

'''
4. Image Generation: Generating images from textual descriptions (using DALL-E or similar models).
'''

#---------------------------------------------------#
#             Speech Processing TASKS               #
#---------------------------------------------------#

'''
1. utomatic Speech Recognition (ASR): Converting spoken language into text.
'''

speech_recognizer = pipeline("automatic-speech-recognition")

'''
2. Speech Translation: Translating spoken language from one language to another.
3. Audio Classification: Classifying audio signals into predefined categories.
'''

#---------------------------------------------------#
#                   Multimodal TASKS                #
#---------------------------------------------------#

'''
1. Image Captioning: Generating a textual description of an image.
'''
image_captioner = pipeline("image-to-text")
'''
2. Visual Question Answering (VQA): Answering questions about the content of an image.
'''

#---------------------------------------------------#
#                     Other TASKS                   #
#---------------------------------------------------#
'''
1. Table Question Answering: Answering questions based on tabular data.
'''
table_qa = pipeline("table-question-answering")

'''
2. Document Question Answering: Extracting answers from documents like PDFs.

'''
doc_qa = pipeline("document-question-answering")
'''
3. Time Series Forecasting: Predicting future values in time series data (not directly supported in the main Transformers library but available through extensions).
'''

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to google-t5/t5-base and revision a9723ea (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilroberta-base and revision fb53ab8 (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilbert-base-cased and revision 6ea8117 (https://huggingface.co/distilbert/distilbert-base-cased).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to google/vit-base-patch16-224 and revision 3f49326 (https://huggingface.co/google/vit-base-patch16-224).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Device set to use cuda:0
No model was supplied, defaulted to facebook/detr-resnet-50 and revision 1d5f47b (https://huggingface.co/facebook/detr-resnet-50).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


preprocessor_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0
No model was supplied, defaulted to facebook/detr-resnet-50-panoptic and revision d53b52a (https://huggingface.co/facebook/detr-resnet-50-panoptic).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/172M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/172M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50-panoptic were not used when initializing DetrForSegmentation: ['detr.model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


preprocessor_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 22aad52 (https://huggingface.co/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to ydshieh/vit-gpt2-coco-en and revision 5bebf1e (https://huggingface.co/ydshieh/vit-gpt2-coco-en).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to google/tapas-base-finetuned-wtq and revision e3dde19 (https://huggingface.co/google/tapas-base-finetuned-wtq).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json: 0.00B [00:00, ?B/s]

TAPAS models are not usable since `tensorflow_probability` can't be loaded. It seems you have `tensorflow_probability` installed with the wrong tensorflow version. Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability.


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/490 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to impira/layoutlm-document-qa and revision beed3c4 (https://huggingface.co/impira/layoutlm-document-qa).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/511M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


'\n3. Time Series Forecasting: Predicting future values in time series data (not directly supported in the main Transformers library but available through extensions).\n'

# Sentiment Analysis

In [3]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I was so not happy with the last Mission Impossible Movie")
print(result)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'NEGATIVE', 'score': 0.9997795224189758}]


In [4]:
pipeline(task = "sentiment-analysis")("I was confused with the Barbie Movie")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'NEGATIVE', 'score': 0.9992005228996277}]

In [5]:
pipeline(task = "sentiment-analysis")\
                                      ("Everyday lots of LLMs papers are published about LLMs Evlauation. \
                                      Lots of them Looks very Promising. \
                                      I am not sure if we CAN actually Evaluate LLMs. \
                                      There is still lots to do.\
                                      Don't you think?")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9964345693588257}]

In [6]:
pipeline(task = "sentiment-analysis", model="facebook/bart-large-mnli")\
                                      ("Everyday lots of LLMs papers are published about LLMs Evlauation. \
                                      Lots of them Looks very Promising. \
                                      I am not sure if we CAN actually Evaluate LLMs. \
                                      There is still lots to do.\
                                      Don't you think?")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


[{'label': 'neutral', 'score': 0.7693331837654114}]

# Batch Senteniment Analysis

In [7]:
classifier = pipeline(task = "sentiment-analysis")

task_list = ["I really like Autoencoders, best models for Anomaly Detection", \
            "I am sure if we CAN actually Evaluate LLMs.", \
            "PassiveAgressive is the name of a Linear Regression Model that so many people do not know.",\
            "I love long Meetings."]
classifier(task_list)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.9978686571121216},
 {'label': 'NEGATIVE', 'score': 0.9972532391548157},
 {'label': 'NEGATIVE', 'score': 0.9983084201812744},
 {'label': 'POSITIVE', 'score': 0.9994850158691406}]

In [8]:
classifier = pipeline(task = "sentiment-analysis", model = "SamLowe/roberta-base-go_emotions")

task_list = ["I really like Autoencoders, best models for Anomaly Detection", \
            "I am not sure if we CAN actually Evaluate LLMs.", \
            "PassiveAgressive is the name of a Linear Regression Model that so many people do not know. It is pretty funny name for a Regression Model.",\
            "I hate long Meetings.",\
             "I Love you",\
             "what is this"]
classifier(task_list)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0
  return forward_call(*args, **kwargs)


[{'label': 'admiration', 'score': 0.7406536340713501},
 {'label': 'confusion', 'score': 0.9066852331161499},
 {'label': 'amusement', 'score': 0.9083253145217896},
 {'label': 'anger', 'score': 0.7870621085166931},
 {'label': 'love', 'score': 0.9517903327941895},
 {'label': 'neutral', 'score': 0.588139533996582}]

In [9]:
# Use a pipeline as a high-level helper
from transformers import pipeline

text_generator = pipeline("text-generation", model="distilbert/distilgpt2")
generated_text = text_generator("Today is a rainy day in London",
                                truncation=True,
                                num_return_sequences = 2)
print("Generated_text:\n ", generated_text[0]['generated_text'])

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated_text:
  Today is a rainy day in London and a storm is coming.



It is also the second time in nearly six years that a woman has died in the capital.

There is a similar pattern in the UK, with people dying in the capital, London and the south-east.
One woman, who died in the mid-1990s in South Wales, died in the central capital of London on Sunday.
The death comes as the Met has launched a new investigation into what has been a massive and terrifying event leading to the deaths of more than 1,000 people in the city.
Ms Leitch, who worked with police in London, said: "We have seen a number of deaths at a time.
"There are a couple of very similar events happening in London, but they are very different.
"There are a couple of very similar incidents at a time and the same is possible in the capital."


# Question Answering

In [10]:
from transformers import pipeline

qa_model = pipeline("question-answering")
question = "what is my job?"
context = "I am devloping AI model with Python"
qa_model(question = question, context = context)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


{'score': 0.6085911904810928,
 'start': 5,
 'end': 23,
 'answer': 'devloping AI model'}

# Tokenization

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification

In [12]:
model_name2 = "nlptown/bert-base-multilingual-uncased-sentiment"

mymodel2 = AutoModelForSequenceClassification.from_pretrained(model_name2)

mytokenizer2 = AutoTokenizer.from_pretrained(model_name2)

classifier = pipeline("sentiment-analysis", model = mymodel2 , tokenizer = mytokenizer2)

res = classifier("I was so not happy with the Barbie Movie")
print(res)

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': '2 stars', 'score': 0.5099301934242249}]


  return forward_call(*args, **kwargs)


In [13]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example text
text = "I was so not happy with the Barbie Movie"

# Tokenize the text
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokens: ['i', 'was', 'so', 'not', 'happy', 'with', 'the', 'barbie', 'movie']


In [14]:
# Convert tokens to input IDs

input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)



Input IDs: [1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185]


In [16]:

# Encode the text (tokenization + converting to input IDs)

encoded_input = tokenizer(text)
print("Encoded Input:", encoded_input)


Encoded Input: {'input_ids': [101, 1045, 2001, 2061, 2025, 3407, 2007, 1996, 22635, 3185, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [17]:
# Decode the text

decoded_output = tokenizer.decode(input_ids)
print("Decoded Output: ", decoded_output)


Decoded Output:  i was so not happy with the barbie movie


In [18]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Example text
text = "I was so not happy with the Barbie Movie"

# Tokenize the text
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Convert tokens to input IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)

# Encode the text (tokenization + converting to input IDs)
encoded_input = tokenizer(text)
print("Encoded Input:", encoded_input)

# Decode the text
decoded_output = tokenizer.decode(input_ids)
print("Decode Output: ", decoded_output)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Tokens: ['I', 'was', 'so', 'not', 'happy', 'with', 'the', 'Barbie', 'Movie']
Input IDs: [146, 1108, 1177, 1136, 2816, 1114, 1103, 25374, 8275]
Encoded Input: {'input_ids': [101, 146, 1108, 1177, 1136, 2816, 1114, 1103, 25374, 8275, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decode Output:  I was so not happy with the Barbie Movie


In [19]:
!pip install datasets



In [20]:
from datasets import load_dataset
dataset = load_dataset('imdb')

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [22]:
dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

# HTML Tag remove

In [26]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

# Apply the function to the 'text' column in each split of the dataset
dataset['train'] = dataset['train'].map(lambda x: {'text': remove_html_tags(x['text'])})
dataset['test'] = dataset['test'].map(lambda x: {'text': remove_html_tags(x['text'])})
dataset['unsupervised'] = dataset['unsupervised'].map(lambda x: {'text': remove_html_tags(x['text'])})

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [27]:
dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it\'s n

# URL remove

In [28]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

# Apply the function to the 'text' column in each split of the dataset
dataset['train'] = dataset['train'].map(lambda x: {'text': remove_html_tags(x['text'])})
dataset['test'] = dataset['test'].map(lambda x: {'text': remove_html_tags(x['text'])})
dataset['unsupervised'] = dataset['unsupervised'].map(lambda x: {'text': remove_html_tags(x['text'])})

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

## Step 3: Preprocess the Data
Tokenize the dataset using the tokenizer associated with the pre-trained model.

In [29]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [30]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [31]:
tokenized_datasets["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it\'s n

## Step 4: Set Up the Training Arguments
Specify the hyperparameters and training settings

In [35]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    eval_strategy ="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,  # Reduced batch size for training
    per_device_eval_batch_size=8,   # Reduced batch size for evaluation
    num_train_epochs=1,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
)
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False

## Step 5: Initialize the Model
Load the pre-trained model and define the training proced

In [33]:
from transformers import AutoModelForSequenceClassification, Trainer

# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step 6: Train the Model
Fine-tune the pre-trained model on your specific dataset.

In [37]:
# Train the model
# Reinitialize the Trainer to avoid AcceleratorState error
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)
trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.2279,0.220828


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=3125, training_loss=0.27808583740234377, metrics={'train_runtime': 3655.4197, 'train_samples_per_second': 6.839, 'train_steps_per_second': 0.855, 'total_flos': 6577776384000000.0, 'train_loss': 0.27808583740234377, 'epoch': 1.0})

## Step 7: Evaluate the Model
Assess the model's performance on a validation set.

In [38]:
# Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.2208280861377716, 'eval_runtime': 758.683, 'eval_samples_per_second': 32.952, 'eval_steps_per_second': 4.119, 'epoch': 1.0}


## Step 8: Save the Fine-Tuned Model
Save the fine-tuned model for later use.

In [39]:
# Save the model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-tokenizer')

('./fine-tuned-tokenizer/tokenizer_config.json',
 './fine-tuned-tokenizer/special_tokens_map.json',
 './fine-tuned-tokenizer/vocab.txt',
 './fine-tuned-tokenizer/added_tokens.json',
 './fine-tuned-tokenizer/tokenizer.json')

# ArXiv Project

In [40]:
!pip install arxiv

Collecting arxiv
  Downloading arxiv-2.2.0-py3-none-any.whl.metadata (6.3 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.2.0-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=e101e5051d919f56577d5fc858e3865cd3a5b4fb2a56eb0696a8396c5bc4c301
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packag

In [41]:
import arxiv
import pandas as pd

In [42]:
# Query to fetch AI-related papers
query = 'ai OR artificial intelligence OR machine learning'
search = arxiv.Search(query=query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate)

# Fetch papers
papers = []
for result in search.results():
    papers.append({
      'published': result.published,
        'title': result.title,
        'abstract': result.summary,
        'categories': result.categories
    })

# Convert to DataFrame
df = pd.DataFrame(papers)

pd.set_option('display.max_colwidth', None)
df.head(10)

  for result in search.results():


Unnamed: 0,published,title,abstract,categories
0,2025-08-04 17:59:38+00:00,MedVLThinker: Simple Baselines for Multimodal Medical Reasoning,"Large Reasoning Models (LRMs) have introduced a new paradigm in AI by\nenabling models to ``think before responding"" via chain-of-thought reasoning.\nHowever, the absence of open and reproducible recipes for building\nreasoning-centric medical LMMs hinders community-wide research, analysis, and\ncomparison. In this paper, we present MedVLThinker, a suite of simple yet\nstrong baselines. Our fully open recipe consists of: (1) systematic data\ncuration for both text-only and image-text medical data, filtered according to\nvarying levels of reasoning difficulty, and (2) two training paradigms:\nSupervised Fine-Tuning (SFT) on distilled reasoning traces and Reinforcement\nLearning with Verifiable Rewards (RLVR) based on final answer correctness.\nAcross extensive experiments on the Qwen2.5-VL model family (3B, 7B) and six\nmedical QA benchmarks, we find that RLVR consistently and significantly\noutperforms SFT. Additionally, under the RLVR framework, a key,\ncounter-intuitive finding is that training on our curated text-only reasoning\ndata provides a more substantial performance boost than training on multimodal\nimage-text data. Our best open 7B model, trained using the RLVR recipe on\ntext-only data, establishes a new state-of-the-art on existing public VQA\nbenchmarks, surpassing all previous open-source medical LMMs. Furthermore,\nscaling our model to 32B achieves performance on par with the proprietary\nGPT-4o. We release all curated data, models, and code to provide the community\nwith a strong, open foundation for future research in multimodal medical\nreasoning.",[cs.CV]
1,2025-08-04 17:58:22+00:00,LOST: Low-rank and Sparse Pre-training for Large Language Models,"While large language models (LLMs) have achieved remarkable performance\nacross a wide range of tasks, their massive scale incurs prohibitive\ncomputational and memory costs for pre-training from scratch. Recent studies\nhave investigated the use of low-rank parameterization as a means of reducing\nmodel size and training cost. In this context, sparsity is often employed as a\ncomplementary technique to recover important information lost in low-rank\ncompression by capturing salient features in the residual space. However,\nexisting approaches typically combine low-rank and sparse components in a\nsimplistic or ad hoc manner, often resulting in undesirable performance\ndegradation compared to full-rank training. In this paper, we propose\n\textbf{LO}w-rank and \textbf{S}parse pre-\textbf{T}raining (\textbf{LOST}) for\nLLMs, a novel method that ingeniously integrates low-rank and sparse structures\nto enable effective training of LLMs from scratch under strict efficiency\nconstraints. LOST applies singular value decomposition to weight matrices,\npreserving the dominant low-rank components, while allocating the remaining\nsingular values to construct channel-wise sparse components to complement the\nexpressiveness of low-rank training. We evaluate LOST on LLM pretraining\nranging from 60M to 7B parameters. Our experiments show that LOST achieves\ncompetitive or superior performance compared to full-rank models, while\nsignificantly reducing both memory and compute overhead. Moreover, Code is\navailable at\n\href{https://github.com/JiaxiLi1/LOST-Low-rank-and-Sparse-Training-for-Large-Language-Models}{LOST\nRepo}",[cs.LG]
2,2025-08-04 17:49:37+00:00,PMGS: Reconstruction of Projectile Motion across Large Spatiotemporal Spans via 3D Gaussian Splatting,"Modeling complex rigid motion across large spatiotemporal spans remains an\nunresolved challenge in dynamic reconstruction. Existing paradigms are mainly\nconfined to short-term, small-scale deformation and offer limited consideration\nfor physical consistency. This study proposes PMGS, focusing on reconstructing\nProjectile Motion via 3D Gaussian Splatting. The workflow comprises two stages:\n1) Target Modeling: achieving object-centralized reconstruction through dynamic\nscene decomposition and an improved point density control; 2) Motion Recovery:\nrestoring full motion sequences by learning per-frame SE(3) poses. We introduce\nan acceleration consistency constraint to bridge Newtonian mechanics and pose\nestimation, and design a dynamic simulated annealing strategy that adaptively\nschedules learning rates based on motion states. Futhermore, we devise a Kalman\nfusion scheme to optimize error accumulation from multi-source observations to\nmitigate disturbances. Experiments show PMGS's superior performance in\nreconstructing high-speed nonlinear rigid motion compared to mainstream dynamic\nmethods.",[cs.CV]
3,2025-08-04 17:43:22+00:00,Open Molecular Crystals 2025 (OMC25) Dataset and Models,"The development of accurate and efficient machine learning models for\npredicting the structure and properties of molecular crystals has been hindered\nby the scarcity of publicly available datasets of structures with property\nlabels. To address this challenge, we introduce the Open Molecular Crystals\n2025 (OMC25) dataset, a collection of over 27 million molecular crystal\nstructures containing 12 elements and up to 300 atoms in the unit cell. The\ndataset was generated from dispersion-inclusive density functional theory (DFT)\nrelaxation trajectories of over 230,000 randomly generated molecular crystal\nstructures of around 50,000 organic molecules. OMC25 comprises diverse chemical\ncompounds capable of forming different intermolecular interactions and a wide\nrange of crystal packing motifs. We provide detailed information on the\ndataset's construction, composition, structure, and properties. To demonstrate\nthe quality and use cases of OMC25, we further trained and evaluated\nstate-of-the-art open-source machine learning interatomic potentials. By making\nthis dataset publicly available, we aim to accelerate the development of more\naccurate and efficient machine learning models for molecular crystals.",[physics.chem-ph]
4,2025-08-04 17:33:41+00:00,D2PPO: Diffusion Policy Policy Optimization with Dispersive Loss,"Diffusion policies excel at robotic manipulation by naturally modeling\nmultimodal action distributions in high-dimensional spaces. Nevertheless,\ndiffusion policies suffer from diffusion representation collapse: semantically\nsimilar observations are mapped to indistinguishable features, ultimately\nimpairing their ability to handle subtle but critical variations required for\ncomplex robotic manipulation. To address this problem, we propose D2PPO\n(Diffusion Policy Policy Optimization with Dispersive Loss). D2PPO introduces\ndispersive loss regularization that combats representation collapse by treating\nall hidden representations within each batch as negative pairs. D2PPO compels\nthe network to learn discriminative representations of similar observations,\nthereby enabling the policy to identify subtle yet crucial differences\nnecessary for precise manipulation. In evaluation, we find that early-layer\nregularization benefits simple tasks, while late-layer regularization sharply\nenhances performance on complex manipulation tasks. On RoboMimic benchmarks,\nD2PPO achieves an average improvement of 22.7% in pre-training and 26.1% after\nfine-tuning, setting new SOTA results. In comparison with SOTA, results of\nreal-world experiments on a Franka Emika Panda robot show the excitingly high\nsuccess rate of our method. The superiority of our method is especially evident\nin complex tasks. Project page: https://guowei-zou.github.io/d2ppo/",[cs.AI]
5,2025-08-04 17:33:36+00:00,CAK: Emergent Audio Effects from Minimal Deep Learning,"We demonstrate that a single 3x3 convolutional kernel can produce emergent\naudio effects when trained on 200 samples from a personalized corpus. We\nachieve this through two key techniques: (1) Conditioning Aware Kernels (CAK),\nwhere output = input + (learned_pattern x control), with a soft-gate mechanism\nsupporting identity preservation at zero control; and (2) AuGAN (Audit GAN),\nwhich reframes adversarial training from ""is this real?"" to ""did you apply the\nrequested value?"" Rather than learning to generate or detect forgeries, our\nnetworks cooperate to verify control application, discovering unique\ntransformations. The learned kernel exhibits a diagonal structure creating\nfrequency-dependent temporal shifts that are capable of producing musical\neffects based on input characteristics. Our results show the potential of\nadversarial training to discover audio transformations from minimal data,\nenabling new approaches to effect design.","[cs.LG, cs.SD, eess.AS]"
6,2025-08-04 17:25:55+00:00,FastCSP: Accelerated Molecular Crystal Structure Prediction with Universal Model for Atoms,"Crystal Structure Prediction (CSP) of molecular crystals plays a central role\nin applications, such as pharmaceuticals and organic electronics. CSP is\nchallenging and computationally expensive due to the need to explore a large\nsearch space with sufficient accuracy to capture energy differences of a few\nkJ/mol between polymorphs. Dispersion-inclusive density functional theory (DFT)\nprovides the required accuracy but its computational cost is impractical for a\nlarge number of putative structures. We introduce FastCSP, an open-source,\nhigh-throughput CSP workflow based on machine learning interatomic potentials\n(MLIPs). FastCSP combines random structure generation using Genarris 3.0 with\ngeometry relaxation and free energy calculations powered entirely by the\nUniversal Model for Atoms (UMA) MLIP. We benchmark FastCSP on a curated set of\n28 mostly rigid molecules, demonstrating that our workflow consistently\ngenerates known experimental structures and ranks them within 5 kJ/mol per\nmolecule of the global minimum. Our results demonstrate that universal MLIPs\ncan be used across diverse compounds without requiring system-specific tuning.\nMoreover, the speed and accuracy afforded by UMA eliminate the need for\nclassical force fields in the early stages of CSP and for final re-ranking with\nDFT. The open-source release of the entire FastCSP workflow significantly\nlowers the barrier to accessing CSP. CSP results for a single system can be\nobtained within hours on tens of modern GPUs, making high-throughput crystal\nstructure prediction feasible for a broad range of scientific applications.","[physics.chem-ph, cs.LG]"
7,2025-08-04 17:25:36+00:00,An Efficient Continuous-Time MILP for Integrated Aircraft Hangar Scheduling and Layout,"Efficient management of aircraft maintenance hangars is a critical\noperational challenge, involving complex, interdependent decisions regarding\naircraft scheduling and spatial allocation. This paper introduces a novel\ncontinuous-time mixed-integer linear programming (MILP) model to solve this\nintegrated spatio-temporal problem. By treating time as a continuous variable,\nour formulation overcomes the scalability limitations of traditional\ndiscrete-time approaches. The performance of the exact model is benchmarked\nagainst a constructive heuristic, and its practical applicability is\ndemonstrated through a custom-built visualization dashboard. Computational\nresults are compelling: the model solves instances with up to 25 aircraft to\nproven optimality, often in mere seconds, and for large-scale cases of up to 40\naircraft, delivers high-quality solutions within known optimality gaps. In all\ntested scenarios, the resulting solutions consistently and significantly\noutperform the heuristic, which highlights the framework's substantial economic\nbenefits and provides valuable managerial insights into the trade-off between\nsolution time and optimality.","[math.OC, cs.AI, cs.CE, 90C11 (Primary), 90B35, 90C27 (Secondary)]"
8,2025-08-04 17:23:14+00:00,Anticipating Decoherence: a Predictive Framework for Enhancing Coherence in Quantum Emitters,"Large-scale quantum systems require optical coherence between distant quantum\ndevices, necessitating spectral indistinguishability. Scalable solid-state\nplatforms offer promising routes to this goal. However, environmental\ndisorders, including dephasing, spectral diffusion, and spin-bath interactions,\ninfluence the emitters' spectra and deteriorate the coherence. Using\nstatistical theory, we identify correlations in spectral diffusion from slowly\nvarying environmental coupling, revealing predictable dynamics extendable to\nother disorders. Importantly, this could enable the development of an\nanticipatory framework for forecasting and decoherence engineering in remote\nquantum emitters. To validate this framework, we demonstrate that a machine\nlearning model trained on limited data can accurately forecast unseen spectral\nbehavior. Realization of such a model on distinct quantum emitters could reduce\nthe spectral shift by factors $\approx$ 2.1 to 15.8, depending on emitter\nstability, compared to no prediction. This work presents, for the first time,\nthe application of anticipatory systems and replica theory to quantum\ntechnology, along with the first experimental demonstration of internal\nprediction that generalizes across multiple quantum emitters. These results\npave the way for real-time decoherence engineering in scalable quantum systems.\nSuch capability could lead to enhanced optical coherence and multi-emitter\nsynchronization, with broad implications for quantum communication,\ncomputation, imaging, and sensing.",[quant-ph]
9,2025-08-04 17:23:00+00:00,Instance-Optimal Uniformity Testing and Tracking,"In the uniformity testing task, an algorithm is provided with samples from an\nunknown probability distribution over a (known) finite domain, and must decide\nwhether it is the uniform distribution, or, alternatively, if its total\nvariation distance from uniform exceeds some input distance parameter. This\nquestion has received a significant amount of interest and its complexity is,\nby now, fully settled. Yet, we argue that it fails to capture many scenarios of\ninterest, and that its very definition as a gap problem in terms of a\nprespecified distance may lead to suboptimal performance.\n To address these shortcomings, we introduce the problem of uniformity\ntracking, whereby an algorithm is required to detect deviations from uniformity\n(however they may manifest themselves) using as few samples as possible, and be\ncompetitive against an optimal algorithm knowing the distribution profile in\nhindsight. Our main contribution is a\n$\operatorname{polylog}(\operatorname{opt})$-competitive uniformity tracking\nalgorithm. We obtain this result by leveraging new structural results on\nPoisson mixtures, which we believe to be of independent interest.","[cs.DS, cs.LG]"


In [43]:
# Example abstract from API
abstract = df['abstract'][0]

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarization
summarization_result = summarizer(abstract)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [44]:
summarization_result[0]['summary_text']

'Large Reasoning Models (LRMs) have introduced a new paradigm in AI by enabling models to think before responding. The absence of open and reproducible recipes for building LMMs hinders community-wide research, analysis, and comparison. We present MedVLThinker, a suite of simple yet strong baselines.'