In [1]:
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers
!pip install -U huggingface_hub

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.16.0-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-2.1.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from dat

In [2]:
from datasets import load_dataset

dataset = load_dataset("ilsilfverskiold/tech-keywords-topics-summary")
dataset

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 888/888 [00:00<00:00, 3.38kB/s]
Downloading data: 100%|██████████| 2.06M/2.06M [00:01<00:00, 1.47MB/s]
Downloading data: 100%|██████████| 186k/186k [00:00<00:00, 237kB/s]
Downloading data: 100%|██████████| 190k/190k [00:00<00:00, 283kB/s]
Generating train split: 100%|██████████| 7196/7196 [00:00<00:00, 410798.83 examples/s]
Generating validation split: 100%|██████████| 635/635 [00:00<00:00, 281095.84 examples/s]
Generating test split: 100%|██████████| 635/635 [00:00<00:00, 350214.73 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__'],
        num_rows: 7196
    })
    validation: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__'],
        num_rows: 635
    })
    test: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__'],
        num_rows: 635
    })
})

In [3]:
def show_samples(dataset, num_samples=3, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Text: {example['text']}'")
        print(f"'>> Keywords: {example['keywords']}'")


show_samples(dataset)


'>> Text: Driverless car users will not be prosecuted for fatal crashes in UK'
'>> Keywords: Driverless Cars, Legal Issues, UK'

'>> Text: Google is embedding inaudible watermarks right into its AI generated music -'
'>> Keywords: Google, AI Music, Watermarks, Audio Technology'

'>> Text: What are your thoughts on Nextjs performance? Do you agree with this chart? - ( by 10up where Nextjs appears lower than WordPress on core vitals. Couldn’t post the image here due to community rules. But appreciate any other studies and thought you have on this matter.'
'>> Keywords: Next.js, Performance, 10up, WordPress'


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = 'facebook/bart-large' # go smaller if you can
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

texts = dataset['train']['text']

# Tokenize all texts and find the maximum length (max for BART is 1024 tokens)
max_token_length = max(len(tokenizer.encode(text, truncation=True)) for text in texts)
print(f"The longest text is {max_token_length} tokens long.")



The longest text is 245 tokens long.


In [5]:
def get_feature(batch):
  encodings = tokenizer(batch['text'], text_target=batch['keywords'],
                        max_length=1024, truncation=True)

  encodings = {'input_ids': encodings['input_ids'],
               'attention_mask': encodings['attention_mask'],
               'labels': encodings['labels']}

  return encodings

dataset_pt = dataset.map(get_feature, batched=True)
dataset_pt

Map: 100%|██████████| 7196/7196 [00:00<00:00, 25095.57 examples/s]
Map: 100%|██████████| 635/635 [00:00<00:00, 22322.28 examples/s]
Map: 100%|██████████| 635/635 [00:00<00:00, 22430.94 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7196
    })
    validation: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 635
    })
    test: Dataset({
        features: ['id', 'source', 'text', 'timestamp', 'reactions', 'engagement', 'url', 'text_length', 'keywords', 'topic', 'summary', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 635
    })
})

In [6]:
columns = ['input_ids', 'labels', 'attention_mask']
dataset_pt.set_format(type='torch', columns=columns)

In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = 'bart_tech_keywords', # rename to what you want it to be called
    num_train_epochs=3, # your choice
    warmup_steps = 500,
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=4,
    weight_decay = 0.01,
    logging_steps = 10,
    evaluation_strategy = 'steps',
    eval_steps=50, 
    save_steps=1e6,
    gradient_accumulation_steps=16 
)

trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, data_collator=data_collator,
                  train_dataset = dataset_pt['train'], eval_dataset = dataset_pt['validation'])

trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  3%|▎         | 10/336 [01:20<37:17,  6.86s/it] 

{'loss': 3.6513, 'grad_norm': 40.85301971435547, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.09}


  6%|▌         | 20/336 [02:41<45:16,  8.60s/it]

{'loss': 3.1626, 'grad_norm': 31.181283950805664, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.18}


  9%|▉         | 30/336 [03:56<35:56,  7.05s/it]

{'loss': 2.3209, 'grad_norm': 20.88770866394043, 'learning_rate': 3e-06, 'epoch': 0.27}


 12%|█▏        | 40/336 [05:01<31:08,  6.31s/it]

{'loss': 1.7205, 'grad_norm': 10.746589660644531, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.36}


 15%|█▍        | 50/336 [06:26<45:20,  9.51s/it]

{'loss': 1.4049, 'grad_norm': 10.60523509979248, 'learning_rate': 5e-06, 'epoch': 0.44}


                                                
 15%|█▍        | 50/336 [07:27<45:20,  9.51s/it] 

{'eval_loss': 1.1728525161743164, 'eval_runtime': 60.2591, 'eval_samples_per_second': 10.538, 'eval_steps_per_second': 2.639, 'epoch': 0.44}


 18%|█▊        | 60/336 [09:01<45:50,  9.97s/it]  

{'loss': 1.3121, 'grad_norm': 7.623536586761475, 'learning_rate': 6e-06, 'epoch': 0.53}


 21%|██        | 70/336 [10:34<40:46,  9.20s/it]

{'loss': 1.2611, 'grad_norm': 10.073939323425293, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.62}


 24%|██▍       | 80/336 [12:34<1:11:13, 16.70s/it]

{'loss': 1.2297, 'grad_norm': 8.070141792297363, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.71}


 27%|██▋       | 90/336 [36:51<17:12:55, 251.93s/it]

{'loss': 1.1682, 'grad_norm': 13.98412036895752, 'learning_rate': 9e-06, 'epoch': 0.8}


 30%|██▉       | 100/336 [1:26:10<6:15:45, 95.53s/it] 

{'loss': 1.1023, 'grad_norm': 8.362699508666992, 'learning_rate': 1e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 100/336 [1:27:04<6:15:45, 95.53s/it]

{'eval_loss': 1.0794211626052856, 'eval_runtime': 54.5605, 'eval_samples_per_second': 11.638, 'eval_steps_per_second': 2.914, 'epoch': 0.89}


 33%|███▎      | 110/336 [1:28:45<47:32, 12.62s/it]  

{'loss': 1.0717, 'grad_norm': 13.899110794067383, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.98}


 36%|███▌      | 120/336 [1:30:14<31:15,  8.68s/it]

{'loss': 1.0577, 'grad_norm': 6.83284854888916, 'learning_rate': 1.2e-05, 'epoch': 1.07}


 39%|███▊      | 130/336 [1:31:33<27:28,  8.00s/it]

{'loss': 1.0065, 'grad_norm': 12.647035598754883, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.16}


 42%|████▏     | 140/336 [1:32:52<25:47,  7.90s/it]

{'loss': 1.0244, 'grad_norm': 7.274231910705566, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.25}


 45%|████▍     | 150/336 [1:34:43<40:22, 13.02s/it]

{'loss': 0.9659, 'grad_norm': 11.281682014465332, 'learning_rate': 1.5e-05, 'epoch': 1.33}


                                                   
 45%|████▍     | 150/336 [1:36:01<40:22, 13.02s/it]

{'eval_loss': 0.9492010474205017, 'eval_runtime': 78.0204, 'eval_samples_per_second': 8.139, 'eval_steps_per_second': 2.038, 'epoch': 1.33}


 48%|████▊     | 160/336 [1:37:43<32:13, 10.99s/it]  

{'loss': 0.9744, 'grad_norm': 9.96319580078125, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.42}


 51%|█████     | 170/336 [1:39:05<23:31,  8.50s/it]

{'loss': 0.9483, 'grad_norm': 10.527077674865723, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.51}


 54%|█████▎    | 180/336 [1:40:30<22:40,  8.72s/it]

{'loss': 0.9721, 'grad_norm': 6.891709804534912, 'learning_rate': 1.8e-05, 'epoch': 1.6}


 57%|█████▋    | 190/336 [1:41:53<20:05,  8.26s/it]

{'loss': 0.9142, 'grad_norm': 10.293627738952637, 'learning_rate': 1.9e-05, 'epoch': 1.69}


 60%|█████▉    | 200/336 [1:43:16<18:33,  8.19s/it]

{'loss': 0.9226, 'grad_norm': 12.827656745910645, 'learning_rate': 2e-05, 'epoch': 1.78}


                                                   
 60%|█████▉    | 200/336 [1:44:13<18:33,  8.19s/it]

{'eval_loss': 0.8965096473693848, 'eval_runtime': 57.0921, 'eval_samples_per_second': 11.122, 'eval_steps_per_second': 2.785, 'epoch': 1.78}


 62%|██████▎   | 210/336 [1:45:33<18:04,  8.61s/it]

{'loss': 0.9299, 'grad_norm': 8.831805229187012, 'learning_rate': 2.1e-05, 'epoch': 1.87}


 65%|██████▌   | 220/336 [1:46:57<16:35,  8.58s/it]

{'loss': 0.9575, 'grad_norm': 14.106292724609375, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.96}


 68%|██████▊   | 230/336 [1:48:24<15:15,  8.63s/it]

{'loss': 0.8384, 'grad_norm': 6.936272144317627, 'learning_rate': 2.3000000000000003e-05, 'epoch': 2.05}


 71%|███████▏  | 240/336 [1:49:48<13:53,  8.68s/it]

{'loss': 0.8157, 'grad_norm': 10.196778297424316, 'learning_rate': 2.4e-05, 'epoch': 2.13}


 74%|███████▍  | 250/336 [1:51:12<12:10,  8.50s/it]

{'loss': 0.8488, 'grad_norm': 7.5270676612854, 'learning_rate': 2.5e-05, 'epoch': 2.22}


                                                   
 74%|███████▍  | 250/336 [1:52:13<12:10,  8.50s/it]

{'eval_loss': 0.8998113870620728, 'eval_runtime': 60.4211, 'eval_samples_per_second': 10.51, 'eval_steps_per_second': 2.632, 'epoch': 2.22}


 77%|███████▋  | 260/336 [1:53:39<11:53,  9.39s/it]

{'loss': 0.8173, 'grad_norm': 8.18798828125, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.31}


 80%|████████  | 270/336 [1:55:06<09:08,  8.31s/it]

{'loss': 0.8219, 'grad_norm': 7.067136287689209, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.4}


 83%|████████▎ | 280/336 [1:56:35<08:25,  9.03s/it]

{'loss': 0.8307, 'grad_norm': 6.97703218460083, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.49}


 86%|████████▋ | 290/336 [1:58:04<06:49,  8.90s/it]

{'loss': 0.8388, 'grad_norm': 9.943134307861328, 'learning_rate': 2.9e-05, 'epoch': 2.58}


 89%|████████▉ | 300/336 [1:59:34<05:28,  9.12s/it]

{'loss': 0.8581, 'grad_norm': 6.239572525024414, 'learning_rate': 3e-05, 'epoch': 2.67}


                                                   
 89%|████████▉ | 300/336 [2:00:37<05:28,  9.12s/it]

{'eval_loss': 0.8912317156791687, 'eval_runtime': 62.9613, 'eval_samples_per_second': 10.086, 'eval_steps_per_second': 2.525, 'epoch': 2.67}


 92%|█████████▏| 310/336 [2:02:21<05:09, 11.91s/it]

{'loss': 0.8445, 'grad_norm': 10.027575492858887, 'learning_rate': 3.1e-05, 'epoch': 2.76}


 95%|█████████▌| 320/336 [2:11:29<19:59, 74.94s/it] 

{'loss': 0.817, 'grad_norm': 4.917409420013428, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.85}


 98%|█████████▊| 330/336 [2:13:27<01:21, 13.51s/it]

{'loss': 0.8061, 'grad_norm': 10.157153129577637, 'learning_rate': 3.3e-05, 'epoch': 2.93}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
100%|██████████| 336/336 [2:14:45<00:00, 24.07s/it]

{'train_runtime': 8085.979, 'train_samples_per_second': 2.67, 'train_steps_per_second': 0.042, 'train_loss': 1.1810229207788194, 'epoch': 2.99}





TrainOutput(global_step=336, training_loss=1.1810229207788194, metrics={'train_runtime': 8085.979, 'train_samples_per_second': 2.67, 'train_steps_per_second': 0.042, 'total_flos': 3582300094562304.0, 'train_loss': 1.1810229207788194, 'epoch': 2.9883268482490273})

In [16]:
trainer.save_model('tech-keywords-extractor_plp')


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [31]:
# !huggingface-cli login --token hf_xxxxxxxxxxxx

# check output of fine tuned model
from transformers import pipeline

pipe = pipeline('summarization', model='tech-keywords-extractor_plp')

text_test = "Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT"

# test_text=dataset['test'][0]['text']
# keywords = dataset['test'][0]['keywords']
print("the text: ", text_test)
print("generated keywords: ", pipe(text_test))
# print("orginal keywords : ",keywords)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 128, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


the text:  Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT
generated keywords:  [{'summary_text': 'Apple, iphone 16, Apple Intelligence, OpenAI, ChatGPT'}]


In [29]:
# Check back on output of base untrained model

pipe = pipeline('summarization', model = 'facebook/bart-large')

text_test = "Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT. "

# test_text=dataset['test'][0]['text']
# keywords = dataset['test'][0]['keywords']
print("the text: ", text_test)
print("generated keywords: ", pipe(text_test))
# print("orginal keywords : ",keywords)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 128, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


the text:  Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT. 
generated keywords:  [{'summary_text': 'Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT. '}]


In [30]:
# Check back on output of base model fine tuned on CNN daily news

pipe = pipeline('summarization', model = 'facebook/bart-large-cnn')

text_test = "Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT. "

# test_text=dataset['test'][0]['text']
# keywords = dataset['test'][0]['keywords']
print("the text: ", text_test)
print("generated keywords: ", pipe(text_test))
# print("orginal keywords : ",keywords)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 142, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


the text:  Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT. 
generated keywords:  [{'summary_text': "Apple is launching iphone 16 with Apple Intelligence. It also uses OpenAI ChatGPT. Apple is launching Apple Intelligence with Apple Artificial Intelligence on the iPhone 16. Apple  also using OpenAI chatGPT on the phone. Apple's new iPhone 16 will be released on September 18."}]


In [20]:
for i in range(0, 50):
    text_test = dataset['test'][i]['text']
    keywords = dataset['test'][i]['keywords']
    print("text: ", text_test)
    print("generated keywords: ", pipe(text_test)[0]['summary_text'])
    print("original keywords: ", keywords)

Your max_length is set to 128, but your input_length is only 74. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


text:  Unleash the Edge: Women’s Faux Leather Zip-Up Motorcycle Jacket — Vintage Grunge Aesthetic Revived - Ride the Rebel Wave: Women’s Faux Leather Zip-Up Motorcycle Jacket — A Fusion of Timeless Edge and Vintage Grunge Aesthetic for…


Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


generated keywords:  Women's Faux Leather Zip-Up Motorcycle Jacket, Vintage Grunge Aesthetic
original keywords:  Women's Fashion, Faux Leather, Motorcycle Jacket, Vintage Grunge
text:  Distractions, analytical thinking and falling for fake news


Your max_length is set to 128, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


generated keywords:  Distractions, analytical thinking, Fake News
original keywords:  Distractions, Analytical Thinking, Fake News
text:  If you buy a Cybertruck, Tesla says you can't sell it for a year


Your max_length is set to 128, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


generated keywords:  Cybertruck, Tesla, Auto Sales
original keywords:  Cybertruck, Tesla, Resale Restriction
text:  ODROID-M1S is a $49 single-board PC with RK3566, 64GB eMMC and an M.2 2280 slot


Your max_length is set to 128, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


generated keywords:  ODROID-M1S, Single-board PC, RK3566, eMMC, M.2 2280
original keywords:  ODROID-M1S, RK3566, Single-Board PC, M.2 Slot
text:  ChatGPT-Admin-Web - One-stop system for shared use of AI within teams and organizationswith |  - English ChatGPT Admin Web CAW GitHub Sponsor / Features V3 V2 + topics: ai-system, chatgpt, gpt-4, llm, llvm, newbing, nextjs, postgresql, prisma, prompt, user-management, webui


Your max_length is set to 128, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


generated keywords:  ChatGPT, Admin Web, GitHub, AI, Teams, Organizations
original keywords:  ChatGPT-Admin-Web, AI, ChatGPT, GPT-4, LLM, LLVM, Newbing, Next.js, PostgreSQL, Prisma, User Management, WebUI
text:  Microsoft Azure CTO Headhunted for SDE II Position at Amazontwitter.com/markrussinovich
generated keywords:  Microsoft Azure, CTO, SDE II, Amazon
original keywords:  Microsoft Azure, Microsoft, Azure, CTO, Amazon, Mark Russinovich
text:  Self-Hosting-Guide - Self-Hosting Guide. Learn all about locally hosting  and managing software applications by yourself or your organization. Including Cloud, LLMs, WireGuard, Automation, Home Assistant, and Networking. - Self Hosting Guide A guide for getting started with Self Hosting devices including software and hardware that will make you a better and more efficient Self Hosting. Note: You can easily convert this markdown file to a PDF in VSCode using this handy extension Markdown PDF . Note 2: This guide will constantly be updated with n

Your max_length is set to 128, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


generated keywords:  Self Hosting, Cloud, LLMs, WireGuard, Automation, Home Assistant, Home Automation
original keywords:  Self Hosting, Cloud, LLMs, WireGuard, Automation, Home Assistant, Networking, Authentication, Docker, Docker Compose, Linux, OAuth, Observability, Open Source, Privacy, Raspberry Pi, Reverse Proxy, Search, SSH
text:  Pendulation is normal - Think of a pendulum. When you push it in one direction, it swings back with opposite force.


Your max_length is set to 128, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


generated keywords:  Pendulum, Normal Motion, Physics
original keywords:  Pendulation, Pendulum
text:  ChatGPT Voice for Medium Article Writers and Content Creators - Unleash the Conversation with AI


Your max_length is set to 128, but your input_length is only 62. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)


generated keywords:  ChatGPT Voice, Medium, Article Writers, Content Creators, AI
original keywords:  ChatGPT, Voice, Medium, Content Creation
text:  CookieCloud - CookieCloudCookieCookieLocal storage - CookieCloudCookieCloudCookieCookieLocal storagelocal storageTelegram | Telegram Breaking Change local storage 0.1.5+ cookie local storagecookie{ cookie_data, local_storage_data } remote local


Your max_length is set to 128, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


generated keywords:  CookieCloud, Telegram, Local Storage, Storagelocal
original keywords:  CookieCloud, Local Storage, Telegram, Breaking Change
text:  Essential Resources to Learn App Development for Apple Vision Pro -


Your max_length is set to 128, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


generated keywords:  App Development, Apple Vision Pro, Apple
original keywords:  App Development, Apple Vision Pro
text:  developer-roadmap - Interactive roadmaps, guides and other educational content to help developers grow in their careers. - roadmap.sh Community driven roadmaps, articles and resources for developers Roadmaps are now interactive, you can click the nodes to read more about the topics.View all Roadmaps Best Practices QuestionsHere is the list of available roadmaps with more being actively worked upon.Frontend Roadmap / Frontend Beginner RoadmapBackend RoadmapDevOps Roadmap / DevOps Beginner RoadmapFull Stack RoadmapComputer Scie


Your max_length is set to 128, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


generated keywords:  developer-roadmap, Roadmaps, Guides, Educational Content, DevOps
original keywords:  developer-roadmap, Developer Roadmap, Interactive Roadmaps, Frontend Roadmap, Backend Roadmap, DevOps Roadmap, Full Stack Roadmap, Computer Science, Road Map
text:  Leaflet - JavaScript library for mobile-friendly interactive maps - Leaflet was created 11 years ago by Volodymyr Agafonkin, a Ukrainian citizen living in Kyiv.Russian bombs are now falling over Volodymyr's hometown. His family, his friends, his neighbours, thousands and thousands of absolutely wonderful people, are either seeking refuge or fighting for their lives.Russian soldiers have already killed tens of thousands of civilians, including women and children, and are committing mass war crimes like gang rapes, executions, looting, and targeted bombings of c


Your max_length is set to 128, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)


generated keywords:  Leaflet, JavaScript, Mobile-Friendly, Interactive Maps, Ukrainian Citizen, Ukraine, Russian Bombs
original keywords:  Leaflet, JavaScript, Interactive Maps, Volodymyr Agafonkin, Ukraine, Kyiv
text:  We Are In A Simulation | The Game Of Life Explained - “You are either the main character, a supporting actor, or an extra. Which one are you?”


Your max_length is set to 128, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


generated keywords:  Simulation, Game Of Life, Supporting Actor, Extra
original keywords:  Simulation Theory, Game of Life
text:  OpenAI Dev Day lo más relevante - El 6 de Noviembre se llevó a cabo el primer OpenAI Dev Day en San Francisco, y como era de esperarse, se presentaron grandes sorpresas.


Your max_length is set to 128, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


generated keywords:  , OpenAI Dev Day, San Francisco
original keywords:  OpenAI, DevDay, San Francisco
text:  Top 10 Programming Memes Every Developer Can Relate To - If you want to receive an exclusive programming meme every Monday to your inbox, sign up for my newsletter


Your max_length is set to 128, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)


generated keywords:  Programming Memes, Developer, Newsletter
original keywords:  Programming Memes, Developer, Newsletter
text:  Is sessionStorage useful in React applications? - Why would you ever save data to sessionStorage when you could just store it in the React application in the form of e.g. globalState ? Any benefits to storing data in sessionStorage in React?
generated keywords:  SessionStorage, React, Data Storage, GlobalState
original keywords:  sessionStorage, React, Global State
text:  Worth suing co-founders? - Took an equity position as technical co-founder plus cash incentive and they don’t want to pay now. Contract is a bit of mess on top of it. I’m owed multiple 6 figures for promised sweat equity, but the company isn’t generating what anyone thought it would. There was never a contingency based on success of the company or revenue generated. My choices are basically sue, or try to dissolve company and negotiate to keep all rights to all digital assets, trademarks, d

Your max_length is set to 128, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


generated keywords:  Technical Co-founder, Lawsuit, Digital Assets, Trademarks, Domain, IP, Legal Issues
original keywords:  Equity, Technical Co-founder, founder, Contract, Sue, Intellectual Property, IP, Trademarks, Domain
text:  No. Americans Runners Have NOT Never Been Slower - A Critique of Run Repeat’s 2017 Mega Study on Marathons


Your max_length is set to 128, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


generated keywords:  Americans Runners, Marathons, Run Repeat, Study
original keywords:  American Runners, Run Repeat, Marathons, Study Critique
text:  spectest - API testing library for Go that generate E2E test result document in markdown - Generate document from end-to-end tests In recent years, Domain-Driven Design  has become mainstream, making it easier to implement unit tests for methods in each layer. However, implementing end-to-end  tests can be quite labor-intensive. To achieve a return on the effort invested, I am d... + tags: showdev, go, testing, api


Your max_length is set to 128, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)


generated keywords:  spectest, API Testing, Go, E2E, Markdown, Domain-Driven Design
original keywords:  spectest, API Testing, Go, E2E Tests, Markdown, Domain-Driven Design
text:  Data Cleansing & Manipulation - Data cleaning or Data cleansing and manipulation is a crucial step in a data project that involves identifying and correcting errors or…


Your max_length is set to 128, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


generated keywords:  Data Cleansing, Data Manipulation, Data Project
original keywords:  Data Cleansing, Data Manipulation, Data Science
text:  State Management with Nested Signals  - Experimental Angular state management with nested signals


Your max_length is set to 128, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


generated keywords:  State Management, Angular, Nested Signals
original keywords:  State Management, Angular, Nested Signals
text:  Submit Black Friday Deal - Hey SaaS Founders, We're curating the ultimate list of top SaaS deals in our SaaS community, and we want your startup to be featured. Got an incredible Black Friday offer of your SaaS startup? Share it with us through the form below, we will include your SaaS offer to our list: 


Your max_length is set to 128, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


generated keywords:  Black Friday, SaaS, Startup Offer
original keywords:  Black Friday, SaaS, Startup, Community, Deals
text:  Harnessing the Power of Behavioural Data in the Insurance Industry - Companies are progressively employing behavioral data analytics within the insurance sector to acquire more profound insights into customer…


Your max_length is set to 128, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


generated keywords:  Behavioural Data, Insurance Industry, Analytics
original keywords:  Behavioural Data, Insurance Industry
text:  Understanding Feature Importance in Machine Learning: Beyond the Numbers - In the realm of machine learning, the concept of feature importance plays a crucial role in understanding how models make predictions…


Your max_length is set to 128, but your input_length is only 102. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


generated keywords:  Feature Importance, Machine Learning, Predictions
original keywords:  Feature Importance, Machine Learning
text:  Python Kullanarak Kitap Öneri Sistemi Oluşturmanın Temel Adımları (TFIDF ve Cosine Similarty… - Özet:  Kitap okuma deneyimini kişiselleştirmek için Python kullanarak nasıl bir kitap öneri sistemi oluşturabileceğinizi adım adım…


Your max_length is set to 128, but your input_length is only 85. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


generated keywords:  Python, Temel Adımlar, TFIDF, Cosine Similarty
original keywords:  Python, Recommendation System, Book Recommendation, TFIDF, Cosine Similarity
text:  Advise for moving forward - It should be noted that I’m not a programmer. The extent of my programming has been manipulating calculation functions in already existing code. In other words, no experience. I work for a company that develops scanners and optimizers for lumber industry. With this comes the user interface to observe solutions based on data that the scanner sees and the parameter set in the optimizer. My company


Your max_length is set to 128, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)


generated keywords:  Programming, Optimizers, Lumber Industry, User Interface
original keywords:  Programming, Lumber Industry, User Interface, Scanners, Optimizers
text:  JavaScript Project with Zero JavaScript Knowledge?  - Can’t be that hard? Same as Java without the Script…


Your max_length is set to 128, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)


generated keywords:  JavaScript, Java, Project Management, Knowledge
original keywords:  JavaScript, Java
text:  Top 20 Front-End Interview Questions You Must Know - In this article, you’ll uncover a handpicked collection of some of the most prominent questions you might encounter in a frontend web…


Your max_length is set to 128, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


generated keywords:  Front-End Interview Questions, Web Development, Interview Questions
original keywords:  Front-End, Interview Questions
text:  Show HN: A custom GPT conversational language tutor that generates Anki cards


Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


generated keywords:  GPT, Conversational Language Tutor, Anki Cards
original keywords:  GPT, Anki, Conversational Language Tutor
text:  DEM: Open-source containerized Development Environment Manager


Your max_length is set to 128, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


generated keywords:  DEM, Containerized Development Environment Manager, Open Source
original keywords:  DEM, Open-source, Containerization, Development Environment Manager
text:  Exploring the Power and Simplicity of the Go Programming Language - Diving into Go’s Features, from Concurrency to Web Development, with Code Examples and Best Practices


Your max_length is set to 128, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


generated keywords:  Go, Programming Language, Concurrency, Web Development
original keywords:  Go Programming Language, Concurrency, Web Development
text:  Bytes: A fast, lightweight and customizable text editor


Your max_length is set to 128, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


generated keywords:  Bytes, Text Editor, Lightweight, Customizable
original keywords:  Bytes, Text Editor
text:  Why we're building Scanner: data lake search must be fastscanner.dev


Your max_length is set to 128, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


generated keywords:  Scanner, Data Lake Search, Fast Search
original keywords:  Scanner, Data Lake, Search Performance, Performance
text:  In-depth Understanding to Optimize the Performance of Artificial Neural Network - Hyper-parameters tuning for deep learning techniques


Your max_length is set to 128, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


generated keywords:  Artificial Neural Network, Deep Learning, Hyperparameters Tuning
original keywords:  Artificial Neural Network, Hyper-parameters, Deep Learning, Performance Optimization
text:  Deep Dive into Continuous Distributions and Bayesian Inference - In the realms of statistics and probability theory, continuous distributions are pivotal for modeling and understanding various types of…


Your max_length is set to 128, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


generated keywords:  Continuous Distributions, Bayesian Inference, Statistics, Probability Theory
original keywords:  Continuous Distributions, Bayesian Inference, Statistics, Probability Theory
text:  How To Setup A Cloudflare Tunnel And Expose Your Local Service Or Application. -


Your max_length is set to 128, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)


generated keywords:  Cloudflare, Tunnel, Local Service, Application
original keywords:  Cloudflare Tunnel, Cloudflare, Local Service, Application Exposure
text:  Joby Performs First EVTOL Test Flights in New York


Your max_length is set to 128, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)


generated keywords:  Joby, EVTOL, Test Flights, New York
original keywords:  Joby, EVTOL, Test Flights, New York
text:  Find Your AI Solutions at the ODSC West AI Expo - At the AI Expo and Demo Hall as part of ODSC West in a few weeks, you’ll have the opportunity to meet one-on-one with representatives from…


Your max_length is set to 128, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


generated keywords:  AI Solutions, ODSC West, AI Expo, Demo Hall
original keywords:  AI Solutions, ODSC West, AI Expo
text:  Python Fundamentals you should have a good understanding on as a beginner coder - From Vanilla JavaScript, React.js to Python and SQL in Just 6 Weeks


Your max_length is set to 128, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)


generated keywords:  Python, JavaScript, React.js, SQL, Beginner Coding
original keywords:  Python, JavaScript, React.js, SQL, Coding Fundamentals
text:  How to write a program that shuts off pc after a few seconds of inactivity - I have never written a program, but I would like to make something which allows me to monitor either a program's activity or keyboard activity and shuts off my pc or a program after a set amount of time . The motivation is that I want to sometimes pressure myself into writing quickly, and something like this would force me to write my thoughts out on t


Your max_length is set to 128, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)


generated keywords:  , PC Shutoff, Keyboard Activity, Software Monitoring
original keywords:  Program, PC Shutdown, Inactivity, Writing Productivity
text:  The Foundation Models Reshaping Computer Vision - Learn about the Foundation Models — for object classification, object detection, and segmentation —  that are redefining Computer Vision.


Your max_length is set to 128, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


generated keywords:  Foundation Models, Computer Vision, Object Classification, Object Detection, Segmentation
original keywords:  Foundation Models, Computer Vision, Object Classification, Object Detection, Segmentation
text:  Unable to SSH into AWS hosted Ubuntu server. What am I missing? - I created a VPC, a subnet within it, attached an internet gateway, created a network interface, attached an elastic IP to the network interface and stood up the sever within the subnet in said VPC. The server is the only infrastructure inside of the VPC. I am still not able to connect to the server using SSH. Would I need to attach the elastic IP to the server to make this happen? Or add a load balancer to the VPC that targets traffic to the server?


Your max_length is set to 128, but your input_length is only 80. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


generated keywords:  AWS, Ubuntu, SSH, Elastic IP, Load Balancer
original keywords:  SSH, AWS, Ubuntu, VPC, Elastic IP, Network Interface
text:  Stop using Lambda Layers  - Lambda layers are a special packaging mechanism provided by AWS Lambda to manage dependencies for zip-based Lambda functions. Layers themselves are nothing more than a sparkling zip file, but they have a few interesting properties which prove useful in some cases. Unfortunately Lambda layers are als... + tags: aws, lambda, webdev, programming


Your max_length is set to 128, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


generated keywords:  Lambda Layers, AWS, Lambda, Web Development, Programming
original keywords:  AWS, Lambda, Lambda Layers, Layers, Dependencies, Packaging
text:  Satiresoft: Why aren’t we doing AI? - This week at Satiresoft…


Your max_length is set to 128, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)


generated keywords:  Satiresoft, AI, AI Development
original keywords:  Satiresoft, AI
text:  At the Intersection of LLMs and Kernels - Research Roundup -


Your max_length is set to 128, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)


generated keywords:  LLMs, Kernels, Research Roundup
original keywords:  LLMs, Kernels, Research
text:  Alan Wake 2 is an unexpected visual marvel even on older GPUsxfire.com
generated keywords:  Alan Wake 2, GPUs, Visual Wonders
original keywords:  Alan Wake 2, GPUs, Visual Marvel
text:  I am worried with how AI is improving propaganda - If you’ve been following the war in the Middle East on Twitter, you’ve probably noticed how difficult it’s been to distinguish what is real and what isn’t. Without getting into the gruesome details, a few days ago there was an image going around of a deceased child and there was heavy discussion into whether it was ai generated or real. There are some ai identifying tools but they appear to be very unreliable at the moment, so we don’t have a concrete way to make this distinction. I don’t know what the final findings were for that particular image, but either way it exemplifies how AI can be used to make dangerous propaganda. Apparently Google is deve

Your max_length is set to 128, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


generated keywords:  AI, Propaganda, Google, Digital Watermark, Legislation
original keywords:  AI, Propaganda, Digital Watermark, Google, Legislation
text:  Frank & FuelStat Launch - This month the Fuelet Wallet team has introduced two analytic tools for the Fuel Network users. Those are:


Your max_length is set to 128, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)


generated keywords:  Frank, FuelStat, Fuelet Wallet, Fuel Network
original keywords:  Frank, FuelStat, Fuelet Wallet, Fuel Network, Analytics Tools
text:  Big Tech on Defensive as Copyright Threat Looms Over Generative AI. - Photo by Xu Haiwei on Unsplash
generated keywords:  Big Tech, Copyright Threat, Generative AI, Xu Haiwei
original keywords:  Big Tech, Copyright, Generative AI, Xu Haiwei


In [24]:
# you would replace your own name here
# you do not need to create a repository beforehand
trainer.push_to_hub("wbcmthh42/tech-keywords-extractor_plp")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]
[A
training_args.bin: 100%|██████████| 5.18k/5.18k [00:00<00:00, 18.3kB/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


CommitInfo(commit_url='https://huggingface.co/wbcmthh42/bart_tech_keywords/commit/aaa476e530d52c5ef3c58dd4029c1cdf966f6d6c', commit_message='wbcmthh42/tech-keywords-extractor_plp', commit_description='', oid='aaa476e530d52c5ef3c58dd4029c1cdf966f6d6c', pr_url=None, pr_revision=None, pr_num=None)