# Load Library

In [None]:
!pip install datasets
!pip install transformers[sentencepiece]
!pip install textract
!pip install pdfminer
!pip install huggingface_hub
!pip install keras_nlp
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
import pandas as pd
import re
import string
import tensorflow as tf
import datasets
import numpy as np

# Modelling with distilbart-cnn-12-6

In [None]:
# Fraction of the dataset held out as the test split (passed as `test_size`)
TRAIN_TEST_SPLIT = 0.2

MAX_INPUT_LENGTH = 1024  # Maximum number of tokens kept per input resume (longer inputs are truncated)
MIN_TARGET_LENGTH = 5  # Minimum length of the summary generated by the model
MAX_TARGET_LENGTH = 128  # Maximum length of a target summary (labels are truncated to this)
BATCH_SIZE = 1  # Batch size used for the training and test tf.data pipelines
LEARNING_RATE = 2e-5  # Learning rate passed to the Adam optimizer
MAX_EPOCHS = 5  # Number of epochs passed to model.fit

# This section fine-tunes the distilbart-cnn-12-6 checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "philschmid/tf-distilbart-cnn-12-6"

## Load Data

Dataset : https://huggingface.co/datasets/burberg92/resume_summary

In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("burberg92/resume_summary", split="train")
raw_datasets



Dataset({
    features: ['resume', 'ex_summary'],
    num_rows: 100
})

## Split data

In [None]:
# Fix the seed so the train/test split (and every metric computed on it) is
# reproducible across kernel restarts instead of changing on each run.
raw_datasets = raw_datasets.train_test_split(
    test_size=TRAIN_TEST_SPLIT,
    seed=42,
)

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['resume', 'ex_summary'],
        num_rows: 80
    })
    test: Dataset({
        features: ['resume', 'ex_summary'],
        num_rows: 20
    })
})

## Tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Checkpoints listed here get the "summarize: " task prefix prepended to every
# input; any other checkpoint gets an empty prefix.
prefix = (
    "summarize: "
    if MODEL_CHECKPOINT
    in (
        "t5-small",
        "t5-base",
        "t5-large",
        "t5-3b",
        "t5-11b",
        "gopalkalpande/t5-small-finetuned-bbc-news-summarization",
    )
    else ""
)

In [None]:
import re
def clean_summ(res):
    """Normalize raw resume text into a single whitespace-collapsed line.

    Steps, applied in order:
      1. Drop the literal labels "Resume: " and "Name: ".
      2. Replace " | " separators with a single space.
      3. Remove bullet characters and underscores.
      4. Strip leading/trailing whitespace, then turn newlines into spaces.
      5. Collapse runs of spaces into one space.

    Args:
        res: The raw resume text.

    Returns:
        The cleaned text as a single line.
    """
    # Order matters: each substitution runs on the result of the previous one.
    substitutions = (
        ("Resume: ", ""),
        ("Name: ", ""),
        (" | ", " "),
        ("•", ""),
        ("_", ""),
    )
    for needle, replacement in substitutions:
        res = res.replace(needle, replacement)

    single_line = res.strip().replace('\n', ' ')
    return re.sub(' +', ' ', single_line)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of resumes and their reference summaries.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: A batch with a "resume" column (source texts) and an
            "ex_summary" column (target summaries).

    Returns:
        The tokenizer output for the cleaned, prefixed resumes (truncated to
        MAX_INPUT_LENGTH tokens) with an added "labels" entry holding the token
        ids of the summaries (truncated to MAX_TARGET_LENGTH tokens).
    """
    inputs = [prefix + clean_summ(doc) for doc in examples["resume"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Tokenize the targets via `text_target`, which replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. A separate call is
    # needed because the targets use a different max_length than the inputs.
    labels = tokenizer(
        text_target=examples["ex_summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]



Map:   0%|          | 0/20 [00:00<?, ? examples/s]

## Model

In [None]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at philschmid/tf-distilbart-cnn-12-6.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  305510400 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50264     
 r)                                                              
                                                                 
Total params: 305,560,664
Trainable params: 305,510,400
Non-trainable params: 50,264
_________________________________________________________________


In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Convert the tokenized splits into tf.data pipelines. `data_collator` is
# passed as `collate_fn` to assemble each batch; only the training split is
# shuffled, so the test split keeps its original order.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
test_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Fit Model

In [None]:
import keras
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)

In [None]:
model.fit(
    train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4144232740>

## Model Evaluation

In [None]:
from transformers import pipeline
# Build the summarization pipeline once: it does not depend on the resume being
# summarized, so constructing it inside the loop only repeated setup work.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

# Generate one summary per test resume, in dataset order, so that `pred` lines
# up with raw_datasets['test']['ex_summary'] when the metric is computed.
pred = []
for a, i in enumerate(raw_datasets['test']['resume'], start=1):
  # max_length=45 caps the generated summary at 45 tokens for this evaluation.
  output = summarizer(clean_summ(i), min_length=MIN_TARGET_LENGTH, max_length=45)
  pred.append(output[0]['summary_text'])
  print(a)  # progress counter

1
2
3
4
5
6
7
8
9
10
11
12
13
14


Your max_length is set to 45, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)


15
16
17
18


Your max_length is set to 45, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)


19
20


### Compute Rouge

In [None]:
# Load the ROUGE metric explicitly so this cell does not depend on a
# `rouge_metric` variable left over in kernel state (no other cell defines it).
# The loader from the already-imported `datasets` package returns aggregate
# scores exposing `.mid.fmeasure`, which the next cell reads.
# NOTE(review): `datasets.load_metric` is deprecated in favor of the separate
# `evaluate` package; confirm it exists in the installed datasets version.
rouge_metric = datasets.load_metric("rouge")
result = rouge_metric.compute(predictions=pred, references=raw_datasets['test']['ex_summary'])

In [None]:
{key: value.mid.fmeasure * 100 for key, value in result.items()}

{'rouge1': 76.89955904516358,
 'rouge2': 73.29896070466081,
 'rougeL': 76.04005494797181,
 'rougeLsum': 75.87051973708014}

### Test with Data

In [None]:
import textract
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF file.

    Args:
        file_path: Path to the PDF to read.

    Returns:
        The extracted text as a ``str``: the output of textract's
        ``pdfminer`` method, decoded as UTF-8.
    """
    raw_bytes = textract.process(file_path, method='pdfminer')
    return raw_bytes.decode('utf-8')
text=extract_text_from_pdf('/content/Muhammad Alfian Pratama new resume.pdf')   # Enter the path to the resume here

In [None]:
test = clean_summ(text)

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

summarizer(test,min_length=MIN_TARGET_LENGTH,max_length=MAX_TARGET_LENGTH,)

[{'summary_text': 'Skilled Data Science student with advanced proficiency in Python and R programming languages. Proficient in TensorFlow and Flask, with a particular focus on data-related roles. I am eager to delve deeper into the practical aspects of the field and gain invaluable real-world experience. Holds a Bachelor of Data Science from FTMM Universitas Airlangga Surabaya, Indonesia.'}]

## Upload Model to Hugging Face

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("walkerrose/cv_summarization-distilbart-cnn-16-6")
tokenizer.push_to_hub("walkerrose/cv_summarization-distilbart-cnn-16-6")



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tf_model.h5:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/1.22G [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/walkerrose/cv_summarization-distilbart-cnn-16-6/commit/91173916be5595b5357e653dad24dd129f3571b5', commit_message='Upload tokenizer', commit_description='', oid='91173916be5595b5357e653dad24dd129f3571b5', pr_url=None, pr_revision=None, pr_num=None)

# Modelling with T5

In [None]:
# Fraction of the dataset held out as the test split (passed as `test_size`)
TRAIN_TEST_SPLIT = 0.2

MAX_INPUT_LENGTH = 1024  # Maximum number of tokens kept per input resume (longer inputs are truncated)
MIN_TARGET_LENGTH = 5  # Minimum length of the summary generated by the model
MAX_TARGET_LENGTH = 128  # Maximum length of a target summary (labels are truncated to this)
BATCH_SIZE = 4  # Batch size used for the training, test and generation tf.data pipelines
LEARNING_RATE = 2e-5  # Learning rate passed to the Adam optimizer
MAX_EPOCHS = 10  # Number of epochs passed to model.fit

# This section starts from a t5-small checkpoint already fine-tuned for BBC news
# summarization, taken from the Hugging Face Model Hub
MODEL_CHECKPOINT = "gopalkalpande/t5-small-finetuned-bbc-news-summarization"

## Load Data

In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("burberg92/resume_summary", split="train")
raw_datasets



Dataset({
    features: ['resume', 'ex_summary'],
    num_rows: 100
})

## Split Data

In [None]:
# Fix the seed so the train/test split (and every metric computed on it) is
# reproducible across kernel restarts instead of changing on each run.
raw_datasets = raw_datasets.train_test_split(
    test_size=TRAIN_TEST_SPLIT,
    seed=42,
)

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['resume', 'ex_summary'],
        num_rows: 80
    })
    test: Dataset({
        features: ['resume', 'ex_summary'],
        num_rows: 20
    })
})

## Tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Checkpoints listed here get the "summarize: " task prefix prepended to every
# input; any other checkpoint gets an empty prefix.
prefix = (
    "summarize: "
    if MODEL_CHECKPOINT
    in (
        "t5-small",
        "t5-base",
        "t5-large",
        "t5-3b",
        "t5-11b",
        "gopalkalpande/t5-small-finetuned-bbc-news-summarization",
    )
    else ""
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of resumes and their reference summaries.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: A batch with a "resume" column (source texts) and an
            "ex_summary" column (target summaries).

    Returns:
        The tokenizer output for the cleaned, prefixed resumes (truncated to
        MAX_INPUT_LENGTH tokens) with an added "labels" entry holding the token
        ids of the summaries (truncated to MAX_TARGET_LENGTH tokens).
    """
    inputs = [prefix + clean_summ(doc) for doc in examples["resume"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Tokenize the targets via `text_target`, which replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. A separate call is
    # needed because the targets use a different max_length than the inputs.
    labels = tokenizer(
        text_target=examples["ex_summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]



Map:   0%|          | 0/20 [00:00<?, ? examples/s]

## Model

In [None]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at gopalkalpande/t5-small-finetuned-bbc-news-summarization.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Convert the tokenized splits into tf.data pipelines. `data_collator` is
# passed as `collate_fn` to assemble each batch; only the training split is
# shuffled, so the test split keeps its original order.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
test_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=BATCH_SIZE,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)
# A 10-example subset of the test split, picked by an unseeded shuffle (so the
# chosen examples differ from run to run), then wrapped as a tf.data pipeline.
generation_dataset = (
    tokenized_datasets["test"]
    .shuffle()
    .select(list(range(10)))
    .to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=False,
        collate_fn=data_collator,
    )
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
import keras
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer)

In [None]:
import keras_nlp

rouge_l = keras_nlp.metrics.RougeL()


def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score for a set of generated summaries.

    Args:
        eval_predictions: A ``(predictions, labels)`` pair of token-id
            sequences. Negative label ids are overwritten in place with the
            tokenizer's pad token id before decoding.

    Returns:
        A dict with a single key, "RougeL", holding the "f1_score" entry of
        the ``rouge_l`` metric result.
    """
    generated_ids, reference_ids = eval_predictions
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Negative ids mark masked label positions; swap them for the pad token
    # so the references can be decoded.
    for row in reference_ids:
        row[row < 0] = tokenizer.pad_token_id
    reference_texts = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge_l(reference_texts, generated_texts)
    # Only the F1 score is reported; other aggregates in `scores` are dropped.
    return {"RougeL": scores["f1_score"]}

## Fit Model

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

# For now we will use our test set as our validation_data
model.fit(
    train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS, callbacks=callbacks
)



Epoch 1/10



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2c58a97160>

## Test with Data

In [None]:
test = "Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, browsing with  Microsoft Edge and making purchases at the Xbox Store, the Windows Store and the Microsoft  Store. Plus, user can pick up bonus points for taking daily quizzes and tours on the Microsoft  rewards website. Rewards live dashboards gives a live picture of usage world-wide and by  markets like US, Canada, Australia, new user registration count, top/bottom performing rewards  offers, orders stats and weekly trends of user activities, orders and new user registrations. the  PBI tiles gets refreshed in different frequencies starting from 5 seconds to 30 minutes.  
Technology/Tools used    EDUCATION  Indian Institute of Technology – Mumbai  2001    SKILLS  Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; peers  • Supervised junior developers throughout project lifecycle and provided technical assistance"

In [None]:
test = "summarize: "+ clean_summ(test)

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

summarizer(test,min_length=MIN_TARGET_LENGTH,max_length=MAX_TARGET_LENGTH,)

[{'summary_text': 'Experience in data handling, design, and development Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to data warehousing and business intelligence Database: Experience in database designing, scalability, back-up and recovery, writing and optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes. Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure, Stream Analytics, Event hub, Power BI,'}]

## Upload to Hugging Face

In [None]:
model.push_to_hub("walkerrose/cv_summarization-t5-small")
tokenizer.push_to_hub("walkerrose/cv_summarization-t5-small")



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tf_model.h5:   0%|          | 0.00/374M [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/374M [00:00<?, ?B/s]



spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/walkerrose/cv_summarization-t5-small/commit/04c5586b72b082e11be95287e11095484f0cac6b', commit_message='Upload tokenizer', commit_description='', oid='04c5586b72b082e11be95287e11095484f0cac6b', pr_url=None, pr_revision=None, pr_num=None)

# Summary

Summary Model:
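- distilbart-cnn-16-6 (fine-tuned from `philschmid/tf-distilbart-cnn-12-6`) performs well, with RougeL = 76.04, but requires expensive compute resources.
- t5-small-finetuned-bbc-news-summarization is cheap to run but does not perform well enough for this summarization task, with RougeL = 0.3339. Note that this score appears to be an F1 value on a 0–1 scale (about 33.4 on the 0–100 scale used for distilbart above), and it was computed with a different ROUGE implementation on a 10-example subset of the test split, so the two numbers are only roughly comparable.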

We use distilbart-cnn-16-6 for production