# Load Library

In [1]:
import pandas as pd
import re
import string
import tensorflow as tf
import datasets

import textract
import pdfminer
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# Data Preprocessing

## Load Data

Dataset : https://huggingface.co/datasets/burberg92/resume_summary 

In [2]:
from datasets import load_dataset

dataset = load_dataset("burberg92/resume_summary", split="train")
dataset

Downloading readme:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to C:/Users/Alfian/.cache/huggingface/datasets/burberg92___parquet/burberg92--resume_summary-c0da54b913be772e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/36.8k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/Alfian/.cache/huggingface/datasets/burberg92___parquet/burberg92--resume_summary-c0da54b913be772e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Dataset({
    features: ['resume', 'ex_summary'],
    num_rows: 100
})

In [3]:
dataset['resume']

["Resume: Laura Anderson | Product Manager\nExperience:\nProduct lifecycle management, market research, and roadmap development (5 years)\nAgile methodologies and collaboration with cross-functional teams\nEducation:\nBachelor's degree in Business Administration, NOP University",
 "Resume: Steven Thompson | Operations Manager\nExperience:\nProcess improvement, cost reduction, and team management (8 years)\nSix Sigma Green Belt certified\nEducation:\nMaster's degree in Business Administration, QRS University",
 "Linda Harris | Event Planner\nExperience:\nEvent coordination, budget management, and vendor negotiations (6 years)\nExperience in planning corporate events, conferences, and weddings\nEducation:\nBachelor's degree in Hospitality Management, TUV University",
 "Michael Clark | Customer Service Representative\nExperience:\nHandling customer inquiries, problem resolution, and upselling (4 years)\nExcellent communication and interpersonal skills\nEducation:\nAssociate's degree in Bu

## Data Cleaning

In [4]:
def clean_summ(res):
  res = res.replace("Resume: ","")
  res = res.replace("Name: ","")
  res = res.replace(" | "," ")
  res = res.replace("•","")
  res = res.replace("_","")  
  return res

print(clean_summ("Resume: Laura Anderson | Product Manager\nExperience:\nProduct lifecycle management, market research, and roadmap development (5 years)\nAgile methodologies and collaboration with cross-functional teams\nEducation:\nBachelor's degree in Business Administration, NOP University"))

Laura Anderson Product Manager
Experience:
Product lifecycle management, market research, and roadmap development (5 years)
Agile methodologies and collaboration with cross-functional teams
Education:
Bachelor's degree in Business Administration, NOP University


In [5]:
df = pd.DataFrame()
df['resume'] = dataset['resume']
df['ex_summary'] = dataset['ex_summary']
df.head()

Unnamed: 0,resume,ex_summary
0,Resume: Laura Anderson | Product Manager\nExpe...,Results-driven Product Manager with 5 years of...
1,Resume: Steven Thompson | Operations Manager\n...,Efficient Operations Manager with 8 years of e...
2,Linda Harris | Event Planner\nExperience:\nEve...,Detail-oriented Event Planner with 6 years of ...
3,Michael Clark | Customer Service Representativ...,Customer-focused Customer Service Representati...
4,Carol Martinez | Content Writer\nExperience:\n...,Creative Content Writer with 5 years of experi...


In [6]:
df['resume_clean'] = df['resume'].apply(clean_summ)

In [7]:
df.head()

Unnamed: 0,resume,ex_summary,resume_clean
0,Resume: Laura Anderson | Product Manager\nExpe...,Results-driven Product Manager with 5 years of...,Laura Anderson Product Manager\nExperience:\nP...
1,Resume: Steven Thompson | Operations Manager\n...,Efficient Operations Manager with 8 years of e...,Steven Thompson Operations Manager\nExperience...
2,Linda Harris | Event Planner\nExperience:\nEve...,Detail-oriented Event Planner with 6 years of ...,Linda Harris Event Planner\nExperience:\nEvent...
3,Michael Clark | Customer Service Representativ...,Customer-focused Customer Service Representati...,Michael Clark Customer Service Representative\...
4,Carol Martinez | Content Writer\nExperience:\n...,Creative Content Writer with 5 years of experi...,Carol Martinez Content Writer\nExperience:\nCo...


# FineTune BART Model

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

model = TFAutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


## Tokenizer

In [None]:
inputs = tokenizer(df['resume_clean'].values.tolist(), max_length=512, padding="max_length", return_tensors="tf").input_ids
labels = tokenizer(df['ex_summary'].values.tolist(), max_length=128, padding="max_length", return_tensors="tf").input_ids

In [None]:
len(labels)

100

In [None]:
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  406290432 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50264     
 r)                                                              
                                                                 
Total params: 406,340,696
Trainable params: 406,290,432
Non-trainable params: 50,264
_________________________________________________________________


In [None]:
from tensorflow.keras.optimizers import Adam

num_epochs = 1
batch_size = 4

model.compile(optimizer=Adam(5e-5))
model.fit(x=inputs, y=labels,epochs=num_epochs, batch_size=batch_size)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.




<keras.callbacks.History at 0x7fbe11afbe50>

In [57]:
test = "Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, browsing with  Microsoft Edge and making purchases at the Xbox Store, the Windows Store and the Microsoft  Store. Plus, user can pick up bonus points for taking daily quizzes and tours on the Microsoft  rewards website. Rewards live dashboards gives a live picture of usage world-wide and by  markets like US, Canada, Australia, new user registration count, top/bottom performing rewards  offers, orders stats and weekly trends of user activities, orders and new user registrations. the  PBI tiles gets refreshed in different frequencies starting from 5 seconds to 30 minutes.  Technology/Tools used    EDUCATION  Indian Institute of Technology – Mumbai  2001    SKILLS  Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; peers  • Supervised junior developers throughout project lifecycle and provided technical assistance"

In [58]:
test_1 = tokenizer(test, max_length=512, padding="max_length", return_tensors="tf").input_ids

In [59]:
outputs = model.generate(test_1)



In [60]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Analytical AI / Machine Learning Software Engineer with 20+ years of experience in data handling, design, and development. Skilled in database designing, scalability, back-up and recovery, writing and optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes, with experience in Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, and Power BI. Holds a Bachelor's degree in Machine Learning from IJST University.


In [24]:
def extract_text_from_pdf(file_path):
    text = textract.process(file_path, method='pdfminer')
    text = text.decode('utf-8')
    return text
text=extract_text_from_pdf('/Muhammad Alfian Pratama new resume.pdf')   # Enter the path to the resume here

MissingFileError: The file "/Muhammad Alfian Pratama new resume.pdf" can not be found.
Is this the right path/to/file/you/want/to/extract.pdf?

In [None]:
test_2 = tokenizer(text, max_length=512, padding="max_length", return_tensors="tf").input_ids

In [None]:
outputs = model.generate(test_2,min_length=50, max_length=150,early_stopping=True)

In [None]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Analytical Data Scientist with expertise in Data Science and Machine Learning. Proficient in Python and R programming languages, and proficient in TensorFlow and Flask. Holds a Bachelor of Data Science degree from FTMM Universitas Airlangga.


In [None]:
model.save_pretrained('/content/drive/MyDrive/cv model/cv_summarization_model2')

# Load model

In [2]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

# Replace with your custom model of choice
model = TFAutoModelForSeq2SeqLM.from_pretrained(r'D:\Kuliah\Tugas, PPT, Buku Kuliah\Semester 6\Bangkit\machinelearning\API\model\cv_summarization_model1')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at D:\Kuliah\Tugas, PPT, Buku Kuliah\Semester 6\Bangkit\machinelearning\API\model\cv_summarization_model1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [56]:
test = 'Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-download&ikw=download-top&co=IN https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-download&ikw=download-top&co=IN   SKILLS  APEX. (Less than 1 year), Data Structures (3 years), FLEXCUBE (5 years), Oracle (5 years), Algorithms (3 years)  LINKS  https://www.linkedin.com/in/govardhana-k-61024944/  ADDITIONAL INFORMATION  Technical Proficiency:  Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle PL-SQL programming, Sales Force with APEX. Tools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer, PL/SQL Developer, WinSCP, Putty Web Technologies: JavaScript, XML, HTML, Webservice  Operating Systems: Linux, Windows Version control system SVN & Git-Hub Databases: Oracle Middleware: Web logic, OC4J Product FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x  https://www.linkedin.com/in/govardhana-k-61024944/'

In [57]:
test_2 = tokenizer(test, max_length=512, padding="max_length", return_tensors="tf").input_ids

In [58]:
outputs = model.generate(test_2,min_length=50, max_length=150,early_stopping=True)

In [60]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Experienced Senior Software Engineer with 5 years of experience in Core Java, Go Lang Oracle PL-SQL programming, Sales Force with APEX. Proficient in tools such as RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer, PL/SQL and WinSCP. Holds a B.E in Computer Science Engineering from Adithya Institute of Technology.


In [36]:
# importing all the required modules
import PyPDF2

# creating a pdf reader object
reader = PyPDF2.PdfReader("Muhammad Alfian Pratama new resume.pdf")
text = reader.pages[0].extract_text()

text = clean_summ(text)

In [52]:
text = "Muhammad Alfian Pratama  \n LinkedIn    +62-855-2078 -1007     alfianp613.github.io     alfianp613@gmail.com      GitHub  \nI’m a 6th semester student curious and interested in Data Science and Machine Learning . I possess advanced proficiency in Python and \nR programming languages, and I have honed my skills in prominent frameworks such as TensorFlow and Flask. I am currently seek ing an \nopportunity to expand and apply my skills through a one semester industry plac ement, with a particular focus on data -related roles. I \nam eager to delve deeper into the practical aspects of the field and gain invaluable real -world experience.  \n \nSkills      \n \n   Python R HTML CSS Javascript Tableau Microsoft Excel Flask Tensorflow SPSS Minitab MySQL NoSQL Fire base  \n   Machine Learning Data Science Data Analytics Statistics Microservices Backend English  \nEducation     \nMachine Learning Learning Path   Bangkit  Academy 2023 By Google, \nGoTo, & Traveloka  Indonesia  02/2023 - Current  \n “Magang dan Studi Independen Bersertifikat” Batch 4 held by Kemendikbud RI  \n Student with Ahead of Schedule Status  \n Accomplish 6 Specialization Courses from Coursera such as Google Data Analytics, Google IT Automation, Mathematics for Machin e Learning \nSpecialization, Machine Learning Specialization, DeepLearning.AI TensorFlow Developer Specialization, and TensorFlow: Da ta and Deployment \nSpecialization  \nBachelor of Data Science   Universitas Airlangga  Surabaya , Indonesia  09/2020 - Current  \n Major in Data Science Technology  (GPA 3.88/4) . \n 3rd most outstanding FTMM Universitas Airlangga Student  \n Related Courses: Programming Algorithm, Calculus, Linear Algebra, Parametric Statistics, Non Parametric Statistics, Probabilit y, Computational \nStatistics,  Mathematical Statistics, Multivariate Statistics, Stochastic Process, Survival Analysis, Data Min ing, Natural Language Processing, \nDatabases, Spatial Data   Analysis.  \nWork Experience    \nLaboratory Assistant   FTMM Universitas Airlangga  Surabaya , Indonesia  03/2022 - 07/2022  \n Laboratory Assistant for Programming Algorithm course s. \n Assisted Lecturer for scoring practice modules and mentoring for 25 students  \n \nProjects     \nBicara Pilpres ( Sentiment Analysis Dashboard for Indonesian Presidential \nElection 2024 Candidates ) Surabaya, Indonesia  12/2022 - 01/2023  \n Led the end-to-end development process such as front -end design, software architecture, machine learning model, pipeline.  \n Successfully implemented a microservices architecture, utilizing Python Flask for the main website backend and machine learni ng API, to enhanc e \nthe scalability and efficiency of the web application.  Also, implemented Firebase to store data using Firebase Firestore Database (NoSQL) and \nFirebase Storage.  \n Implemented deployment of the web application on Digital Ocean Droplets, configuring SSL certi fication and domain integration using Nginx.  \nAchieved secure and reliable web hosting, ensuring optimal performance and user experience.  \n Implemented Progressive Web Application (PWA) for better user experience.  \n This project got Top 50 Hackfest 2023 held by  GDSC Indonesia.  \nSIBI (Sistem Bahasa Isyarat  Indonesai) Sign Language Alphabetic \nClassification using Mediapipe  Surabaya, Indonesia  11/2022 - 12/2022  \n Implemented Tensorflow Data Generator for augmenting data to reproduce primary data.  \n Utilized the power of Mediapipe to effectively extract hand landmark points, enabling the training of a cutting -edge machine learning model and \nthe development of real -time detection capabilities.  \n Achieving 94% accuracy on training data and 90% accuracy on  testing data.  "

In [55]:
test_2 = tokenizer(text, max_length=512, padding="max_length", return_tensors="np").input_ids

In [56]:
outputs = model.generate(test_2,min_length=50, max_length=150,early_stopping=True)

In [23]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Talented Data Science and Machine Learning Developer with advanced proficiency in Python and                 R programming languages. Proficient in prominent frameworks such as TensorFlow and Flask, with experience in using Google Data Analytics, Google IT Automation, and domain integration using Nginx and GitHub. Holds a Bachelor's degree in Data Science from FTMM Universitas Airlangga.


In [15]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
token = "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"
# Replace with your custom model of choice
model = TFAutoModelForSeq2SeqLM.from_pretrained('walkerrose/cv_summarization',use_auth_token=token)
tokenizer = AutoTokenizer.from_pretrained('walkerrose/cv_summarization',use_auth_token=token)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at walkerrose/cv_summarization.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [19]:
model

<transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration at 0x26682b7be50>