In [None]:
import os
import sys

# Put the parent directory on the import path before any later cell imports
# from it (presumably where the project-local `llm` package lives — confirm).
sys.path.append('..')

In [1]:
# Standard library
import json
import re
import time

# Third-party
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

# Project-local
from llm.llm.gemini import Gemini
from llm.llm_utils import get_code_from_text_response, get_json_from_text_response

# Module-level LLM client; JobScraperr.get_job_details_from_gemini calls it
# as `llm(message)`.
llm = Gemini()


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class JobScraperr:
    """Crawl paginated job listings and extract structured fields with an LLM.

    For every job card found on a listing page the scraper opens the job's
    detail page in a new browser tab, sends the page HTML to the module-level
    ``llm`` client, parses the JSON reply and appends one row to ``self.df``.
    ``crawl_jobs`` writes the accumulated rows to ``output_csv`` when it ends.

    Attributes:
        base_url: Listing URL prefix; the page number and a trailing ``/`` are
            appended to it for every page.
        output_csv: Path of the CSV file the results are written to.
        driver: Headless Chrome WebDriver used for all page loads.
        df: DataFrame holding one row per successfully processed job.
    """

    # Output schema. The field names after the first three must match the JSON
    # keys requested in the prompt built by get_job_details_from_gemini,
    # otherwise the declared column stays empty and pd.concat adds a second,
    # differently named column for the same data.
    COLUMNS = [
        "Job ID", "Title", "Link", "Job_Type", "Job_Location",
        "Job_Category", "Salary", "Experience_Level", "Years_of_Experience",
        "Main_Job_Description", "Candidate_Experience_Requirements",
        "Candidate_soft_skill_Requirements",
        "Candidate_technical_skill_Requirements",
        "Candidate_degree_Requirements", "Benefits",
        "Additional_Notes", "Industry", "Address", "Academic_Degree",
        "Sex_Requirement", "Degree_Level",
        "Age_Requirement", "Job_Date-open", "Job_Date-end",
    ]

    # Columns filled from the listing page rather than by the LLM.
    _LISTING_COLUMNS = ("Job ID", "Title", "Link")

    def __init__(self, base_url, output_csv):
        """Start the browser and prepare an empty result table.

        Args:
            base_url: Listing URL prefix (page number is appended).
            output_csv: Destination CSV path. A header-only file is created
                if the path does not exist yet.
        """
        self.base_url = base_url
        self.output_csv = output_csv
        self.driver = self._initialize_driver()

        self.df = pd.DataFrame(columns=self.COLUMNS)
        if not os.path.exists(output_csv):
            self.df.to_csv(output_csv, index=False)

    def _initialize_driver(self):
        """Create a headless Chrome WebDriver via webdriver_manager."""
        chrome_options = Options()
        for flag in (
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--disable-extensions",
            "--headless",
        ):
            chrome_options.add_argument(flag)

        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

    def clean_text(self, text):
        """Clean and standardize extracted text.

        Removes HTML tags, turns newlines and '/' into spaces, drops '*' and
        '•' characters and collapses runs of whitespace.

        Args:
            text: String to clean.

        Returns:
            The cleaned, stripped string.
        """
        text = re.sub(r'<[^>]+>', '', text)
        text = text.replace('\n', ' ').replace('<br>', ' ').replace('/', ' ').replace("*",'').replace('•','')
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def get_job_details_from_gemini(self, html_content):
        """Ask the LLM to extract the job fields from a detail page.

        Args:
            html_content: HTML of the job detail page.

        Returns:
            dict of extracted fields as produced by
            ``get_json_from_text_response`` (the 'Not available' defaults if
            the reply cannot be parsed).
        """
        prompt = (
            f"The following HTML contains a job posting:\n\n{html_content}\n\n"
            f"Extract the full text content of each specified section in JSON format. Clean the data by removing any unnecessary characters such as '/', '\\n', '<br>', and other HTML tags, bullet points (e.g., '-', '*', '•'), and other escape sequences. "
            f"Ensure that the content in each field uses ':' to separate key-value relationships (e.g., 'Key: Value') and ';' to separate items in a list (e.g., 'Item1; Item2; Item3'). "
            f"Return the content as plain text without formatting or unnecessary characters. If a section is missing, return 'Not available' as the value.\n\n"
            f"Combine the 'Years of Experience' and 'Experience Level' fields into 'Candidate Experience Requirements' if they exist. "
            f"If 'Candidate Experience Requirements' already contains content, append 'Years of Experience' and 'Experience Level' to it. If no data exists for these fields, leave the original field as is.\n\n"
            f"Expected JSON output:\n"
            f"{{\n"
            f"    \"Job_Type\": \"\",\n"
            f"    \"Job_Location\": \"\",\n"
            f"    \"Job_Category\": \"\",\n"
            f"    \"Main_Job_Description\": \"\",\n"
            f"    \"Experience_Level\": \"\",\n"
            f"    \"Years_of_Experience\": \"\",\n"
            f"    \"Candidate_Experience_Requirements\": \"Candidate_Experience_Requirement; Years_of_Experience: [Value] and Experience_Level: [Value]\" if available, else keep original content,\n"
            f"    \"Candidate_soft_skill_Requirements\": \"\",\n"
            f"    \"Candidate_technical_skill_Requirements\": \"\",\n"
            f"    \"Candidate_degree_Requirements\": \"\",\n"
            f"    \"Benefits\": \"\",\n"
            f"    \"Additional_Notes\": \"\",\n"
            f"    \"Industry\": \"\",\n"
            f"    \"Salary\": \"\",\n"
            f"    \"Address\": \"\",\n"
            f"    \"Academic_Degree\": \"\",\n"
            f"    \"Sex_Requirement\": \"\",\n"
            f"    \"Degree_Level\": \"\",\n"
            f"    \"Age_Requirement\": \"\",\n"
            f"    \"Job_Date-open\": \"\",\n"
            f"    \"Job_Date-end\": \"\"\n"
            f"}}"
        )

        message = [
            {
                'role': 'system',
                'content': 'You must extract the content in JSON format and translate into english'
            },
            {
                'role': 'user',
                'content': prompt
            }
        ]

        response = llm(message)
        return self.get_json_from_text_response(response)

    def get_json_from_text_response(self, response):
        """Extract a JSON object from an LLM text reply.

        Accepts bare JSON, JSON inside a Markdown code fence (with or without
        a ``json`` tag) and JSON surrounded by extra prose. String values are
        passed through ``clean_text``.

        Args:
            response: Raw reply text from the LLM.

        Returns:
            dict of the parsed fields, or the 'Not available' defaults when
            the reply is not a string, contains no parsable JSON, or the
            parsed JSON is not an object.
        """
        if not isinstance(response, str):
            print("Failed to parse JSON from response.")
            return self._default_job_detail()

        text = response.strip()

        # Candidate payloads, most specific first. str.lstrip/rstrip cannot be
        # used to drop the fence: they strip *character sets*, not a prefix,
        # and do nothing when prose surrounds the fence.
        candidates = []
        fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
        if fenced:
            candidates.append(fenced.group(1))
        candidates.append(text)
        first_brace, last_brace = text.find("{"), text.rfind("}")
        if 0 <= first_brace < last_brace:
            candidates.append(text[first_brace:last_brace + 1])

        parsed = False
        json_data = None
        for candidate in candidates:
            try:
                json_data = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            parsed = True
            break

        if not parsed:
            print("Failed to parse JSON from response.")
            return self._default_job_detail()

        if not isinstance(json_data, dict):
            return self._default_job_detail()

        return {
            key: self.clean_text(value) if isinstance(value, str) else value
            for key, value in json_data.items()
        }

    def _default_job_detail(self):
        """Return 'Not available' for every LLM-extracted column of ``self.df``."""
        return {
            col: 'Not available'
            for col in self.df.columns
            if col not in self._LISTING_COLUMNS
        }

    def extract_job_detail(self, job_link, job_data):
        """Open a job detail page in a new tab, extract it and append a row.

        The detail tab is always closed and focus returned to the listing tab,
        whether extraction succeeds or fails.

        Args:
            job_link: URL of the job detail page.
            job_data: dict with the listing-page fields ("Job ID", "Title",
                "Link"); these override same-named keys from the LLM reply.
        """
        listing_handle = self.driver.current_window_handle
        handles_before = set(self.driver.window_handles)

        self.driver.execute_script("window.open(arguments[0], '_blank');", job_link)
        new_handles = [h for h in self.driver.window_handles if h not in handles_before]
        if not new_handles:
            # Nothing was opened, so there is no tab to close; closing here
            # would take down the listing tab instead.
            print(f"Failed to extract job detail for link {job_link}: new tab did not open")
            return

        self.driver.switch_to.window(new_handles[0])
        try:
            time.sleep(2)

            container_wrap = self.driver.find_element(By.CSS_SELECTOR, "div.container-wrap")
            header = self.driver.find_element(By.CSS_SELECTOR, "header.noo-page-heading")
            combined_html = header.get_attribute("outerHTML") + container_wrap.get_attribute("outerHTML")

            job_detail = self.get_job_details_from_gemini(combined_html)
            job_detail.update(job_data)
            self.append_to_df(job_detail)
        except Exception as e:
            print(f"Failed to extract job detail for link {job_link}: {e}")
        finally:
            self.driver.close()
            self.driver.switch_to.window(listing_handle)

    def append_to_df(self, job_detail):
        """Append one job record (a dict of column -> value) to ``self.df``."""
        job_df = pd.DataFrame([job_detail])
        self.df = pd.concat([self.df, job_df], ignore_index=True)
        print("Appended job detail to DataFrame.")

    def extract_jobs_from_page(self, job_elements):
        """Process every job card of a listing page.

        A failure on one card is reported and does not stop the others.

        Args:
            job_elements: WebElements of the job cards on the current page.
        """
        for job_element in job_elements:
            try:
                job_id = job_element.find_element(By.CSS_SELECTOR, "a.btn-quick-view-popup").get_attribute("data-id")
                title = job_element.find_element(By.CSS_SELECTOR, "h3 a").text.strip()
                job_link = job_element.find_element(By.CSS_SELECTOR, "a.job-details-link").get_attribute("href")

                job_data = {
                    "Job ID": job_id,
                    "Title": title,
                    "Link": job_link,
                }

                self.extract_job_detail(job_link, job_data)
                print(f"Processed job ID: {job_id}")
            except Exception as e:
                print(f"Error extracting job: {e}")

    def crawl_jobs(self, start_page, end_page):
        """Crawl listing pages ``start_page``..``end_page`` (inclusive).

        The browser is shut down and the rows collected so far are written to
        ``output_csv`` even if the crawl is interrupted or fails part-way.

        Args:
            start_page: First page number to crawl.
            end_page: Last page number to crawl (inclusive).
        """
        try:
            for page in range(start_page, end_page + 1):
                url = f"{self.base_url}{page}/"
                self.driver.get(url)
                time.sleep(1)

                try:
                    WebDriverWait(self.driver, 5).until(
                        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".noo-job-item.noo_job"))
                    )
                    job_elements = self.driver.find_elements(By.CSS_SELECTOR, ".noo-job-item.noo_job")
                    print(f"Found {len(job_elements)} jobs on page {page}")

                    self.extract_jobs_from_page(job_elements)
                except Exception as e:
                    print(f"Error loading page {page}: {e}")

                time.sleep(1)
        finally:
            # Save even when quitting the browser fails, so a long crawl is
            # never lost to a shutdown error or an interrupt.
            try:
                self.driver.quit()
            finally:
                self.save_to_csv()
                print(f"Data saved to {self.output_csv}")

    def save_to_csv(self):
        """Write ``self.df`` to ``self.output_csv``, replacing the file."""
        self.df.to_csv(self.output_csv, index=False)
        print("Data saved to CSV.")


In [4]:
# Crawl listing pages 1 through 20 (inclusive); crawl_jobs writes the rows to
# test_topjob.csv when it finishes.
scraper = JobScraperr(
    base_url="https://topjobvn.com/vi/jobs/page/",
    output_csv="test_topjob.csv",
)
scraper.crawl_jobs(start_page=1, end_page=20)

2024-11-26 17:38:50,410 - INFO - Get LATEST chromedriver version for google-chrome
2024-11-26 17:38:50,656 - INFO - Get LATEST chromedriver version for google-chrome
2024-11-26 17:38:50,830 - INFO - Driver [/Users/duongphuonggiang/.wdm/drivers/chromedriver/mac64/131.0.6778.85/chromedriver-mac-arm64/chromedriver] found in cache


Found 10 jobs on page 1


2024-11-26 17:39:10,161 - INFO - Completion time of gemini-1.5-flash-002: 8.55342698097229s


prompt_token_count: 81003
candidates_token_count: 381
total_token_count: 81384

Appended job detail to DataFrame.
Processed job ID: 61009


2024-11-26 17:39:22,180 - INFO - Completion time of gemini-1.5-flash-002: 9.840788841247559s


prompt_token_count: 78526
candidates_token_count: 460
total_token_count: 78986

Appended job detail to DataFrame.
Processed job ID: 60987


2024-11-26 17:39:41,211 - INFO - Completion time of gemini-1.5-flash-002: 16.81583595275879s


prompt_token_count: 78699
candidates_token_count: 536
total_token_count: 79235

Appended job detail to DataFrame.
Processed job ID: 60935


2024-11-26 17:39:57,737 - INFO - Completion time of gemini-1.5-flash-002: 14.305126905441284s


prompt_token_count: 78933
candidates_token_count: 607
total_token_count: 79540

Appended job detail to DataFrame.
Processed job ID: 60900


2024-11-26 17:40:14,329 - INFO - Completion time of gemini-1.5-flash-002: 13.330347061157227s


prompt_token_count: 79018
candidates_token_count: 794
total_token_count: 79812

Appended job detail to DataFrame.
Processed job ID: 60890


2024-11-26 17:40:27,581 - INFO - Completion time of gemini-1.5-flash-002: 11.093473196029663s


prompt_token_count: 79076
candidates_token_count: 797
total_token_count: 79873

Appended job detail to DataFrame.
Processed job ID: 60849


2024-11-26 17:40:37,956 - INFO - Completion time of gemini-1.5-flash-002: 8.142744064331055s


prompt_token_count: 78916
candidates_token_count: 475
total_token_count: 79391

Appended job detail to DataFrame.
Processed job ID: 60819


2024-11-26 17:40:48,577 - INFO - Completion time of gemini-1.5-flash-002: 8.432646989822388s


prompt_token_count: 79149
candidates_token_count: 516
total_token_count: 79665

Appended job detail to DataFrame.
Processed job ID: 60816


2024-11-26 17:41:01,360 - INFO - Completion time of gemini-1.5-flash-002: 10.234374046325684s


prompt_token_count: 79161
candidates_token_count: 688
total_token_count: 79849

Appended job detail to DataFrame.
Processed job ID: 60815


2024-11-26 17:41:10,839 - INFO - Completion time of gemini-1.5-flash-002: 7.151581048965454s


prompt_token_count: 78685
candidates_token_count: 382
total_token_count: 79067

Appended job detail to DataFrame.
Processed job ID: 60799
Found 10 jobs on page 2


2024-11-26 17:41:32,052 - INFO - Completion time of gemini-1.5-flash-002: 15.012386083602905s


prompt_token_count: 79011
candidates_token_count: 583
total_token_count: 79594

Appended job detail to DataFrame.
Processed job ID: 60757


2024-11-26 17:41:44,335 - INFO - Completion time of gemini-1.5-flash-002: 9.98383116722107s


prompt_token_count: 79377
candidates_token_count: 681
total_token_count: 80058

Appended job detail to DataFrame.
Processed job ID: 60756


2024-11-26 17:41:55,126 - INFO - Completion time of gemini-1.5-flash-002: 8.537138938903809s


prompt_token_count: 81207
candidates_token_count: 523
total_token_count: 81730

Appended job detail to DataFrame.
Processed job ID: 60731


2024-11-26 17:42:08,609 - INFO - Completion time of gemini-1.5-flash-002: 11.308289766311646s


prompt_token_count: 79634
candidates_token_count: 814
total_token_count: 80448

Appended job detail to DataFrame.
Processed job ID: 60718


2024-11-26 17:42:17,719 - INFO - Completion time of gemini-1.5-flash-002: 6.953402042388916s


prompt_token_count: 85212
candidates_token_count: 385
total_token_count: 85597

Appended job detail to DataFrame.
Processed job ID: 60680


2024-11-26 17:42:27,160 - INFO - Completion time of gemini-1.5-flash-002: 7.261707067489624s


prompt_token_count: 78697
candidates_token_count: 401
total_token_count: 79098

Appended job detail to DataFrame.
Processed job ID: 60657


2024-11-26 17:42:37,256 - INFO - Completion time of gemini-1.5-flash-002: 7.941969156265259s


prompt_token_count: 79244
candidates_token_count: 487
total_token_count: 79731

Appended job detail to DataFrame.
Processed job ID: 60652


2024-11-26 17:42:49,508 - INFO - Completion time of gemini-1.5-flash-002: 10.072591781616211s


prompt_token_count: 79506
candidates_token_count: 723
total_token_count: 80229

Appended job detail to DataFrame.
Processed job ID: 60646


2024-11-26 17:43:02,156 - INFO - Completion time of gemini-1.5-flash-002: 10.460994958877563s


prompt_token_count: 79427
candidates_token_count: 727
total_token_count: 80154

Appended job detail to DataFrame.
Processed job ID: 60634


2024-11-26 17:43:12,703 - INFO - Completion time of gemini-1.5-flash-002: 8.386210918426514s


prompt_token_count: 79294
candidates_token_count: 506
total_token_count: 79800

Appended job detail to DataFrame.
Processed job ID: 60612
Found 10 jobs on page 3


2024-11-26 17:43:27,911 - INFO - Completion time of gemini-1.5-flash-002: 8.161062717437744s


prompt_token_count: 78825
candidates_token_count: 518
total_token_count: 79343

Appended job detail to DataFrame.
Processed job ID: 60603


2024-11-26 17:43:40,810 - INFO - Completion time of gemini-1.5-flash-002: 9.717228174209595s


prompt_token_count: 78647
candidates_token_count: 422
total_token_count: 79069

Appended job detail to DataFrame.
Processed job ID: 60594


2024-11-26 17:43:53,504 - INFO - Completion time of gemini-1.5-flash-002: 10.514534950256348s


prompt_token_count: 78850
candidates_token_count: 741
total_token_count: 79591

Appended job detail to DataFrame.
Processed job ID: 60581


2024-11-26 17:44:04,311 - INFO - Completion time of gemini-1.5-flash-002: 8.65385913848877s


prompt_token_count: 83299
candidates_token_count: 547
total_token_count: 83846

Appended job detail to DataFrame.
Processed job ID: 60572


2024-11-26 17:44:13,507 - INFO - Completion time of gemini-1.5-flash-002: 7.012898683547974s


prompt_token_count: 85157
candidates_token_count: 380
total_token_count: 85537

Appended job detail to DataFrame.
Processed job ID: 60551


2024-11-26 17:44:22,743 - INFO - Completion time of gemini-1.5-flash-002: 7.070859909057617s


prompt_token_count: 79085
candidates_token_count: 380
total_token_count: 79465

Appended job detail to DataFrame.
Processed job ID: 60546


2024-11-26 17:44:40,128 - INFO - Completion time of gemini-1.5-flash-002: 15.223283052444458s


prompt_token_count: 78743
candidates_token_count: 606
total_token_count: 79349

Appended job detail to DataFrame.
Processed job ID: 60533


2024-11-26 17:44:50,460 - INFO - Completion time of gemini-1.5-flash-002: 8.134918928146362s


prompt_token_count: 79592
candidates_token_count: 438
total_token_count: 80030

Appended job detail to DataFrame.
Processed job ID: 60524


2024-11-26 17:45:00,271 - INFO - Completion time of gemini-1.5-flash-002: 7.329866886138916s


prompt_token_count: 79062
candidates_token_count: 438
total_token_count: 79500

Appended job detail to DataFrame.
Processed job ID: 60512


2024-11-26 17:45:10,051 - INFO - Completion time of gemini-1.5-flash-002: 7.498897314071655s


prompt_token_count: 78741
candidates_token_count: 418
total_token_count: 79159

Appended job detail to DataFrame.
Processed job ID: 60485
Found 10 jobs on page 4


2024-11-26 17:45:25,647 - INFO - Completion time of gemini-1.5-flash-002: 9.232197046279907s


prompt_token_count: 79032
candidates_token_count: 626
total_token_count: 79658

Appended job detail to DataFrame.
Processed job ID: 60484


2024-11-26 17:45:46,787 - INFO - Completion time of gemini-1.5-flash-002: 18.059166431427002s


prompt_token_count: 78929
candidates_token_count: 592
total_token_count: 79521

Appended job detail to DataFrame.
Processed job ID: 60460


2024-11-26 17:45:56,852 - INFO - Completion time of gemini-1.5-flash-002: 7.877413988113403s


prompt_token_count: 78585
candidates_token_count: 421
total_token_count: 79006

Appended job detail to DataFrame.
Processed job ID: 60447


2024-11-26 17:46:08,873 - INFO - Completion time of gemini-1.5-flash-002: 9.853055953979492s


prompt_token_count: 78564
candidates_token_count: 526
total_token_count: 79090

Appended job detail to DataFrame.
Processed job ID: 60421


2024-11-26 17:46:19,509 - INFO - Completion time of gemini-1.5-flash-002: 8.438004970550537s


prompt_token_count: 79343
candidates_token_count: 543
total_token_count: 79886

Appended job detail to DataFrame.
Processed job ID: 60416


2024-11-26 17:46:29,513 - INFO - Completion time of gemini-1.5-flash-002: 7.8457581996917725s


prompt_token_count: 79211
candidates_token_count: 469
total_token_count: 79680

Appended job detail to DataFrame.
Processed job ID: 60409


2024-11-26 17:46:42,107 - INFO - Completion time of gemini-1.5-flash-002: 10.44382095336914s


prompt_token_count: 78910
candidates_token_count: 663
total_token_count: 79573

Appended job detail to DataFrame.
Processed job ID: 60396


2024-11-26 17:46:53,370 - INFO - Completion time of gemini-1.5-flash-002: 9.096946001052856s


prompt_token_count: 78412
candidates_token_count: 341
total_token_count: 78753

Appended job detail to DataFrame.
Processed job ID: 60366


2024-11-26 17:47:07,602 - INFO - Completion time of gemini-1.5-flash-002: 12.066806077957153s


prompt_token_count: 78943
candidates_token_count: 455
total_token_count: 79398

Appended job detail to DataFrame.
Processed job ID: 60358


2024-11-26 17:47:18,645 - INFO - Completion time of gemini-1.5-flash-002: 8.845978736877441s


prompt_token_count: 80409
candidates_token_count: 512
total_token_count: 80921

Appended job detail to DataFrame.
Processed job ID: 60338
Found 10 jobs on page 5


2024-11-26 17:47:35,028 - INFO - Completion time of gemini-1.5-flash-002: 9.992362022399902s


prompt_token_count: 79077
candidates_token_count: 690
total_token_count: 79767

Appended job detail to DataFrame.
Processed job ID: 60331


2024-11-26 17:47:46,105 - INFO - Completion time of gemini-1.5-flash-002: 8.481086015701294s


prompt_token_count: 78535
candidates_token_count: 552
total_token_count: 79087

Appended job detail to DataFrame.
Processed job ID: 60317


2024-11-26 17:47:57,027 - INFO - Completion time of gemini-1.5-flash-002: 8.760410785675049s


prompt_token_count: 78877
candidates_token_count: 593
total_token_count: 79470

Appended job detail to DataFrame.
Processed job ID: 60300


2024-11-26 17:48:06,361 - INFO - Completion time of gemini-1.5-flash-002: 7.181812047958374s


prompt_token_count: 78558
candidates_token_count: 414
total_token_count: 78972

Appended job detail to DataFrame.
Processed job ID: 60293


2024-11-26 17:48:20,306 - INFO - Completion time of gemini-1.5-flash-002: 11.789440870285034s


prompt_token_count: 78781
candidates_token_count: 490
total_token_count: 79271

Appended job detail to DataFrame.
Processed job ID: 60284


2024-11-26 17:48:30,648 - INFO - Completion time of gemini-1.5-flash-002: 7.477229118347168s


prompt_token_count: 78665
candidates_token_count: 456
total_token_count: 79121

Appended job detail to DataFrame.
Processed job ID: 60277


2024-11-26 17:48:44,472 - INFO - Completion time of gemini-1.5-flash-002: 11.670357942581177s


prompt_token_count: 78897
candidates_token_count: 798
total_token_count: 79695

Appended job detail to DataFrame.
Processed job ID: 60253


2024-11-26 17:48:54,520 - INFO - Completion time of gemini-1.5-flash-002: 7.891085147857666s


prompt_token_count: 78575
candidates_token_count: 491
total_token_count: 79066

Appended job detail to DataFrame.
Processed job ID: 60245


2024-11-26 17:49:05,920 - INFO - Completion time of gemini-1.5-flash-002: 9.214032173156738s


prompt_token_count: 78653
candidates_token_count: 517
total_token_count: 79170

Appended job detail to DataFrame.
Processed job ID: 60231


2024-11-26 17:49:20,107 - INFO - Completion time of gemini-1.5-flash-002: 11.973876953125s


prompt_token_count: 79348
candidates_token_count: 734
total_token_count: 80082

Appended job detail to DataFrame.
Processed job ID: 60214
Found 10 jobs on page 6


2024-11-26 17:49:35,712 - INFO - Completion time of gemini-1.5-flash-002: 9.429997682571411s


prompt_token_count: 78917
candidates_token_count: 536
total_token_count: 79453

Appended job detail to DataFrame.
Processed job ID: 60189


2024-11-26 17:49:46,220 - INFO - Completion time of gemini-1.5-flash-002: 8.30571699142456s


prompt_token_count: 81451
candidates_token_count: 513
total_token_count: 81964

Appended job detail to DataFrame.
Processed job ID: 60161


2024-11-26 17:49:55,844 - INFO - Completion time of gemini-1.5-flash-002: 7.455184698104858s


prompt_token_count: 80863
candidates_token_count: 440
total_token_count: 81303

Appended job detail to DataFrame.
Processed job ID: 60135


2024-11-26 17:50:08,866 - INFO - Completion time of gemini-1.5-flash-002: 10.86169719696045s


prompt_token_count: 79295
candidates_token_count: 754
total_token_count: 80049

Appended job detail to DataFrame.
Processed job ID: 60107


2024-11-26 17:50:20,071 - INFO - Completion time of gemini-1.5-flash-002: 9.013404846191406s


prompt_token_count: 79115
candidates_token_count: 622
total_token_count: 79737

Appended job detail to DataFrame.
Processed job ID: 60102


2024-11-26 17:50:29,463 - INFO - Completion time of gemini-1.5-flash-002: 7.1510169506073s


prompt_token_count: 78552
candidates_token_count: 348
total_token_count: 78900

Appended job detail to DataFrame.
Processed job ID: 60074


2024-11-26 17:50:40,365 - INFO - Completion time of gemini-1.5-flash-002: 8.729723930358887s


prompt_token_count: 80181
candidates_token_count: 514
total_token_count: 80695

Appended job detail to DataFrame.
Processed job ID: 60067


2024-11-26 17:50:54,003 - INFO - Completion time of gemini-1.5-flash-002: 11.482275009155273s


prompt_token_count: 79356
candidates_token_count: 789
total_token_count: 80145

Appended job detail to DataFrame.
Processed job ID: 60063


2024-11-26 17:51:04,521 - INFO - Completion time of gemini-1.5-flash-002: 8.33965015411377s


prompt_token_count: 78957
candidates_token_count: 552
total_token_count: 79509

Appended job detail to DataFrame.
Processed job ID: 60056


2024-11-26 17:51:19,161 - INFO - Completion time of gemini-1.5-flash-002: 12.281008243560791s


prompt_token_count: 82948
candidates_token_count: 565
total_token_count: 83513

Appended job detail to DataFrame.
Processed job ID: 60029
Found 10 jobs on page 7


2024-11-26 17:51:34,280 - INFO - Completion time of gemini-1.5-flash-002: 8.816658973693848s


prompt_token_count: 80353
candidates_token_count: 589
total_token_count: 80942

Appended job detail to DataFrame.
Processed job ID: 60020


2024-11-26 17:51:44,765 - INFO - Completion time of gemini-1.5-flash-002: 8.227988004684448s


prompt_token_count: 79385
candidates_token_count: 497
total_token_count: 79882

Appended job detail to DataFrame.
Processed job ID: 60015


2024-11-26 17:51:53,057 - INFO - Completion time of gemini-1.5-flash-002: 6.0768938064575195s


prompt_token_count: 78290
candidates_token_count: 302
total_token_count: 78592

Appended job detail to DataFrame.
Processed job ID: 60000


2024-11-26 17:52:02,784 - INFO - Completion time of gemini-1.5-flash-002: 7.572990894317627s


prompt_token_count: 78793
candidates_token_count: 450
total_token_count: 79243

Appended job detail to DataFrame.
Processed job ID: 59993


2024-11-26 17:52:12,513 - INFO - Completion time of gemini-1.5-flash-002: 7.540548801422119s


prompt_token_count: 79004
candidates_token_count: 449
total_token_count: 79453

Appended job detail to DataFrame.
Processed job ID: 59988


2024-11-26 17:52:25,039 - INFO - Completion time of gemini-1.5-flash-002: 10.344902992248535s


prompt_token_count: 78838
candidates_token_count: 714
total_token_count: 79552

Appended job detail to DataFrame.
Processed job ID: 59987


2024-11-26 17:52:36,166 - INFO - Completion time of gemini-1.5-flash-002: 8.650516986846924s


prompt_token_count: 83095
candidates_token_count: 554
total_token_count: 83649

Appended job detail to DataFrame.
Processed job ID: 59955


2024-11-26 17:52:47,636 - INFO - Completion time of gemini-1.5-flash-002: 8.84125304222107s


prompt_token_count: 78975
candidates_token_count: 477
total_token_count: 79452

Appended job detail to DataFrame.
Processed job ID: 59962


2024-11-26 17:53:03,917 - INFO - Completion time of gemini-1.5-flash-002: 14.077590227127075s


prompt_token_count: 78347
candidates_token_count: 314
total_token_count: 78661

Appended job detail to DataFrame.
Processed job ID: 59943


2024-11-26 17:53:25,766 - INFO - Completion time of gemini-1.5-flash-002: 19.182899951934814s


prompt_token_count: 80939
candidates_token_count: 660
total_token_count: 81599

Appended job detail to DataFrame.
Processed job ID: 59932
Found 10 jobs on page 8


2024-11-26 17:53:58,097 - INFO - Completion time of gemini-1.5-flash-002: 25.9671311378479s


prompt_token_count: 78860
candidates_token_count: 529
total_token_count: 79389

Appended job detail to DataFrame.
Processed job ID: 59919


2024-11-26 17:54:07,920 - INFO - Completion time of gemini-1.5-flash-002: 7.640567064285278s


prompt_token_count: 78705
candidates_token_count: 467
total_token_count: 79172

Appended job detail to DataFrame.
Processed job ID: 59908


2024-11-26 17:54:18,255 - INFO - Completion time of gemini-1.5-flash-002: 8.150609970092773s


prompt_token_count: 80403
candidates_token_count: 475
total_token_count: 80878

Appended job detail to DataFrame.
Processed job ID: 59875


2024-11-26 17:54:29,421 - INFO - Completion time of gemini-1.5-flash-002: 8.57855486869812s


prompt_token_count: 80853
candidates_token_count: 543
total_token_count: 81396

Appended job detail to DataFrame.
Processed job ID: 59841


2024-11-26 17:54:43,855 - INFO - Completion time of gemini-1.5-flash-002: 12.227967977523804s


prompt_token_count: 78481
candidates_token_count: 466
total_token_count: 78947

Appended job detail to DataFrame.
Processed job ID: 59832


2024-11-26 17:54:53,969 - INFO - Completion time of gemini-1.5-flash-002: 7.882397890090942s


prompt_token_count: 78854
candidates_token_count: 466
total_token_count: 79320

Appended job detail to DataFrame.
Processed job ID: 59790


2024-11-26 17:55:05,261 - INFO - Completion time of gemini-1.5-flash-002: 9.129334926605225s


prompt_token_count: 80863
candidates_token_count: 464
total_token_count: 81327

Appended job detail to DataFrame.
Processed job ID: 59777


2024-11-26 17:55:15,654 - INFO - Completion time of gemini-1.5-flash-002: 8.214510202407837s


prompt_token_count: 81097
candidates_token_count: 513
total_token_count: 81610

Appended job detail to DataFrame.
Processed job ID: 59763


2024-11-26 17:55:26,254 - INFO - Completion time of gemini-1.5-flash-002: 8.390368938446045s


prompt_token_count: 78937
candidates_token_count: 541
total_token_count: 79478

Appended job detail to DataFrame.
Processed job ID: 59745


2024-11-26 17:55:35,635 - INFO - Completion time of gemini-1.5-flash-002: 7.189438819885254s


prompt_token_count: 82730
candidates_token_count: 389
total_token_count: 83119

Appended job detail to DataFrame.
Processed job ID: 59714
Found 10 jobs on page 9


2024-11-26 17:55:49,838 - INFO - Completion time of gemini-1.5-flash-002: 8.034801006317139s


prompt_token_count: 78357
candidates_token_count: 493
total_token_count: 78850

Appended job detail to DataFrame.
Processed job ID: 59713


2024-11-26 17:56:00,144 - INFO - Completion time of gemini-1.5-flash-002: 8.054445028305054s


prompt_token_count: 80488
candidates_token_count: 486
total_token_count: 80974

Appended job detail to DataFrame.
Processed job ID: 59708


2024-11-26 17:56:16,226 - INFO - Completion time of gemini-1.5-flash-002: 13.919278144836426s


prompt_token_count: 80241
candidates_token_count: 816
total_token_count: 81057

Appended job detail to DataFrame.
Processed job ID: 59701


2024-11-26 17:56:31,586 - INFO - Completion time of gemini-1.5-flash-002: 13.198068857192993s


prompt_token_count: 79161
candidates_token_count: 572
total_token_count: 79733

Appended job detail to DataFrame.
Processed job ID: 59702


2024-11-26 17:56:53,951 - INFO - Completion time of gemini-1.5-flash-002: 18.795450687408447s


prompt_token_count: 78949
candidates_token_count: 459
total_token_count: 79408

Appended job detail to DataFrame.
Processed job ID: 59672


2024-11-26 17:57:04,144 - INFO - Completion time of gemini-1.5-flash-002: 8.001374959945679s


prompt_token_count: 78733
candidates_token_count: 442
total_token_count: 79175

Appended job detail to DataFrame.
Processed job ID: 59656


2024-11-26 17:57:14,487 - INFO - Completion time of gemini-1.5-flash-002: 8.169989824295044s


prompt_token_count: 78768
candidates_token_count: 508
total_token_count: 79276

Appended job detail to DataFrame.
Processed job ID: 59645


2024-11-26 17:57:24,647 - INFO - Completion time of gemini-1.5-flash-002: 7.893312215805054s


prompt_token_count: 78579
candidates_token_count: 481
total_token_count: 79060

Appended job detail to DataFrame.
Processed job ID: 59619


2024-11-26 17:57:34,699 - INFO - Completion time of gemini-1.5-flash-002: 7.646028995513916s


prompt_token_count: 78606
candidates_token_count: 451
total_token_count: 79057

Appended job detail to DataFrame.
Processed job ID: 59601


2024-11-26 17:57:45,618 - INFO - Completion time of gemini-1.5-flash-002: 8.477909088134766s


prompt_token_count: 78796
candidates_token_count: 564
total_token_count: 79360

Appended job detail to DataFrame.
Processed job ID: 59594
Found 10 jobs on page 10


2024-11-26 17:58:02,103 - INFO - Completion time of gemini-1.5-flash-002: 10.215781927108765s


prompt_token_count: 79149
candidates_token_count: 746
total_token_count: 79895

Appended job detail to DataFrame.
Processed job ID: 59548


2024-11-26 17:58:12,547 - INFO - Completion time of gemini-1.5-flash-002: 8.264713048934937s


prompt_token_count: 82605
candidates_token_count: 522
total_token_count: 83127

Appended job detail to DataFrame.
Processed job ID: 59556


2024-11-26 17:58:24,528 - INFO - Completion time of gemini-1.5-flash-002: 9.333078145980835s


prompt_token_count: 78485
candidates_token_count: 480
total_token_count: 78965

Appended job detail to DataFrame.
Processed job ID: 59481


2024-11-26 17:58:40,407 - INFO - Completion time of gemini-1.5-flash-002: 13.655464887619019s


prompt_token_count: 80867
candidates_token_count: 568
total_token_count: 81435

Appended job detail to DataFrame.
Processed job ID: 59442


2024-11-26 17:58:51,075 - INFO - Completion time of gemini-1.5-flash-002: 8.500230073928833s


prompt_token_count: 80725
candidates_token_count: 540
total_token_count: 81265

Appended job detail to DataFrame.
Processed job ID: 59424


2024-11-26 17:59:03,947 - INFO - Completion time of gemini-1.5-flash-002: 10.697741031646729s


prompt_token_count: 79062
candidates_token_count: 808
total_token_count: 79870

Appended job detail to DataFrame.
Processed job ID: 59421


2024-11-26 17:59:14,596 - INFO - Completion time of gemini-1.5-flash-002: 8.434812307357788s


prompt_token_count: 78691
candidates_token_count: 504
total_token_count: 79195

Appended job detail to DataFrame.
Processed job ID: 59392


2024-11-26 17:59:23,817 - INFO - Completion time of gemini-1.5-flash-002: 7.047369956970215s


prompt_token_count: 78186
candidates_token_count: 380
total_token_count: 78566

Appended job detail to DataFrame.
Processed job ID: 59364


2024-11-26 17:59:32,249 - INFO - Completion time of gemini-1.5-flash-002: 6.279539346694946s


prompt_token_count: 78332
candidates_token_count: 337
total_token_count: 78669

Appended job detail to DataFrame.
Processed job ID: 59356


2024-11-26 17:59:42,691 - INFO - Completion time of gemini-1.5-flash-002: 8.283899068832397s


prompt_token_count: 78551
candidates_token_count: 532
total_token_count: 79083

Appended job detail to DataFrame.
Processed job ID: 59348
Found 10 jobs on page 11


2024-11-26 17:59:56,758 - INFO - Completion time of gemini-1.5-flash-002: 7.7564537525177s


prompt_token_count: 78618
candidates_token_count: 465
total_token_count: 79083

Appended job detail to DataFrame.
Processed job ID: 59340


2024-11-26 18:00:12,546 - INFO - Completion time of gemini-1.5-flash-002: 13.371994256973267s


prompt_token_count: 78502
candidates_token_count: 413
total_token_count: 78915

Appended job detail to DataFrame.
Processed job ID: 59233


2024-11-26 18:00:26,783 - INFO - Completion time of gemini-1.5-flash-002: 12.033226013183594s


prompt_token_count: 78991
candidates_token_count: 919
total_token_count: 79910

Appended job detail to DataFrame.
Processed job ID: 59246


2024-11-26 18:00:39,315 - INFO - Completion time of gemini-1.5-flash-002: 10.34695315361023s


prompt_token_count: 78878
candidates_token_count: 732
total_token_count: 79610

Appended job detail to DataFrame.
Processed job ID: 59216


2024-11-26 18:00:49,116 - INFO - Completion time of gemini-1.5-flash-002: 7.642349004745483s


prompt_token_count: 78452
candidates_token_count: 457
total_token_count: 78909

Appended job detail to DataFrame.
Processed job ID: 59204


2024-11-26 18:01:01,195 - INFO - Completion time of gemini-1.5-flash-002: 9.916209936141968s


prompt_token_count: 83462
candidates_token_count: 668
total_token_count: 84130

Appended job detail to DataFrame.
Processed job ID: 59186


2024-11-26 18:01:11,988 - INFO - Completion time of gemini-1.5-flash-002: 8.633717060089111s


prompt_token_count: 79027
candidates_token_count: 562
total_token_count: 79589

Appended job detail to DataFrame.
Processed job ID: 59158


2024-11-26 18:01:22,702 - INFO - Completion time of gemini-1.5-flash-002: 8.556946992874146s


prompt_token_count: 80742
candidates_token_count: 543
total_token_count: 81285

Appended job detail to DataFrame.
Processed job ID: 59127


2024-11-26 18:01:34,786 - INFO - Completion time of gemini-1.5-flash-002: 9.91161298751831s


prompt_token_count: 78864
candidates_token_count: 707
total_token_count: 79571

Appended job detail to DataFrame.
Processed job ID: 59122


2024-11-26 18:01:45,470 - INFO - Completion time of gemini-1.5-flash-002: 8.520273923873901s


prompt_token_count: 78878
candidates_token_count: 545
total_token_count: 79423

Appended job detail to DataFrame.
Processed job ID: 59115
Found 10 jobs on page 12


2024-11-26 18:01:59,935 - INFO - Completion time of gemini-1.5-flash-002: 8.112466096878052s


prompt_token_count: 78623
candidates_token_count: 414
total_token_count: 79037

Appended job detail to DataFrame.
Processed job ID: 59105


2024-11-26 18:02:13,598 - INFO - Completion time of gemini-1.5-flash-002: 11.391979932785034s


prompt_token_count: 79094
candidates_token_count: 627
total_token_count: 79721

Appended job detail to DataFrame.
Processed job ID: 59102


2024-11-26 18:02:24,657 - INFO - Completion time of gemini-1.5-flash-002: 8.863867044448853s


prompt_token_count: 78903
candidates_token_count: 579
total_token_count: 79482

Appended job detail to DataFrame.
Processed job ID: 59072


2024-11-26 18:02:34,026 - INFO - Completion time of gemini-1.5-flash-002: 7.118624925613403s


prompt_token_count: 78823
candidates_token_count: 410
total_token_count: 79233

Appended job detail to DataFrame.
Processed job ID: 59065


2024-11-26 18:02:44,585 - INFO - Completion time of gemini-1.5-flash-002: 8.369753122329712s


prompt_token_count: 78859
candidates_token_count: 533
total_token_count: 79392

Appended job detail to DataFrame.
Processed job ID: 59057


2024-11-26 18:02:54,699 - INFO - Completion time of gemini-1.5-flash-002: 7.917736768722534s


prompt_token_count: 79522
candidates_token_count: 482
total_token_count: 80004

Appended job detail to DataFrame.
Processed job ID: 59050


2024-11-26 18:03:07,439 - INFO - Completion time of gemini-1.5-flash-002: 10.566723823547363s


prompt_token_count: 79413
candidates_token_count: 652
total_token_count: 80065

Appended job detail to DataFrame.
Processed job ID: 59043


2024-11-26 18:03:19,746 - INFO - Completion time of gemini-1.5-flash-002: 9.658706188201904s


prompt_token_count: 80742
candidates_token_count: 655
total_token_count: 81397

Appended job detail to DataFrame.
Processed job ID: 59036


2024-11-26 18:03:29,170 - INFO - Completion time of gemini-1.5-flash-002: 7.21887993812561s


prompt_token_count: 78458
candidates_token_count: 405
total_token_count: 78863

Appended job detail to DataFrame.
Processed job ID: 59015


2024-11-26 18:03:42,497 - INFO - Completion time of gemini-1.5-flash-002: 10.596674680709839s


prompt_token_count: 89176
candidates_token_count: 775
total_token_count: 89951

Appended job detail to DataFrame.
Processed job ID: 59007
Found 10 jobs on page 13


2024-11-26 18:04:01,836 - INFO - Completion time of gemini-1.5-flash-002: 13.093360900878906s


prompt_token_count: 80606
candidates_token_count: 599
total_token_count: 81205

Appended job detail to DataFrame.
Processed job ID: 58982


2024-11-26 18:04:11,455 - INFO - Completion time of gemini-1.5-flash-002: 7.27599310874939s


prompt_token_count: 78495
candidates_token_count: 416
total_token_count: 78911

Appended job detail to DataFrame.
Processed job ID: 58991


2024-11-26 18:04:22,198 - INFO - Completion time of gemini-1.5-flash-002: 8.572990894317627s


prompt_token_count: 78765
candidates_token_count: 530
total_token_count: 79295

Appended job detail to DataFrame.
Processed job ID: 58959


2024-11-26 18:04:33,017 - INFO - Completion time of gemini-1.5-flash-002: 8.670532703399658s


prompt_token_count: 78799
candidates_token_count: 572
total_token_count: 79371

Appended job detail to DataFrame.
Processed job ID: 58935


2024-11-26 18:04:42,733 - INFO - Completion time of gemini-1.5-flash-002: 7.55413293838501s


prompt_token_count: 80244
candidates_token_count: 471
total_token_count: 80715

Appended job detail to DataFrame.
Processed job ID: 58927


2024-11-26 18:04:53,747 - INFO - Completion time of gemini-1.5-flash-002: 8.834779262542725s


prompt_token_count: 78544
candidates_token_count: 390
total_token_count: 78934

Appended job detail to DataFrame.
Processed job ID: 58913


2024-11-26 18:05:11,503 - INFO - Completion time of gemini-1.5-flash-002: 15.595256805419922s


prompt_token_count: 79260
candidates_token_count: 673
total_token_count: 79933

Appended job detail to DataFrame.
Processed job ID: 58902


2024-11-26 18:05:24,468 - INFO - Completion time of gemini-1.5-flash-002: 10.743517875671387s


prompt_token_count: 79102
candidates_token_count: 598
total_token_count: 79700

Appended job detail to DataFrame.
Processed job ID: 58890


2024-11-26 18:05:39,416 - INFO - Completion time of gemini-1.5-flash-002: 12.75571894645691s


prompt_token_count: 79016
candidates_token_count: 675
total_token_count: 79691

Appended job detail to DataFrame.
Processed job ID: 58864


2024-11-26 18:05:50,231 - INFO - Completion time of gemini-1.5-flash-002: 8.659763097763062s


prompt_token_count: 78814
candidates_token_count: 597
total_token_count: 79411

Appended job detail to DataFrame.
Processed job ID: 58858
Found 10 jobs on page 14


2024-11-26 18:06:05,485 - INFO - Completion time of gemini-1.5-flash-002: 8.977699041366577s


prompt_token_count: 80099
candidates_token_count: 412
total_token_count: 80511

Appended job detail to DataFrame.
Processed job ID: 58853


2024-11-26 18:06:16,491 - INFO - Completion time of gemini-1.5-flash-002: 8.815452814102173s


prompt_token_count: 78749
candidates_token_count: 573
total_token_count: 79322

Appended job detail to DataFrame.
Processed job ID: 58825


2024-11-26 18:06:28,582 - INFO - Completion time of gemini-1.5-flash-002: 9.83709192276001s


prompt_token_count: 78734
candidates_token_count: 559
total_token_count: 79293

Appended job detail to DataFrame.
Processed job ID: 58824


2024-11-26 18:06:38,344 - INFO - Completion time of gemini-1.5-flash-002: 7.548496961593628s


prompt_token_count: 78739
candidates_token_count: 447
total_token_count: 79186

Appended job detail to DataFrame.
Processed job ID: 58792


2024-11-26 18:06:49,139 - INFO - Completion time of gemini-1.5-flash-002: 8.626811981201172s


prompt_token_count: 78716
candidates_token_count: 517
total_token_count: 79233

Appended job detail to DataFrame.
Processed job ID: 58710


2024-11-26 18:06:59,196 - INFO - Completion time of gemini-1.5-flash-002: 7.798932075500488s


prompt_token_count: 81124
candidates_token_count: 454
total_token_count: 81578

Appended job detail to DataFrame.
Processed job ID: 58688


2024-11-26 18:07:09,227 - INFO - Completion time of gemini-1.5-flash-002: 7.853744029998779s


prompt_token_count: 78987
candidates_token_count: 459
total_token_count: 79446

Appended job detail to DataFrame.
Processed job ID: 58632


2024-11-26 18:07:17,852 - INFO - Completion time of gemini-1.5-flash-002: 6.38678765296936s


prompt_token_count: 78476
candidates_token_count: 338
total_token_count: 78814

Appended job detail to DataFrame.
Processed job ID: 58631


2024-11-26 18:07:29,470 - INFO - Completion time of gemini-1.5-flash-002: 9.353066205978394s


prompt_token_count: 78668
candidates_token_count: 499
total_token_count: 79167

Appended job detail to DataFrame.
Processed job ID: 58669


2024-11-26 18:07:43,568 - INFO - Completion time of gemini-1.5-flash-002: 11.919336080551147s


prompt_token_count: 78725
candidates_token_count: 497
total_token_count: 79222

Appended job detail to DataFrame.
Processed job ID: 58588
Found 10 jobs on page 15


2024-11-26 18:07:57,238 - INFO - Completion time of gemini-1.5-flash-002: 7.170166015625s


prompt_token_count: 80705
candidates_token_count: 407
total_token_count: 81112

Appended job detail to DataFrame.
Processed job ID: 58582


2024-11-26 18:08:09,379 - INFO - Completion time of gemini-1.5-flash-002: 9.941645860671997s


prompt_token_count: 79034
candidates_token_count: 663
total_token_count: 79697

Appended job detail to DataFrame.
Processed job ID: 58579


2024-11-26 18:08:21,320 - INFO - Completion time of gemini-1.5-flash-002: 9.737260103225708s


prompt_token_count: 79046
candidates_token_count: 513
total_token_count: 79559

Appended job detail to DataFrame.
Processed job ID: 58533


2024-11-26 18:08:31,823 - INFO - Completion time of gemini-1.5-flash-002: 8.338557004928589s


prompt_token_count: 79054
candidates_token_count: 532
total_token_count: 79586

Appended job detail to DataFrame.
Processed job ID: 58534


2024-11-26 18:08:41,274 - INFO - Completion time of gemini-1.5-flash-002: 7.285199880599976s


prompt_token_count: 78482
candidates_token_count: 411
total_token_count: 78893

Appended job detail to DataFrame.
Processed job ID: 58503


2024-11-26 18:08:52,302 - INFO - Completion time of gemini-1.5-flash-002: 8.865340948104858s


prompt_token_count: 78710
candidates_token_count: 453
total_token_count: 79163

Appended job detail to DataFrame.
Processed job ID: 58489


2024-11-26 18:09:11,807 - INFO - Completion time of gemini-1.5-flash-002: 17.172494173049927s


prompt_token_count: 79394
candidates_token_count: 640
total_token_count: 80034

Appended job detail to DataFrame.
Processed job ID: 58484


2024-11-26 18:09:20,839 - INFO - Completion time of gemini-1.5-flash-002: 6.834604024887085s


prompt_token_count: 78487
candidates_token_count: 390
total_token_count: 78877

Appended job detail to DataFrame.
Processed job ID: 58485


2024-11-26 18:09:30,292 - INFO - Completion time of gemini-1.5-flash-002: 7.140509843826294s


prompt_token_count: 80065
candidates_token_count: 359
total_token_count: 80424

Appended job detail to DataFrame.
Processed job ID: 58458


2024-11-26 18:09:39,554 - INFO - Completion time of gemini-1.5-flash-002: 6.93916392326355s


prompt_token_count: 78279
candidates_token_count: 339
total_token_count: 78618

Appended job detail to DataFrame.
Processed job ID: 58440
Found 10 jobs on page 16


2024-11-26 18:09:53,786 - INFO - Completion time of gemini-1.5-flash-002: 7.773674964904785s


prompt_token_count: 78688
candidates_token_count: 457
total_token_count: 79145

Appended job detail to DataFrame.
Processed job ID: 58437


2024-11-26 18:10:17,839 - INFO - Completion time of gemini-1.5-flash-002: 21.84380006790161s


prompt_token_count: 79468
candidates_token_count: 1106
total_token_count: 80574

Appended job detail to DataFrame.
Processed job ID: 58409


2024-11-26 18:10:28,882 - INFO - Completion time of gemini-1.5-flash-002: 8.790726900100708s


prompt_token_count: 82836
candidates_token_count: 541
total_token_count: 83377

Appended job detail to DataFrame.
Processed job ID: 58398


2024-11-26 18:10:43,299 - INFO - Completion time of gemini-1.5-flash-002: 12.22181510925293s


prompt_token_count: 79004
candidates_token_count: 795
total_token_count: 79799

Appended job detail to DataFrame.
Processed job ID: 58419


2024-11-26 18:10:52,836 - INFO - Completion time of gemini-1.5-flash-002: 7.379441022872925s


prompt_token_count: 78465
candidates_token_count: 428
total_token_count: 78893

Appended job detail to DataFrame.
Processed job ID: 58388


2024-11-26 18:11:04,649 - INFO - Completion time of gemini-1.5-flash-002: 9.645277976989746s


prompt_token_count: 78938
candidates_token_count: 561
total_token_count: 79499

Appended job detail to DataFrame.
Processed job ID: 58381


2024-11-26 18:11:13,992 - INFO - Completion time of gemini-1.5-flash-002: 7.089282035827637s


prompt_token_count: 78433
candidates_token_count: 415
total_token_count: 78848

Appended job detail to DataFrame.
Processed job ID: 58354


2024-11-26 18:11:25,669 - INFO - Completion time of gemini-1.5-flash-002: 9.405666589736938s


prompt_token_count: 79635
candidates_token_count: 640
total_token_count: 80275

Appended job detail to DataFrame.
Processed job ID: 58340


2024-11-26 18:11:37,009 - INFO - Completion time of gemini-1.5-flash-002: 8.935020685195923s


prompt_token_count: 78628
candidates_token_count: 484
total_token_count: 79112

Appended job detail to DataFrame.
Processed job ID: 58332


2024-11-26 18:11:47,969 - INFO - Completion time of gemini-1.5-flash-002: 8.758975982666016s


prompt_token_count: 78645
candidates_token_count: 520
total_token_count: 79165

Appended job detail to DataFrame.
Processed job ID: 58328
Found 10 jobs on page 17


2024-11-26 18:12:01,651 - INFO - Completion time of gemini-1.5-flash-002: 7.192215919494629s


prompt_token_count: 78424
candidates_token_count: 372
total_token_count: 78796

Appended job detail to DataFrame.
Processed job ID: 58309


2024-11-26 18:12:11,476 - INFO - Completion time of gemini-1.5-flash-002: 7.628685235977173s


prompt_token_count: 78693
candidates_token_count: 475
total_token_count: 79168

Appended job detail to DataFrame.
Processed job ID: 58291


2024-11-26 18:12:21,549 - INFO - Completion time of gemini-1.5-flash-002: 7.732708930969238s


prompt_token_count: 78758
candidates_token_count: 484
total_token_count: 79242

Appended job detail to DataFrame.
Processed job ID: 58268


2024-11-26 18:12:32,471 - INFO - Completion time of gemini-1.5-flash-002: 8.549067974090576s


prompt_token_count: 78550
candidates_token_count: 532
total_token_count: 79082

Appended job detail to DataFrame.
Processed job ID: 58254


2024-11-26 18:12:46,740 - INFO - Completion time of gemini-1.5-flash-002: 11.99820590019226s


prompt_token_count: 79358
candidates_token_count: 501
total_token_count: 79859

Appended job detail to DataFrame.
Processed job ID: 58246


2024-11-26 18:12:56,825 - INFO - Completion time of gemini-1.5-flash-002: 7.397046089172363s


prompt_token_count: 78483
candidates_token_count: 349
total_token_count: 78832

Appended job detail to DataFrame.
Processed job ID: 58234


2024-11-26 18:13:06,404 - INFO - Completion time of gemini-1.5-flash-002: 7.35339617729187s


prompt_token_count: 78595
candidates_token_count: 415
total_token_count: 79010

Appended job detail to DataFrame.
Processed job ID: 58231


2024-11-26 18:13:17,734 - INFO - Completion time of gemini-1.5-flash-002: 9.164103746414185s


prompt_token_count: 80504
candidates_token_count: 561
total_token_count: 81065

Appended job detail to DataFrame.
Processed job ID: 58198


2024-11-26 18:13:27,803 - INFO - Completion time of gemini-1.5-flash-002: 7.738248348236084s


prompt_token_count: 78530
candidates_token_count: 437
total_token_count: 78967

Appended job detail to DataFrame.
Processed job ID: 58197


2024-11-26 18:13:36,672 - INFO - Completion time of gemini-1.5-flash-002: 6.646711111068726s


prompt_token_count: 78326
candidates_token_count: 370
total_token_count: 78696

Appended job detail to DataFrame.
Processed job ID: 58192
Found 10 jobs on page 18


2024-11-26 18:13:52,109 - INFO - Completion time of gemini-1.5-flash-002: 9.216200828552246s


prompt_token_count: 79279
candidates_token_count: 606
total_token_count: 79885

Appended job detail to DataFrame.
Processed job ID: 58189


2024-11-26 18:14:08,317 - INFO - Completion time of gemini-1.5-flash-002: 13.70457410812378s


prompt_token_count: 78461
candidates_token_count: 359
total_token_count: 78820

Appended job detail to DataFrame.
Processed job ID: 58161


2024-11-26 18:14:20,670 - INFO - Completion time of gemini-1.5-flash-002: 10.153227090835571s


prompt_token_count: 81160
candidates_token_count: 708
total_token_count: 81868

Appended job detail to DataFrame.
Processed job ID: 58153


2024-11-26 18:14:30,279 - INFO - Completion time of gemini-1.5-flash-002: 7.401612043380737s


prompt_token_count: 82854
candidates_token_count: 401
total_token_count: 83255

Appended job detail to DataFrame.
Processed job ID: 58122


2024-11-26 18:14:41,926 - INFO - Completion time of gemini-1.5-flash-002: 9.476279020309448s


prompt_token_count: 78876
candidates_token_count: 440
total_token_count: 79316

Appended job detail to DataFrame.
Processed job ID: 58135


2024-11-26 18:14:51,940 - INFO - Completion time of gemini-1.5-flash-002: 7.734309196472168s


prompt_token_count: 78405
candidates_token_count: 455
total_token_count: 78860

Appended job detail to DataFrame.
Processed job ID: 58104


2024-11-26 18:15:02,774 - INFO - Completion time of gemini-1.5-flash-002: 8.539802312850952s


prompt_token_count: 78586
candidates_token_count: 512
total_token_count: 79098

Appended job detail to DataFrame.
Processed job ID: 58080


2024-11-26 18:15:13,069 - INFO - Completion time of gemini-1.5-flash-002: 8.052308082580566s


prompt_token_count: 79179
candidates_token_count: 526
total_token_count: 79705

Appended job detail to DataFrame.
Processed job ID: 58072


2024-11-26 18:15:23,974 - INFO - Completion time of gemini-1.5-flash-002: 8.693143129348755s


prompt_token_count: 78841
candidates_token_count: 498
total_token_count: 79339

Appended job detail to DataFrame.
Processed job ID: 58077


2024-11-26 18:15:34,801 - INFO - Completion time of gemini-1.5-flash-002: 8.653696060180664s


prompt_token_count: 78483
candidates_token_count: 403
total_token_count: 78886

Appended job detail to DataFrame.
Processed job ID: 58006
Found 10 jobs on page 19


2024-11-26 18:15:48,194 - INFO - Completion time of gemini-1.5-flash-002: 7.168761968612671s


prompt_token_count: 79070
candidates_token_count: 397
total_token_count: 79467

Appended job detail to DataFrame.
Processed job ID: 57988


2024-11-26 18:16:05,073 - INFO - Completion time of gemini-1.5-flash-002: 14.717318296432495s


prompt_token_count: 78441
candidates_token_count: 386
total_token_count: 78827

Appended job detail to DataFrame.
Processed job ID: 57926


2024-11-26 18:16:19,823 - INFO - Completion time of gemini-1.5-flash-002: 12.404889822006226s


prompt_token_count: 80716
candidates_token_count: 589
total_token_count: 81305

Appended job detail to DataFrame.
Processed job ID: 57915


2024-11-26 18:16:29,961 - INFO - Completion time of gemini-1.5-flash-002: 7.970347166061401s


prompt_token_count: 78693
candidates_token_count: 511
total_token_count: 79204

Appended job detail to DataFrame.
Processed job ID: 57879


2024-11-26 18:16:39,812 - INFO - Completion time of gemini-1.5-flash-002: 7.667087078094482s


prompt_token_count: 78675
candidates_token_count: 388
total_token_count: 79063

Appended job detail to DataFrame.
Processed job ID: 57873


2024-11-26 18:16:49,119 - INFO - Completion time of gemini-1.5-flash-002: 6.983450889587402s


prompt_token_count: 78768
candidates_token_count: 368
total_token_count: 79136

Appended job detail to DataFrame.
Processed job ID: 57858


2024-11-26 18:16:58,505 - INFO - Completion time of gemini-1.5-flash-002: 7.178839683532715s


prompt_token_count: 78302
candidates_token_count: 386
total_token_count: 78688

Appended job detail to DataFrame.
Processed job ID: 57843


2024-11-26 18:17:11,900 - INFO - Completion time of gemini-1.5-flash-002: 11.121387004852295s


prompt_token_count: 78569
candidates_token_count: 841
total_token_count: 79410

Appended job detail to DataFrame.
Processed job ID: 57836


2024-11-26 18:17:23,800 - INFO - Completion time of gemini-1.5-flash-002: 9.613389730453491s


prompt_token_count: 86881
candidates_token_count: 680
total_token_count: 87561

Appended job detail to DataFrame.
Processed job ID: 57835


2024-11-26 18:17:35,080 - INFO - Completion time of gemini-1.5-flash-002: 9.040120124816895s


prompt_token_count: 78618
candidates_token_count: 464
total_token_count: 79082

Appended job detail to DataFrame.
Processed job ID: 57832
Found 10 jobs on page 20


2024-11-26 18:17:48,564 - INFO - Completion time of gemini-1.5-flash-002: 6.833628177642822s


prompt_token_count: 78719
candidates_token_count: 374
total_token_count: 79093

Appended job detail to DataFrame.
Processed job ID: 57815


2024-11-26 18:17:58,589 - INFO - Completion time of gemini-1.5-flash-002: 7.869045972824097s


prompt_token_count: 80062
candidates_token_count: 514
total_token_count: 80576

Appended job detail to DataFrame.
Processed job ID: 57806


2024-11-26 18:18:08,786 - INFO - Completion time of gemini-1.5-flash-002: 7.872179985046387s


prompt_token_count: 79301
candidates_token_count: 488
total_token_count: 79789

Appended job detail to DataFrame.
Processed job ID: 57789


2024-11-26 18:18:19,260 - INFO - Completion time of gemini-1.5-flash-002: 7.577703237533569s


prompt_token_count: 78857
candidates_token_count: 454
total_token_count: 79311

Appended job detail to DataFrame.
Processed job ID: 57740


2024-11-26 18:18:29,493 - INFO - Completion time of gemini-1.5-flash-002: 7.910138130187988s


prompt_token_count: 80971
candidates_token_count: 459
total_token_count: 81430

Appended job detail to DataFrame.
Processed job ID: 57730


2024-11-26 18:18:43,500 - INFO - Completion time of gemini-1.5-flash-002: 11.498727798461914s


prompt_token_count: 78930
candidates_token_count: 874
total_token_count: 79804

Appended job detail to DataFrame.
Processed job ID: 57721


2024-11-26 18:18:56,653 - INFO - Completion time of gemini-1.5-flash-002: 10.711904048919678s


prompt_token_count: 78759
candidates_token_count: 460
total_token_count: 79219

Appended job detail to DataFrame.
Processed job ID: 57706


2024-11-26 18:19:07,392 - INFO - Completion time of gemini-1.5-flash-002: 8.353074789047241s


prompt_token_count: 80881
candidates_token_count: 518
total_token_count: 81399

Appended job detail to DataFrame.
Processed job ID: 43915


2024-11-26 18:19:16,662 - INFO - Completion time of gemini-1.5-flash-002: 7.095476865768433s


prompt_token_count: 78930
candidates_token_count: 391
total_token_count: 79321

Appended job detail to DataFrame.
Processed job ID: 43194


2024-11-26 18:19:27,841 - INFO - Completion time of gemini-1.5-flash-002: 8.476579904556274s


prompt_token_count: 78931
candidates_token_count: 568
total_token_count: 79499

Appended job detail to DataFrame.
Processed job ID: 57671
Data saved to CSV.
Data saved to test_topjob.csv


In [35]:
# Bind the scraper's results table to a short name for the cleaning cells below.
# This is a reference, not a copy (assuming `df` is the plain attribute set in
# JobScraperr.__init__), so in-place edits through `df` also change `scraper.df`.
df = scraper.df


In [38]:

# Collapse every "nothing specified" marker into the single label
# "No requirement". Only cells whose whole value equals one of these strings
# are touched, and the frame is edited in place.
placeholders = [
    'Not available',
    'No requirement',
    'Not required',
    "No experience",
    "No degree",
    "No certificate",
    "not available",
]
df.replace(to_replace=placeholders, value="No requirement", inplace=True)

def clean_text(text):
    """Return ``text`` as a lower-cased, single-line string.

    Values that ``pd.isna`` reports as missing become the empty string.
    Anything else is converted with ``str``; each newline and tab is replaced
    by one space, surrounding whitespace is stripped, and the result is
    lower-cased.
    """
    if not pd.isna(text):
        # One pass that swaps both line-breaking characters for a space.
        flattened = str(text).translate({ord("\n"): " ", ord("\t"): " "})
        return flattened.strip().lower()
    return ""

# Free-text requirement columns that get normalised with clean_text.
# NOTE(review): the scraper's DataFrame schema spells the technical column
# "Candidate_technical_Requirements" (no "_skill_") - confirm which name the
# scraped rows actually carry; a name that is absent from df is skipped silently.
columns_to_clean = [
    "Candidate_Experience_Requirements",
    "Candidate_soft_skill_Requirements",
    "Candidate_technical_skill_Requirements",
    "Candidate_degree_Requirements"
]

for column in [name for name in columns_to_clean if name in df.columns]:
    df[column] = df[column].apply(clean_text)

# Replace spaces with dashes in the date columns that are present in df.
date_columns = ["Job_Date-open", "Job_Date-end"]
for column in (name for name in date_columns if name in df.columns):
    df[column] = df[column].str.replace(" ", "-", regex=False)

def extract_min_salary(salary):
    """Return the first whole number found in a salary value.

    The value is split on whitespace; the first token that consists only of
    digits once thousands-separator commas are removed is returned as ``int``.

    Parameters
    ----------
    salary : object
        A single cell of the salary column. Non-string values (for example an
        integer read back from a file) are converted with ``str`` first.

    Returns
    -------
    int or pd.NA
        The first numeric token, or ``pd.NA`` when the value is missing or
        contains no such token.
    """
    if pd.isna(salary):
        return pd.NA
    # str() so numeric cells do not fail on .split(); return on the first hit
    # instead of converting every token.
    for token in str(salary).split():
        digits = token.replace(',', '')
        if digits.isdigit():
            return int(digits)
    return pd.NA

# Derive the nullable-integer "Salary_Min" column from the raw "Salary" text;
# anything that cannot be read as a number ends up missing.
first_salary_figure = df['Salary'].apply(extract_min_salary)
df['Salary_Min'] = pd.to_numeric(first_salary_figure, errors='coerce').astype('Int64')


In [33]:
# index=False keeps the positional index out of the file, matching the
# scraper's own to_csv call; otherwise read_csv brings it back as an extra
# "Unnamed: 0" column.
df.to_csv("topjob.csv", index=False)

In [10]:
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import os

# Set the tokenizers parallelism flag to "false" before the tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the tokenizer and encoder from the same checkpoint; the encoder is put
# in eval mode because it is only used for inference here.
EMBED_CHECKPOINT = "BAAI/llm-embedder"
tokenizer = AutoTokenizer.from_pretrained(EMBED_CHECKPOINT)
model_embed = AutoModel.from_pretrained(EMBED_CHECKPOINT).eval()

def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Select one hidden-state vector per row: the one at the last attended position.

    Parameters
    ----------
    last_hidden_states : torch.Tensor
        Indexed as ``[row, position, ...]``.
    attention_mask : torch.Tensor
        Indexed as ``[row, position]``; its per-row sum is used as the number
        of attended positions.

    Returns
    -------
    torch.Tensor
        ``last_hidden_states[:, -1]`` when the final mask column sums to the
        number of rows (every row ends on a real token); otherwise, for each
        row, the vector at index ``mask_row.sum() - 1``.
    """
    row_count = attention_mask.shape[0]
    ends_on_real_token = attention_mask[:, -1].sum().item() == row_count
    if not ends_on_real_token:
        # Padding sits on the right: index each row at its own last position.
        last_positions = attention_mask.sum(dim=1) - 1
        row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
        return last_hidden_states[row_index, last_positions]
    return last_hidden_states[:, -1]

def embed_text(text):
    """Embed one requirement text, or return the sentinel ``"no requirement"``.

    Parameters
    ----------
    text : object
        A single cell value. Non-string values are converted with ``str``.

    Returns
    -------
    str or list
        ``"no requirement"`` when ``text`` is missing or equals that phrase
        (ignoring case and surrounding whitespace). Otherwise the L2-normalised
        pooled embedding converted with ``tolist()``, which keeps the leading
        batch dimension, so the result is a nested list.
    """
    if pd.isna(text):
        return "no requirement"
    # Coerce once, up front: the tokenizer call already used str(text), but the
    # placeholder check called .lower() on the raw value and failed on
    # non-string cells.
    text = str(text)
    if text.lower().strip() == "no requirement":
        return "no requirement"
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        out = model_embed(**inputs)
        # NOTE(review): this pools the last token. Confirm it matches the pooling
        # the BAAI/llm-embedder model card recommends and the pooling used for
        # the vectors these embeddings are compared against.
        embeddings = last_token_pool(out.last_hidden_state, inputs['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings.tolist()

# Location of the scraped-jobs CSV. Set the TOPJOB_CSV environment variable to
# point at another copy; the original machine-specific path stays as the default.
file_path = os.environ.get(
    "TOPJOB_CSV",
    '/Users/duongphuonggiang/Documents/VietCvProcessor/crawl/topjob.csv',
)
df = pd.read_csv(file_path)

In [15]:
import os
from tqdm import tqdm
import pandas as pd
import json

# Turn every "nothing specified" marker into a missing value (pd.NA), in place.
# Only cells whose whole value equals one of these strings are replaced.
placeholders = [
    'Not available',
    'No requirement',
    'Not required',
    "No experience",
    "No degree",
    "No certificate",
]
df.replace(to_replace=placeholders, value=pd.NA, inplace=True)

def generate_embedding_for_column(df, col_name):
    """Apply ``embed_text`` to every value of ``df[col_name]`` with a progress bar.

    The bar is labelled ``"Embedding <col_name>"``. Returns the Series produced
    by ``progress_apply``; ``df`` itself is not modified here.
    """
    progress_label = f"Embedding {col_name}"
    # Registers progress_apply on pandas objects with this label.
    tqdm.pandas(desc=progress_label)
    column_values = df[col_name]
    return column_values.progress_apply(embed_text)

# Add one "<column>_embedding" column per requirement column, in this order.
for requirement_column in (
    "Candidate_Experience_Requirements",
    "Candidate_soft_skill_Requirements",
    "Candidate_technical_skill_Requirements",
    "Candidate_degree_Requirements",
):
    df[f"{requirement_column}_embedding"] = generate_embedding_for_column(df, requirement_column)

def _json_safe(value):
    """Map a scalar missing value (``None``, ``NaN``, ``pd.NA``) to ``None``; return anything else unchanged."""
    # Only scalars are tested: embedding cells hold lists, and pd.isna on a
    # list returns an array that cannot be used in an ``if``.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def _cell(row, column, default=None):
    """Read ``column`` from ``row`` (``default`` if the column is absent), with missing values mapped to ``None``."""
    return _json_safe(row.get(column, default))


def create_mappings(df):
    """Build one mapping record per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Job table. Columns are looked up by name with ``row.get``, so a column
        that does not exist yields ``None`` (or ``"no requirement"`` for the
        four embedding fields).

    Returns
    -------
    list of dict
        One ``{"mappings": {"properties": job_info}}`` entry per row, in row
        order. Scalar missing cells (``NaN`` / ``pd.NA``) are emitted as
        ``None`` so the records can be handed to ``json``: ``pd.NA`` is not
        serialisable and ``NaN`` is written as a non-standard token.
    """
    mappings = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating mappings"):
        job_info = {
            "JOB_ID": _cell(row, "Job ID"),
            "location": _cell(row, "Job_Location"),
            "date_opened": _cell(row, "Job_Date-open"),
            "deadline": _cell(row, "Job_Date-end"),
            "title": _cell(row, "Title"),
            # NOTE(review): "description" is filled from the experience
            # requirements column - confirm this is the intended source.
            "description": _cell(row, "Candidate_Experience_Requirements"),
            # NOTE(review): companyID and sectorID both reuse "Job ID" -
            # presumably placeholders; verify against the consumer of this data.
            "companyID": _cell(row, "Job ID"),
            "job information": {
                "experience_embedding": _cell(row, "Candidate_Experience_Requirements_embedding", "no requirement"),
                "soft_skill_embedding": _cell(row, "Candidate_soft_skill_Requirements_embedding", "no requirement"),
                "technical_skill_embedding": _cell(row, "Candidate_technical_skill_Requirements_embedding", "no requirement"),
                "degree_embedding": _cell(row, "Candidate_degree_Requirements_embedding", "no requirement")
            },
            "workingType": _cell(row, "Job_Type"),
            "workingTime": _cell(row, "Job_Category"),
            "salary_month": _cell(row, "Salary_Min"),
            "sectorID": _cell(row, "Job ID"),
        }
        mappings.append({"mappings": {"properties": job_info}})

    return mappings

job_mappings = create_mappings(df)

# Make sure the output folder exists, then derive the target path inside it.
# (Nothing is written to the file within this span.)
output_dir = 'crawl'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'job_mappings.json')


Embedding Candidate_Experience_Requirements:   0%|          | 0/200 [00:00<?, ?it/s]

Embedding Candidate_Experience_Requirements:  16%|█▌        | 31/200 [00:02<00:13, 12.57it/s]


KeyboardInterrupt: 