In [1]:
import os
from getpass import getpass

from google.colab import drive
from huggingface_hub import login

# Mount Google Drive so the notebook can read and write files under /content/drive.
drive.mount('/content/drive')

# Never store an access token in the notebook itself: anyone who can read the
# file (or its history) can use it. Take it from the HF_TOKEN environment
# variable when set, otherwise prompt for it without echoing.
# NOTE(review): any token that was previously committed in this cell should be
# revoked in the Hugging Face account settings.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

login(token=HUGGINGFACE_TOKEN)

Mounted at /content/drive


In [2]:
!pip install transformers==4.43.0 accelerate

# Download Microsoft's Phi-3-mini-4k-instruct model
!huggingface-cli download microsoft/Phi-3-mini-4k-instruct --local-dir /content/Phi-3-mini-4k-instruct

Collecting transformers==4.43.0
  Downloading transformers-4.43.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.43.0)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux201

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os

# Define model path
model_id = "/content/Phi-3-mini-4k-instruct"

# Fix config.json to handle any potential issues (e.g., rope_scaling).
# NOTE: this rewrites the downloaded config.json in place on every run.
config_path = os.path.join(model_id, "config.json")
if os.path.exists(config_path):
    with open(config_path, "r") as f:
        config = json.load(f)
    # Ensure rope_scaling is properly formatted or removed
    if "rope_scaling" in config and config["rope_scaling"] is not None:
        if "type" not in config["rope_scaling"]:
            config["rope_scaling"]["type"] = "linear"  # Default to linear scaling
        config["rope_scaling"]["factor"] = config["rope_scaling"].get("factor", 1.0)
    else:
        config["rope_scaling"] = None  # Disable rope_scaling if not needed
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set pad_token_id to eos_token_id if not already set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the Phi model directly onto the GPU. device_map="cuda" already places
# the weights there, so no extra .to("cuda") call is needed afterwards.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",  # Explicitly place model on GPU
    trust_remote_code=True,
    torch_dtype=torch.float16,  # Optimize for T4 GPU
)

# Prepare input
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"}
]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
input_ids = inputs.to(model.device)
# The prompt is a single, unpadded sequence, so every position is a real token.
# Deriving the mask from `input_ids != pad_token_id` would hide genuine prompt
# tokens whenever the pad id also occurs in the prompt (e.g. when the pad token
# falls back to EOS).
attention_mask = torch.ones_like(input_ids)

# Generate output with keyword arguments and sampling enabled
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=256,
    temperature=0.6,
    top_p=0.9,
    do_sample=True  # Enable sampling for temperature and top_p
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


You are a pirate chatbot who always responds in pirate speak! Who are you? Yarrr, me hearties! I be a digital buccaneer, here to assist ye in yer quest for knowledge and information. I be a pirate chatbot, ready to parley with ye in the tongue of the sea dogs!


In [4]:
import pandas as pd
import re

def remove_first_number_group(text):
    """Remove the first phone-number-like digit group from ``text``.

    A "number group" is the first match of either:
      1. 2-5 whitespace-separated blocks of digits, or
      2. a single run of 8 or more digits.

    The text before and after the group is joined with exactly one space.
    When the group sits at the very start or end of the text, no stray
    leading/trailing space is left behind.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with the first number group removed, or ``text``
        unchanged when no group is found.
    """
    match = re.search(r'((\d+\s+){1,4}\d+|\d{8,})', text)
    if match is None:
        return text

    before = text[:match.start()].rstrip()
    after = text[match.end():].lstrip()
    # Join only the non-empty sides so a group at the start or end of the text
    # does not leave a dangling space.
    return ' '.join(part for part in (before, after) if part)

# Load the preprocessed CV data from Drive.
file_path = '/content/drive/MyDrive/preprocessed_cv_text_data.csv'
df = pd.read_csv(file_path)

# Build the text used downstream: category + processed text, with hyphens
# turned into spaces, whitespace collapsed, and the first phone-number-like
# digit group removed.
df['Combined_Text'] = (
    (df['Category'] + " " + df['Processed_Text'])
    .str.replace('-', ' ', regex=False)  # literal hyphen, no regex needed
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .apply(remove_first_number_group)
)

def chunk_text(text, max_length=1000, overlap=200):
    """Split ``text`` into fixed-size character chunks that overlap.

    Consecutive chunks start ``max_length - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    consisting only of already-covered overlap is produced.

    Args:
        text (str): Text to split.
        max_length (int): Maximum number of characters per chunk; must be > 0.
        overlap (int): Characters shared between neighbouring chunks; must
            satisfy ``0 <= overlap < max_length``.

    Returns:
        list[str]: The chunks in order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``max_length`` or ``overlap`` is out of range (an
            ``overlap >= max_length`` would otherwise never advance).
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    step = max_length - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + max_length
        chunks.append(text[start:end])
        if end >= text_length:
            # This chunk already covers the tail of the text; another chunk
            # would contain nothing but the overlap.
            break
        start += step
    return chunks

# One record per chunk: the CV's index label in df, the chunk's position within
# that CV, and the chunk text. Iterating the single column that is needed
# avoids iterrows(), which builds a full Series for every row.
all_chunks = [
    {'cv_id': idx, 'chunk_id': i, 'text': chunk}
    for idx, combined_text in df['Combined_Text'].items()
    for i, chunk in enumerate(chunk_text(combined_text))
]

# Create DataFrame containing chunks
chunk_df = pd.DataFrame(all_chunks)

# Persist the chunk table to Drive (the DataFrame index is written as the
# first, unnamed column).
chunk_df.to_csv('/content/drive/MyDrive/chunk_cv_text_data.csv')

In [11]:
# Preview the first rows of df, including the derived Combined_Text column.
df.head()

Unnamed: 0,Category,Text,Processed_Text,Combined_Text
0,-net-developer,Contact [email protected] +61 412 345 678 Skil...,61 412 345 678 skill c net framework aspnet sq...,net developer skill c net framework aspnet sql...
1,-net-developer,Contact [email protected] +55 (11) 98765-4321 ...,55 11 987654321 skill c net framework aspnet c...,net developer skill c net framework aspnet cor...
2,-net-developer,Contact [email protected] +39 02 1234 5678 Ski...,39 02 1234 5678 skill c net framework aspnet s...,net developer skill c net framework aspnet sql...
3,-net-web-developer,Contact [email protected] +55 11 91234-5678 Sk...,55 11 912345678 skill aspnet c mvc agile metho...,net web developer skill aspnet c mvc agile met...
4,-net-web-developer,Contact [email protected] +81 90-1234-5678 Ski...,81 9012345678 skill c net aspnet mvc sql serve...,net web developer skill c net aspnet mvc sql s...


In [8]:
def generate_query_from_document(cv_id, model, tokenizer, df):
    """
    Generate a concise search query representing the content of a document.

    The document text is wrapped in a fixed instruction prompt, rendered with
    the tokenizer's chat template, and passed to ``model.generate`` with
    sampling enabled (so results vary between calls). Only the newly generated
    tokens are decoded, so the returned string never contains the prompt.

    Args:
        cv_id (str or int): The ID of the document.
        model (transformers.PreTrainedModel): The language model for generation.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer associated with the model.
        df (pd.DataFrame): A DataFrame containing a 'Combined_Text' column indexed by cv_id.

    Returns:
        str: A concise search query (the stripped model completion).

    Raises:
        ValueError: If ``cv_id`` is not present in ``df``.
    """
    # Fetch the document content from the DataFrame
    try:
        contextual_document = df.loc[cv_id, 'Combined_Text']
    except KeyError as exc:
        raise ValueError(f"cv_id={cv_id} does not exist in df") from exc

    # Prompt used to generate the query
    # NOTE(review): the whole document is inserted without truncation;
    # presumably it fits in the model's context window - confirm for long CVs.
    prompt = f"""
You are a helpful assistant. Your task is to generate a short, relevant search query that captures the main idea or purpose of the following document. The query should be concise, focused, and help retrieve this document effectively in a search system. Return only the query, nothing else.

Document:
<document>
{contextual_document}
</document>

"""

    messages = [{"role": "user", "content": prompt}]

    # Tokenize and prepare the input. return_dict=True also yields the
    # attention mask that matches the rendered prompt; deriving it from
    # `input_ids != pad_token_id` would hide real prompt tokens whenever the
    # pad id occurs in the prompt (e.g. when pad falls back to EOS).
    encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True
    )
    # Follow the model's device instead of assuming "cuda".
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Generate output from the model
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=32,  # The query needs to be short
        do_sample=True,
        top_p=0.9,
        temperature=0.7
    )

    # The returned sequence starts with the prompt tokens; slice them off and
    # decode only the completion rather than searching the decoded text for
    # prompt markers (which breaks if a marker is missing or appears in the
    # document itself).
    prompt_length = input_ids.shape[-1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [9]:
# Try the query generator on a single document and print the resulting query.
cv_id = 0  # Example cv_id
response = generate_query_from_document(cv_id, model, tokenizer, df)
print(response)

"Senior Net Developer with ASP.NET and SQL Server Experience"


In [12]:
import os
import pandas as pd
from tqdm import tqdm

def generate_and_save_queries(df, model, tokenizer, output_csv_path='/content/drive/MyDrive/generated_queries.csv'):
    """
    Produce one search query per document in ``df`` and persist them to a CSV.

    The CSV doubles as a checkpoint: every finished document is appended
    immediately, and on a later call any ``cv_id`` already present in the file
    is skipped. A document whose generation raises is reported on stdout and
    skipped without being written, so it is retried on the next call.

    Args:
        df (pd.DataFrame): DataFrame with 'Combined_Text' and cv_id as index.
        model: Language model used for generation.
        tokenizer: Tokenizer for the model.
        output_csv_path (str): CSV file with columns ``cv_id`` and ``query``.

    Returns:
        pd.DataFrame: The contents of the CSV after the run.
    """
    # Either start a fresh file containing only the header row, or collect the
    # ids that an earlier run already finished.
    if not os.path.exists(output_csv_path):
        already_done = set()
        pd.DataFrame(columns=['cv_id', 'query']).to_csv(output_csv_path, index=False)
    else:
        already_done = set(pd.read_csv(output_csv_path)['cv_id'])

    # The index labels of df serve as the document ids.
    for doc_id in tqdm(df.index.tolist(), desc="Generating queries"):
        if doc_id in already_done:
            continue

        try:
            generated = generate_query_from_document(doc_id, model, tokenizer, df)
            record = [{'cv_id': doc_id, 'query': generated}]
            # Append right away so an interrupted run loses at most one row.
            pd.DataFrame(record).to_csv(output_csv_path, mode='a', header=False, index=False)
            already_done.add(doc_id)
        except Exception as err:
            # Best effort: report the failure and move on to the next document.
            print(f"Error processing cv_id={doc_id}: {err}")

    print(f"Queries saved to {output_csv_path}")
    return pd.read_csv(output_csv_path)


In [None]:
# Generate queries for every row of df and save them to Drive. Rows whose cv_id
# is already in the CSV are skipped, so re-running this cell resumes the job.
output_csv_path = '/content/drive/MyDrive/generated_queries.csv'
query_df = generate_and_save_queries(df, model, tokenizer, output_csv_path)
# Show the first few generated queries.
print(query_df.head())

Generating queries:  72%|███████▏  | 5916/8221 [2:02:21<51:34,  1.34s/it]