In [14]:
# !source my_env/bin/activate

In [2]:
! python -V

Python 3.10.14


In [3]:
# Install the packages
! pip3 install --upgrade --quiet google-cloud-aiplatform
! pip install google-cloud-discoveryengine



In [4]:
# Setup project ID and location
PROJECT_ID = "cacafly-ml-specialization"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [5]:
import vertexai
from vertexai.language_models import CodeGenerationModel

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [6]:
import sys
import json
import os
import vertexai
from typing import Dict, List, Optional, Tuple
from google.cloud import discoveryengine
from google.protobuf.json_format import MessageToDict

### Set Up Vertex AI Search Engine and Get Search Engine Id


在 Agent Builder 上建立一個 Data Store, index websites 並且複製 engine_id

網站使用範例提供的：support.google.com/google-ads/*

In [7]:
search_engine_id = "demo3_1733212957986"
serving_config_id = "default_config"

### Import dataset and related package (NEW)

data source: https://www.cs.cornell.edu/people/pabo/movie-review-data/

ref code source: https://www.kaggle.com/code/esraaaabdelrazek/moviereviewclassification/notebook

In [8]:

import tarfile

import numpy as np 
import pandas as pd 
from sklearn.datasets import load_files
import nltk 
from nltk.corpus import stopwords

In [9]:
movie_data = load_files('txt_sentoken/')
X , y = movie_data.data , movie_data.target

In [10]:
X[0]

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so called dark thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( robin tunney ) which happens every 1000 years , and basically destroy the world , but apparently god has chosen one man , and that one man is jericho cane ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollack ) , they will stop at nothing to let the devil take over the world ! \nparts of this are actually so absurd , that they would fit right in with dogma . \nyes , the film is 

## split review to 2 paragraph and combine randomly

In [20]:
import json
import random
from tqdm import tqdm


def split_review(text, min_split=5):
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    
    paragraphs = [p.strip() for p in text.strip().split('.') if p.strip()]
    if len(paragraphs) < min_split:
        mid = len(paragraphs) // 2
    else:
        total_len = sum(len(p) for p in paragraphs)
        acc_len = 0
        for i, p in enumerate(paragraphs):
            acc_len += len(p)
            if acc_len >= total_len / 2:
                mid = i + 1
                break

    return "\n".join(paragraphs[:mid]), "\n".join(paragraphs[mid:])

def split_review_random_choose_sentence(text, min_split=5, sample_ratio=0.4):
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # 切段落
    paragraphs = [p.strip() for p in text.strip().split('.') if p.strip()]

    # 分前後段
    if len(paragraphs) < min_split:
        mid = len(paragraphs) // 2
    else:
        total_len = sum(len(p) for p in paragraphs)
        acc_len = 0
        for i, p in enumerate(paragraphs):
            acc_len += len(p)
            if acc_len >= total_len / 2:
                mid = i + 1
                break

    # 分句
    front_sents = paragraphs[:mid]
    back_sents = paragraphs[mid:]


    # 各段要抽的句數（保底抽 1 句）
    n_front = max(1, round(len(front_sents) * sample_ratio)) if front_sents else 0
    n_back = max(1, round(len(back_sents) * sample_ratio)) if back_sents else 0
    
    # 隨機抽 index 並保持順序
    front_idx = sorted(random.sample(range(len(front_sents)), n_front)) if n_front > 0 else []
    back_idx = sorted(random.sample(range(len(back_sents)), n_back)) if n_back > 0 else []

    selected_front = [front_sents[i] for i in front_idx]
    selected_back = [back_sents[i] for i in back_idx]


    return "\n".join(selected_front), "\n".join(selected_back)

def build_finetune_data(X, y, train_path="train.jsonl", valid_path="eval.jsonl", sample_size_each=500):
    pos_reviews = [r for r, label in zip(X, y) if label == 1]
    neg_reviews = [r for r, label in zip(X, y) if label == 0]

    assert len(pos_reviews) >= sample_size_each * 2, "Not enough positive reviews"
    assert len(neg_reviews) >= sample_size_each * 2, "Not enough negative reviews"

    pos_yes = random.sample(pos_reviews, sample_size_each) #[:50]
    neg_yes = random.sample(neg_reviews, sample_size_each) #[:50]

    yes_samples = []
    
    print('yes samples...')
    
    for review in tqdm(pos_yes + neg_yes):

        a, b = split_review_random_choose_sentence(review)

        prompt = f"Paragraph A: {a}\nParagraph B: {b}\n\nAre these paragraphs from the same movie review? Answer yes or no."
        yes_samples.append({
            "contents": [
                {"role": "user", "parts": [{"text": prompt}]},
                {"role": "model", "parts": [{"text": "yes"}]}
            ]
        })

    pos_no_pool = [r for r in pos_reviews if r not in pos_yes]
    neg_no_pool = [r for r in neg_reviews if r not in neg_yes]
    combined_no_pool = pos_no_pool + neg_no_pool
    random.shuffle(combined_no_pool)

    print('no samples...')
    
    paras = [split_review_random_choose_sentence(r) for r in tqdm(combined_no_pool)]

    paras_a = [p[0] for p in paras]
    paras_b = [p[1] for p in paras]
    random.shuffle(paras_b)

    no_samples = []
    for a, b in zip(paras_a, paras_b):
        if a.strip() != b.strip():
            prompt = f"Paragraph A: {a}\nParagraph B: {b}\n\nAre these paragraphs from the same movie review? Answer yes or no."
            no_samples.append({
                "contents": [
                    {"role": "user", "parts": [{"text": prompt}]},
                    {"role": "model", "parts": [{"text": "no"}]}
                ]
            })

    all_samples = yes_samples + no_samples
    random.shuffle(all_samples)

    split_idx = int(len(all_samples) * 0.7)
    train_set = all_samples[:split_idx]
    valid_set = all_samples[split_idx:]

    with open(train_path, "w", encoding="utf-8") as f:
        for item in train_set:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    with open(valid_path, "w", encoding="utf-8") as f:
        for item in valid_set:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"✅ Train set: {len(train_set)} samples -> {train_path}")
    print(f"✅ Validation set: {len(valid_set)} samples -> {valid_path}")


## import data with removed names

In [21]:

import json

# JSONL 檔案路徑
file_path = 'remove_name_comments.jsonl'

data = []

# 讀取每一行 JSON
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        obj = json.loads(line)
        try:
            # 安全取得 id
            id_val = int(obj["id"])

            # 嘗試從 content.parts 取得文字
            content = obj["response"]["candidates"][0]["content"]
            if isinstance(content, dict) and "parts" in content:
                text = content["parts"][0]["text"]
            elif isinstance(content, str):
                text = content
            else:
                text = "(無法解析內容)"

            data.append({"id": id_val, "text": text})

        except Exception as e:
            print(f"跳過一筆資料（可能格式異常）：{e}")

# 根據 id 排序
data_sorted = sorted(data, key=lambda x: x["id"])

# 輸出清理後的 text
# for item in data_sorted[:10]:
#     print(item)
#     print("\n---\n")  # 可選：加分隔線區分段落


In [22]:
new_X = [item["text"] for item in data_sorted]


In [23]:

build_finetune_data(new_X, y, train_path="train_rv_name_random_sent.jsonl", valid_path="eval_rv_name_random_sent.jsonl")


yes samples...


100%|██████████| 1000/1000 [00:00<00:00, 29781.48it/s]


no samples...


100%|██████████| 1000/1000 [00:00<00:00, 33191.45it/s]

✅ Train set: 1400 samples -> train_rv_name_random_sent_0701.jsonl
✅ Validation set: 600 samples -> eval_rv_name_random_sent_0701.jsonl





In [24]:
import json

from vertexai.generative_models import GenerativeModel
from vertexai import generative_models

from google import genai
from google.genai.types import GenerateContentConfig, ThinkingConfig



model = GenerativeModel("gemini-2.0-flash", 
                        safety_settings=[
                        generative_models.SafetySetting(
                            category=generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                            threshold=generative_models.HarmBlockThreshold.BLOCK_NONE
                        ),
                        generative_models.SafetySetting(
                            category=generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                            threshold=generative_models.HarmBlockThreshold.BLOCK_NONE
                        ),
                    ]
        )


# 設定檔案路徑
jsonl_path = "eval_rv_name_random_sent.jsonl"


# 讀取前 10 筆資料
samples = []
answers = []

corr = 0
eval_num = 600

with open(jsonl_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):

        if i >= eval_num:
            break
        data = json.loads(line)
        prompt = data["contents"][0]["parts"][0]["text"] 
        answer = data["contents"][1]["parts"][0]["text"]
        samples.append(prompt)
        answers.append(answer)

from tqdm import tqdm

client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)
# 執行 Gemini 推論
for idx, prompt in enumerate(tqdm(samples)):
    # print(f"--- Sample {idx} ---")
    response = model.generate_content(prompt)

    try:
        if 'no' in response.text.strip().lower() and answers[idx] == 'no':
            corr += 1
        elif 'yes' in response.text.strip().lower() and answers[idx] == 'yes':
            corr += 1
    except:
        pass

print('Acc', corr/len(samples))
# print(answers, len(answers))

100%|██████████| 600/600 [03:25<00:00,  2.92it/s]

Acc 0.8633333333333333





## Start Fine-tuning (based on Gemini-2.0-flash)

In [83]:

import time

# import vertexai
from vertexai.tuning import sft

# TODO(developer): Update and un-comment below line
# PROJECT_ID = "your-project-id"
# vertexai.init(project=PROJECT_ID, location=LOCATION)

sft_tuning_job = sft.train(
    source_model="gemini-2.0-flash-001",
    train_dataset="gs://cacafly-ml-specialization-dataset/movie_review/train_rv_name_random_sent.jsonl",

    epochs=5,
    adapter_size=4,
    learning_rate_multiplier=3.0,
    tuned_model_display_name="tuned_rv_name_random_sent_gemini_2.0-flash",
)

# Polling for job completion
while not sft_tuning_job.has_ended:
    time.sleep(60)
    sft_tuning_job.refresh()

print(sft_tuning_job.tuned_model_name)
print(sft_tuning_job.tuned_model_endpoint_name)
print(sft_tuning_job.experiment)


Creating SupervisedTuningJob
SupervisedTuningJob created. Resource name: projects/765271398193/locations/us-central1/tuningJobs/4672263491300622336
To use this SupervisedTuningJob in another session:
tuning_job = sft.SupervisedTuningJob('projects/765271398193/locations/us-central1/tuningJobs/4672263491300622336')
View Tuning Job:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning/tuningJob/4672263491300622336?project=765271398193


projects/765271398193/locations/us-central1/models/8021864212928135168@1
projects/765271398193/locations/us-central1/endpoints/395310714069188608
<google.cloud.aiplatform.metadata.experiment_resources.Experiment object at 0x7f2774ae41f0>


In [84]:
pass

## Evaluate the fine-tuned model

In [26]:

import time

# import vertexai
from vertexai.tuning import sft
from vertexai.generative_models import GenerativeModel

sft_tuning_job = sft.SupervisedTuningJob("projects/765271398193/locations/us-central1/tuningJobs/8472738626847899648") # rv name and choose 40% sentence

tuned_model = GenerativeModel(sft_tuning_job.tuned_model_endpoint_name)

import json

from vertexai.generative_models import GenerativeModel


jsonl_path = "eval_rv_name_random_sent.jsonl"

# 讀取前 10 筆資料
samples = []
answers = []

corr = 0
eval_num = 600

with open(jsonl_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= eval_num:
            break
        data = json.loads(line)
        # 假設你要測試的 prompt 在 contents[0]["parts"][0]["text"]
        prompt = data["contents"][0]["parts"][0]["text"]
        answer = data["contents"][1]["parts"][0]["text"]
        samples.append(prompt)
        answers.append(answer)

from tqdm import tqdm

# 執行 Gemini 推論
for idx, prompt in enumerate(tqdm(samples)):
    response = tuned_model.generate_content(prompt)
    try:
        if 'no' in response.text.strip().lower() and answers[idx] == 'no':
            corr += 1
        elif 'yes' in response.text.strip().lower() and answers[idx] == 'yes':
            corr += 1
    except:
        pass

print('Acc', corr/len(samples))

100%|██████████| 600/600 [03:08<00:00,  3.19it/s]

Acc 0.95



