In [2]:
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

import torch

import numpy as np

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import BartTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq
from transformers import pipeline

In [4]:
NEWS_DATA_PATH = 'drive/My Drive/Colab Notebooks/期末/BBC News Summary/News Articles'
SUMMARIES_DATA_PATH = 'drive/My Drive/Colab Notebooks/期末/BBC News Summary/Summaries'

# 設定模型儲存路徑
MODEL_SAVE_PTH = 'drive/My Drive/Colab Notebooks/期末/5.五個分類_2個epoch_滑動平均損失和動態調整權重'

# 設定設備為 GPU (如果可用的話) 或 CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 設定輸入和目標(摘要)的最大長度
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128

# 設定批次大小
BATCH_SIZE = 8

# 定義情緒標籤
SENTIMENT_LABELS = ["happy", "sad", "warn", "angry", "sorrow", "alert", "neutral"]

# 建立情緒標籤到索引的對應字典
SENTIMENT_LABELS_MAP = {label: index for index, label in enumerate(SENTIMENT_LABELS)}



# 加入權重
summary_loss_weight = 0.5
sentiment_loss_weight = 0.5
prev_summary_loss = None
prev_sentiment_loss = None
avg_summary_loss = 0
avg_sentiment_loss = 0

**從root_path 加載數據**

根路徑下的每個子目錄(代表一個類別)，子目錄中的每個文件包含新聞內容。

返回一個包含所有新聞內容的 numpy 字符串數組。

News Articles：包含不同類別的新聞文章，每個類別是一個子資料夾，內有多個文件。

Summaries：包含每篇新聞文章的五個摘要。

pbar = tqdm(os.listdir(news_dir_path))：

這行程式碼將 news_dir_path 目錄中的所有文件名稱傳遞給 tqdm，以初始化進度條 pbar。在迴圈中使用 pbar 可以在迴圈每次迭代時更新進度條。



*   os.listdir(root_path) 返回根目錄下的所有子目錄名稱，假設每個子目錄代表一個類別。
*   os.path.join(root_path, cls) 獲取每個子目錄的完整路徑。


In [5]:
# 定義資料載入函數
def load_data(root_path):
    print('Loading: {}'.format(root_path))  # 打印正在載入的路徑

    data = []  # 初始化數據列表，用來存儲加載的數據
    classes = sorted(os.listdir(root_path))  # 確保類別按順序排序
    # 遍歷根目錄下的每個子目錄，假設每個子目錄代表一個類別。
    # for cls in os.listdir(root_path):
    for cls in classes:  # 確保類別的順序一致
        news_dir_path = os.path.join(root_path, cls)  # 獲取類別資料夾的路徑

        files = sorted(os.listdir(news_dir_path))  # 確保文件按順序排序

        pbar = tqdm(files)   # 初始化進度條，用來顯示加載每個子目錄中文件的進度。

        # 遍歷類別資料夾中的文件
        for news_file_name in pbar:
            news_file_path = os.path.join(news_dir_path, news_file_name)  # 獲取新聞文件的路徑

            # 打開新聞文件並讀取內容
            with open(news_file_path, 'r', encoding='unicode_escape') as file:
                lines = file.read().strip()
                if lines:
                    data.append(lines.replace('\n', ' '))  # 將內容中的換行符替換為空格，並添加到數據列表中。
                else:
                    print(f"Empty file found: {news_file_path}")

            # 更新進度條的後綴資訊
            if news_file_name == pbar.iterable[-1]: # 如果當前文件是目錄中的最後一個文件，更新進度條的後綴信息。
                pbar.set_postfix_str('{} class loaded'.format(cls)) # 設置進度條的後綴信息，顯示當前類別已加載完成。

    return np.array(data, dtype=str)  # 返回數據陣列

In [6]:
# 載入新聞語料
news_corpus = load_data(NEWS_DATA_PATH)

# 載入摘要語料
summaries_corpus = load_data(SUMMARIES_DATA_PATH)

Loading: drive/My Drive/Colab Notebooks/期末/BBC News Summary/News Articles


100%|██████████| 510/510 [00:36<00:00, 14.02it/s, business class loaded]
100%|██████████| 386/386 [00:04<00:00, 91.77it/s, entertainment class loaded] 
100%|██████████| 417/417 [00:23<00:00, 17.43it/s, politics class loaded]
100%|██████████| 511/511 [00:46<00:00, 10.96it/s, sport class loaded]
100%|██████████| 401/401 [00:34<00:00, 11.61it/s, tech class loaded]


Loading: drive/My Drive/Colab Notebooks/期末/BBC News Summary/Summaries


100%|██████████| 510/510 [00:55<00:00,  9.13it/s, business class loaded]
100%|██████████| 386/386 [00:34<00:00, 11.33it/s, entertainment class loaded]
100%|██████████| 417/417 [00:52<00:00,  7.98it/s, politics class loaded]
100%|██████████| 511/511 [00:16<00:00, 30.19it/s, sport class loaded]
100%|██████████| 401/401 [00:18<00:00, 21.33it/s, tech class loaded]


In [7]:
# 分割資料集為訓練集和測試集
# 5% 的資料用作測試集，剩下的 95% 用作訓練集
X_train, X_test, y_train, y_test = train_test_split(news_corpus, summaries_corpus, test_size=0.05, random_state=42)

# 將訓練集進一步分割為訓練集和驗證集
# 之前的訓練集再一次分割成新的訓練集和驗證集。
# test_size=0.05 表示將 5% 的訓練集數據用作驗證集，剩下的 95% 繼續用作訓練集。
X_train, X_vaild, y_train, y_vaild = train_test_split(X_train, y_train, test_size=0.05, random_state=42)

In [8]:
# 計算測試集、驗證集和訓練集的長度
len(X_test), len(X_vaild), len(X_train)

(112, 106, 2007)

**自定義 PyTorch 的 Dataset 類，用於處理新聞和摘要數據。**

初始化數據集，包括輸入數據（新聞）和目標數據（摘要）。
定義數據集的大小和索引操作。

返回值：

    __len__ 返回數據集的大小。

    __getitem__ 根據索引返回對應的數據對（新聞和摘要）。

In [9]:
# 定義資料集類別
class CorpusDataset(Dataset):
    def __init__(self, X_corpus, y_corpus):
        self.X_corpus = X_corpus  # 輸入數據
        self.y_corpus = y_corpus  # 目標數據

    def __len__(self):
        return len(self.X_corpus)  # 返回數據集的大小

    def __getitem__(self, index):
        return self.X_corpus[index], self.y_corpus[index]  # 根據索引返回對應的數據對


In [10]:
# 創建訓練、驗證和測試資料集
train_model_inputs = CorpusDataset(X_train, y_train)
vaild_model_inputs = CorpusDataset(X_vaild, y_vaild)
test_model_inputs = CorpusDataset(X_test, y_test)

1. 程式碼初始化已定義 BATCH_SIZE = 8


2. drop_last=True

*   如果最後一個批次數據不夠，則丟棄
*   如果最後一個批次數據不夠，則丟棄















pinned memory（固定內存）

 page-locked memory。內存的數據不會被交換到磁盤上（即不會被置換出內存），因此 GPU 可以更快地訪問和複製這些數據。

*   GPU 進行深度學習訓練，需要將數據從 CPU 內存複製到 GPU 內存
*   CPU 固定內存（pinned memory）確保資料不會被置換出內存
*   更高效的 DMA（Direct Memory Access，直接內存訪問）操作
*   載入資料DataLoader 會將數據預先加載到 CPU 的固定內存區域，可以更快地傳輸到 GPU 內存中進行計算

**什麼情況下使用** `pin_memory=True` 有幫助

1. **頻繁數據傳輸**：需要頻繁地將大批量數據從 CPU 傳輸到 GPU，例如在每個訓練批次中，使用 `pin_memory=True` 可以顯著提高數據傳輸速度。
   
2. **大數據量**：當處理的數據量非常大時，固定內存可以減少數據傳輸過程中的瓶頸。

3. **使用GPU**：當使用 GPU 進行訓練時，`pin_memory=True` 尤其有幫助，可以加速數據從 CPU 到 GPU 的傳輸。

使用 `pin_memory=True` 時，確實可以提高數據從 CPU 傳輸到 GPU 的速度，但這並不意味著在所有情況下都能顯著提升性能。

1. **內存消耗**：使用 `pin_memory=True` 會佔用更多的 CPU 內存，如果你的系統內存有限，這可能會導致內存不足的情況。

2. **小批次數據**：如果你的批次大小很小，或者數據傳輸的頻率不高，使用 `pin_memory=True` 可能不會帶來顯著的性能提升。

3. **數據加載瓶頸**：`pin_memory=True` 主要優化的是數據從 CPU 到 GPU 的傳輸速度。如果你的數據加載過程（例如從磁盤讀取數據）是主要瓶頸，這個選項不會對這部分的性能產生影響。

**內存資源允許的情況下，這是一個很好的選擇，可以減少數據傳輸的延遲，提升整體訓練效率。**

**然而，對於小批次數據或內存有限的環境，提升效果可能不明顯。**


In [11]:
# 創建訓練資料加載器
train_dataloader = DataLoader(
    train_model_inputs,
    # 避免局部最小
    shuffle=True,  # 訓練時打亂數據
    batch_size=BATCH_SIZE,  # 每個批次的大小
    drop_last=True,  # 如果最後一個批次數據不夠，則丟棄
    pin_memory=True  # 將數據複製到 CUDA 固定內存
)

# 創建驗證資料加載器
eval_dataloader = DataLoader(
    vaild_model_inputs,
    shuffle=False,  # 驗證時不打亂數據
    batch_size=BATCH_SIZE,  # 每個批次的大小
    drop_last=True,  # 如果最後一個批次數據不夠，則丟棄
    pin_memory=True  # 將數據複製到 CUDA 固定內存
)

# 創建測試資料加載器
test_dataloader = DataLoader(
    test_model_inputs,
    shuffle=False,  # 測試時不打亂數據
    batch_size=1,  # 每個批次的大小為1
    drop_last=True,  # 如果最後一個批次數據不夠，則丟棄
    pin_memory=True  # 將數據複製到 CUDA 固定內存
)

初始化:設定設備為 GPU (如果可用的話) 或 CPU

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


*   torch.cuda.is_available() 檢查當前系統是否有可用的 GPU。
*   有可用的 GPU（torch.cuda.is_available() 返回 True），則 DEVICE 被設置為 torch.device("cuda")，這表示後續的運算將在 GPU 上進行。
*   沒有可用的 GPU，則 DEVICE 被設置為 torch.device("cpu")，這表示後續的運算將在 CPU 上進行。




**pipeline 支持多種 NLP 任務，自動處理文本標記化和模型推理。**

使用方法：pipeline("任務名稱")

**零樣本分類（zero-shot classification） 可以在沒有專門針對目標類別進行訓練的情況下進行分類。**

*   透過自然語言推理（Natural Language Inference, NLI）判斷輸入文本與候選標籤之間的關係，語義理解來進行分類


**已經定義好的情緒分析模型（如 TextBlob 或 VADER），不需要使用零樣本分類**


In [12]:
# 加載 BART 摘要生成模型的標記器
# 'facebook/bart-large-cnn' 模型
summary_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn', device=DEVICE)

# 加載並初始化 BART 摘要生成模型
# 生成條件摘要
summary_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(DEVICE)

# 加載並初始化 BART 的情感分類模型
sentiment_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

**train_dataloader用於將訓練數據分成多個小批次**

訓練數據集有 1000 個樣本，每個批次大小為 32，則 train_dataloader 的長度約為 1000 / 32 = 31.25，這裡取整後即為 31 或 32。

**num_update_steps_per_epoch：每個週期中需要更新模型參數的步數**

**num_training_steps：表示總的訓練步數，即所有訓練週期中的總更新次數。**







**loss_fcn = CrossEntropyLoss()**
輸入的預測分布（logits 或 softmax probabilities）：

形狀：(𝑁,𝐶)

*   𝑁是批次大小。

*   𝐶是每個輸入樣本的類別數。


每個元素是模型對某一類別的預測值（logits）。這些值可以是未經 softmax 函數轉換的原始 logits，也可以是已經轉換過的概率值。

目標標籤（ground truth labels）：

形狀：(𝑁)每個元素是對應於輸入樣本的真實類別索引（長度為 𝑁，每個值是
0 到 𝐶−1 之間的整數）。

**num_train_epochs**

In [17]:
# 設定訓練的週期數，每個週期包括對整個訓練數據集的一次完整遍歷。
num_train_epochs = 2

# 計算每個週期的更新步數
num_update_steps_per_epoch = len(train_dataloader)
# 總的訓練步數
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# 定義數據整理器
# **DataCollatorForSeq2Seq 是專門為序列到序列（Seq2Seq），包含標記化、填充
data_collator = DataCollatorForSeq2Seq(summary_tokenizer, model=summary_model)

# 定義優化器
# **summary_model.parameters()：將 summary_model 的所有參數傳遞給優化器
optimizer = AdamW(summary_model.parameters(), lr=2e-5)

# 損失函數
# 計算模型預測結果與實際標籤之間的差異
loss_fcn = CrossEntropyLoss()

**get_model_inputs 函數:將摘要文本轉換為模型的目標輸入格式**




*   tokenizer：用於標記化文本的 tokenizer，通常是預訓練模型的 tokenizer，例如 BART 或 BERT。
*   corpus：一個包含多個文本的列表，每個文本都需要進行標記化處理。
*   is_summaries：一個布爾值，指示是否處理的是摘要。如果是摘要，則使用不同的最大長度。


In [18]:
# 定義資料處理函數
def get_model_inputs(tokenizer, corpus, is_summaries=False):
    model_inputs = []  # 初始化模型輸入列表

    # 設定最大長度，摘要用 MAX_TARGET_LENGTH，其他用 MAX_INPUT_LENGTH
    # MAX_TARGET_LENGTH與MAX_INPUT_LENGTH在最前面定義了
    max_length = MAX_INPUT_LENGTH if not is_summaries else MAX_TARGET_LENGTH

    # 對每個文本進行標記
    for text in corpus:
        model_inputs.append(tokenizer(
            text,
            max_length=max_length,
            truncation=True,  # 超過最大長度時進行截斷
        ))

    return model_inputs  # 返回標記化的輸入數據

# labels 是經過 get_model_inputs 函數和 data_collator 函數處理後的輸出
# 包含了轉換為標記 ID 的摘要文本（input_ids）和相應的注意力掩碼（attention_mask）。
# 這些張量將被用作目標數據，傳遞給模型用於計算損失。

'''
像是
{
    'input_ids': tensor([[101, 2023, 2003, 1037, 7099,  ...], [101, 2129, 2079, 2017, 2424, ...]]),
    'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, ...], [1, 1, 1, 1, 1, 1, 1, ...]])
}

'''

"\n像是\n{\n    'input_ids': tensor([[101, 2023, 2003, 1037, 7099,  ...], [101, 2129, 2079, 2017, 2424, ...]]),\n    'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, ...], [1, 1, 1, 1, 1, 1, 1, ...]])\n}\n\n"

**get_sentiment_probs(sentiment_model, texts)**

情感分析模型 sentiment_model 對文本 texts 進行情感分類，獲取"**情感概率**"分數。

*   遍歷每個文本，使用情感分析模型對其進行分類。
*   獲取每個情感標籤的概率分數。

返回一個包含每個文本情感概率分數的張量。

**get_sentiment_labels(sentiment_model, texts)**

情感分析模型 sentiment_model 對文本 texts 進行情感分類，獲取"**情感標籤**"

*   遍歷每個文本，使用情感分析模型對其進行分類。
*   將分類結果映射為情感標籤。

返回一個包含每個文本情感標籤的張量。


**with torch.no_grad() 反向傳播（Backpropagation）是計算梯度並更新模型參數的過程。當我們只需要進行推理（inference）或評估模型而不需要進行訓練時，我們可以禁用梯度計算以節省內存和計算資源。**

torch 提供的功能和方法，包括定義張量、進行數學運算、自動微分等。

In [52]:
def get_sentiment_logits(sentiment_model, corpus):
    corpus_logits = []  # 存儲每個文本的情感 logits。

    with torch.no_grad():  # 不計算梯度(不需要進行反向傳播)，節省內存和計算資源
        for text in corpus:
            if text:  # 新增檢查空文本的邏輯
              sentiment = sentiment_model(text, return_all_scores=True)  # 獲取情感分析結果，返回所有分數。
              logits = [score['score'] for score in sentiment]  # 獲取每個情感標籤的分數作為 logits。
              corpus_logits.append(torch.tensor(logits))  # 將 logits 轉換為張量並添加到列表
            else:
                print(f"Empty text found in corpus")  # 新增警告信息
    return torch.stack(corpus_logits)  # 將所有 logits 張量堆疊成一個張量


'''
CrossEntropyLoss 的預期輸入（即 logits）
但 gen_summaries_sentiment_probs 經過 softmax 轉換為概率分數。
不符合CrossEntropyLoss 的預期輸入（即 logits）
'''
# 定義情感機率，返回每個情感標籤的概率分數。
def get_sentiment_probs(sentiment_model, corpus):
    corpus_scores = [] # 存儲每個文本的情感分數。

    with torch.no_grad():  # 不計算梯度(不需要進行反向傳播)，節省內存和計算資源
        for text in corpus:
            sentiment = sentiment_model(text, SENTIMENT_LABELS)  # 獲取情感分析結果，返回一個包含情感標籤和分數的字典。
            labels = sentiment['labels']
            scores = sentiment['scores']
            sorted_scores = [scores[labels.index(label)] for label in SENTIMENT_LABELS]  # 按情感標籤順序排列分數
            corpus_scores.append(torch.tensor(sorted_scores))  # 將分數轉換為張量並添加到列表

    return torch.stack(corpus_scores)  # 將所有分數張量堆疊成一個張量

# 將機率轉為情感標籤
def get_sentiment_labels(sentiment_model, corpus):
    corpus_scores = []

    with torch.no_grad():  # 禁用梯度計算，節省內存和計算資源
        for text in corpus:
            if text:  # 新增檢查空文本的邏輯
              sentiment = sentiment_model(text, SENTIMENT_LABELS)  # 使用情感模型進行情感分析，輸出結果包含各情感標籤的分數。
              label = sentiment['labels'][0]  # 獲取最高機率的情感標籤
              corpus_scores.append(SENTIMENT_LABELS_MAP[label])  # 將獲取到的情感標籤轉換為數字並添加到列表
            # SENTIMENT_LABELS_MAP 是一個字典，將情感標籤轉換為數字。
            else:
                print(f"Empty text found in corpus")  # 新增警告信息
    return torch.tensor(corpus_scores)  # 將所有標籤轉換為張量

**(自己)測試**

In [19]:
import torch
import torch.nn.functional as F

def labels_to_probs(labels, num_classes):
    """
    將標籤轉換為one-hot機率
    """
    one_hot = F.one_hot(labels, num_classes).float()
    return one_hot

def probs_to_logits(probs):
    """
    將機率轉換為logits
    """
    logits = torch.log(probs + 1e-9)  # 加上1e-9以避免log(0)
    return logits

In [20]:
def get_sentiment_probs(sentiment_model, corpus):
    corpus_scores = []
    with torch.no_grad():
        for text in corpus:
            if text:
                try:
                    sentiment = sentiment_model(text, SENTIMENT_LABELS)
                    if sentiment:
                        labels = sentiment['labels']
                        scores = sentiment['scores']
                        sorted_scores = [scores[labels.index(label)] for label in SENTIMENT_LABELS]
                        corpus_scores.append(torch.tensor(sorted_scores))
                    else:
                        print(f"Error: Sentiment model returned invalid result for text: {text}")
                        corpus_scores.append(torch.ones(len(SENTIMENT_LABELS)) / len(SENTIMENT_LABELS))  # 使用均等張量作為默认值
                except Exception as e:
                    print(f"Error processing text: {text}, error: {e}")
                    corpus_scores.append(torch.ones(len(SENTIMENT_LABELS)) / len(SENTIMENT_LABELS))  # 使用均等張量作為默认值
            else:
                print(f"Error: Empty text found in corpus")
                corpus_scores.append(torch.ones(len(SENTIMENT_LABELS)) / len(SENTIMENT_LABELS))  # 使用均等張量作為默认值
    return torch.stack(corpus_scores)

def get_sentiment_labels(sentiment_model, corpus):
    corpus_scores = []
    with torch.no_grad():
        for text in corpus:
            if text:
                try:
                    sentiment = sentiment_model(text, SENTIMENT_LABELS)
                    if sentiment and 'labels' in sentiment and len(sentiment['labels']) > 0:
                        label = sentiment['labels'][0]
                        corpus_scores.append(SENTIMENT_LABELS_MAP.get(label, -1))  # 使用 -1 作为未知标签
                    else:
                        print(f"Error: Sentiment model returned invalid result for text: {text}")
                        corpus_scores.append(-1)
                except Exception as e:
                    print(f"Error processing text: {text}, error: {e}")
                    corpus_scores.append(-1)
            else:
                print(f"Error: Empty text found in corpus")
                corpus_scores.append(-1)
    return torch.tensor(corpus_scores)


**生成文本摘要，並比較摘要與原始新聞的情感標籤是否一致，然後顯示一致的比例。**

**pbar = tqdm(test_dataloader)：初始化進度條，用於顯示測試過程中的進度。**


*   set_description 或 set_description_str 方法來設置進度條的描述信息

  pbar.set_description_str('[Epoch {}] Training'.format(epoch))
*   set_postfix_str 後綴信息來顯示一些動態數據，例如當前批次的損失和平均損失

  pbar.set_postfix_str('Batch Loss: {:.6f}, Average Loss: {:.6f}'.format(batch_loss, avg_loss))


*   自定義進度條的顯示格式，例如修改進度條的寬度、顏色和樣式

  tqdm(total=100, ncols=80, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')






**test_dataloader 是測試數據的加載器。**

.to(DEVICE)
          將 input_ids 張量移動到指定的設備上，以便利用 GPU 加速運算。如果有 GPU 可用就使用 GPU，否則使用 CPU。

          # 檢查是否有可用的 CUDA 設備（即 GPU），如果有，就將 DEVICE 設置為 "cuda"；如果沒有，就設置為 "cpu"。
          DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


****3. 標準化損失和動態調整權重****

In [21]:
def adjust_weights(avg_summary_loss, avg_sentiment_loss, min_step=0.1):
    if avg_summary_loss > avg_sentiment_loss:
        return max(0.5 + min_step, 0.9), min(0.5 - min_step, 0.1)
    else:
        return min(0.5 - min_step, 0.1), max(0.5 + min_step, 0.9)

def standardize_loss(loss, mean_loss):
    if mean_loss == 0:
        return loss
    standardized_loss = (loss - mean_loss) / (abs(mean_loss) + 1e-9)
    return standardized_loss


**檢查數據集中情感標籤的分布是否均衡。**

In [None]:
from collections import Counter
from tqdm import tqdm

def check_sentiment_distribution(corpus):
    sentiment_labels = get_sentiment_labels(sentiment_model, corpus)
    sentiment_counts = Counter(sentiment_labels.tolist())
    print("Sentiment distribution:")
    for label, count in sentiment_counts.items():
        print(f"{label}: {count}")
    return sentiment_counts

# 假設 train_dataloader 已經存在
all_news = []
for news, _ in tqdm(train_dataloader, desc="Collecting all news"):
    all_news.extend(news)

check_sentiment_distribution(all_news)


Collecting all news: 100%|██████████| 149/149 [00:00<00:00, 970.01it/s]


KeyboardInterrupt: 

**測試與評估模型在測試數據集上的性能。**

*   遍歷測試數據集，對每個批次進行處理：
    *   生成模型輸入。
    *   生成摘要並獲取情感標籤。
    *   打印生成的摘要和情感標籤。

評估模型在測試數據集上的生成效果。

**BART生成文本時，模型生成的 ID 序列是指模型在每個時間步生成的標記（token）的整數表示。這些 ID 序列是根據模型的詞彙表（vocabulary）進行編碼的，表示生成的每個標記在詞彙表中的索引。**


*   詞彙表可能包含以下映射：{"hello": 0, "world": 1,"\<s>": 2, "\</s>": 3, ...}。
*   "\<s> Deep learning is great \</s>"模型生成的 ID 序列可能是 [2, 451, 987, 123, 3]，其中每個整數對應於詞彙表中的一個標記。



**loss = summary_model(**).loss**

使用方法及參數可以看


*   官方文檔，例如 Hugging Face Transformers 文檔。

*   查閱模型的源碼，通常可以在模型類的 forward 方法中找到所需的參數。



In [65]:
def test_step():
    # eval() 是 torch.nn.Module 類的一個方法，用於將模型設置為評估模式。
    summary_model.eval()  # 設置模型為評估模式

    result = []

    # tqdm: 顯示迴圈的進度條
    # test_dataloader: DataLoader迭代測試數據集，能夠在小批次中加載數據
    pbar = tqdm(test_dataloader)  # 初始化進度條

    for index, (news, _) in enumerate(pbar): # 遍歷測試數據加載器中的每個批次數據。news 是新聞文本，忽略其對應的標籤 _。
        # 獲取輸入數據的ID並移動到設備
        '''
        *data_collator
          是一個函數或對象，用於將多個樣本整理成一個批次（batch）。
          在 NLP 任務中，這通常包括填充（padding）不同長度的序列以使它們具有相同的長度，以及創建注意掩碼。
          data_collator 返回的結果通常是一個字典，其中包括多個鍵，例如 input_ids、attention_mask 等。
        *['input_ids']
          模型的實際輸入數據，即每個詞語對應的 token IDs
        *.to(DEVICE)
          將 input_ids 張量移動到指定的設備上，以便利用 GPU 加速運算。
        '''
        summary_model_inputs = data_collator(get_model_inputs(summary_tokenizer, news))['input_ids'].to(DEVICE)
        # 生成摘要
        # generate 是 transformers 庫中預訓練模型的內建方法
        # 輸出是一個包含 token IDs 的張量或列表。這些 token IDs 是模型生成的摘要。
        summary_model_outputs = summary_model.generate(summary_model_inputs)

        # 解碼生成的摘要文本
        '''batch_decode 是 tokenizer 的一個方法，用於將一批 token IDs 解碼為對應的文本。
            skip_special_tokens=True 跳過特殊標記（special tokens），如 <s>, </s>, <pad> 等。
        '''
        summarized_text = summary_tokenizer.batch_decode(summary_model_outputs, skip_special_tokens=True)

        # 獲取新聞的情感標籤
        news_sentiment_label = get_sentiment_labels(sentiment_model, news)[0]
        # 獲取摘要的情感標籤
        summarized_sentiment_label = get_sentiment_labels(sentiment_model, summarized_text)[0]
        # 比較新聞和摘要的情感標籤是否一致
        # 新聞和摘要的情感如果一樣就是1，不一樣是0
        result.append(1 if news_sentiment_label == summarized_sentiment_label else 0)

        # 更新進度條後綴，顯示相同情感標籤的比例
        # 當index是test_dataloader的最後一個要素。
        if index == len(test_dataloader)-1:
            '''進度條的後綴中顯示 "Same Sentiment Rate"
               Same Sentiment Rate: result 中為 1 的比例，越高，代表生成摘要與原始新聞的情緒一致性越高
                result.count(1):計算 result 列表中值為 1 的元素數量
                代表"摘要與原始新聞的情感標籤"一致的情況
            '''
            pbar.set_postfix_str('Same Sentiment Rate: {:.3f}'.format(result.count(1) / len(result)))


**在每個訓練週期（epoch）中訓練模型**

*   生成模型輸入。
*   計算損失，包括生成摘要的損失和情感匹配損失。
*   反向傳播和參數更新。
*   記錄和打印損失。

主要作用是更新模型參數。

**計算損失並更新參數**


*   loss = summary_model(input_ids/attention_mask/labels).loss：計算訓練摘要模型的損失。
*   optimizer.zero_grad()：清空優化器的梯度。
*   loss.backward()：進行反向傳播，計算每個參數的梯度。
*   optimizer.step()：更新模型參數。




**可以試早停跟學習率與loss**

\# 自己定義EarlyStopping函數

early_stopping = EarlyStopping(patience=3, verbose=True)

\# 修改 scheduler

scheduler = get_linear_schedule_with_warmup(optimizer,
                      num_warmup_steps=total_steps * 0.1,  # 暖身步驟數
                      num_training_steps=total_steps,  # 總訓練步驟
                      last_epoch=-1)

In [None]:
def train_step(epoch):
    # 初始化列表來存儲每個批次的訓練損失
    total_training_loss = []

    # 將模型設置為訓練模式（啟用dropout等訓練專用層）
    # BartForConditionalGeneration 模型中啟用的層(Dropout、Layer Normalization、自注意力機制、Embeddings 層)
    summary_model.train()

    # pbar = tqdm(train_dataloader): tqdm 包裝 train_dataloader 來顯示訓練進度。
    # 設置進度條的描述，顯示當前的訓練週期(epoch 數)。
    pbar = tqdm(train_dataloader)
    pbar.set_description_str('[Epoch {}] Training'.format(epoch))

    # 訓練數據加載器中的每個批次
    for news, summaries in pbar:
        # 使用 summary_tokenizer 和 data_collator 處理新聞文本，生成模型的輸入數據。
        # 獲取訓練數據
        inputs = data_collator(get_model_inputs(summary_tokenizer, news))

        # 將輸入的 ID 和注意力掩碼移動到設備（例如 GPU）。
        # 這些張量包含了模型的輸入數據（input IDs 和 attention masks），移動到指定（CPU 或 GPU）上，以便後續的模型計算。
        X_input_ids = inputs['input_ids'].to(DEVICE)
        X_attention_mask = inputs['attention_mask'].to(DEVICE)

        # labels: 使用 summary_tokenizer 和 data_collator 處理資料集摘要，生成目標數據。
        # y_input_ids: 將目標輸入 ID 移動到設備。
        # 模型的真實摘要（target input IDs），移動到指定的設備上，以便在計算損失時使用。
        labels = data_collator(get_model_inputs(summary_tokenizer, summaries, is_summaries=True))
        y_input_ids = labels['input_ids'].to(DEVICE)


        # 計算損失並更新模型參數-> 模型生成的輸出與目標輸入 ID 計算損失
        # loss 變數:將輸入 ID、注意力掩碼和目標標籤傳遞給 summary_model (進行"前向傳播"計算)，衡量"生成的摘要與真實摘要"之間的差異
        # .loss：
          #   (在 Hugging Face 的 Transformers 庫中，當傳遞 labels 給模型時，會自動計算損失並包含在輸出中)
          #   基於目標標籤（labels）和模型預測（input_ids 和 attention_mask 的前向傳播結果）之間的交叉熵損失，差異計算出來的
        '''
        input_ids=X_input_ids：模型的輸入 ID。(將新聞轉換為模型可讀的 ID 後的結果)
        attention_mask=X_attention_mask：指示哪些位置是有效的（應該關注的），哪些是填充（padding）的
        labels=y_input_ids：模型的目標標籤(摘要文本的 ID)。(將目標摘要轉換為模型可讀的 ID 後的結果)
        '''
        loss = summary_model(input_ids=X_input_ids,
                             attention_mask=X_attention_mask,
                             labels=y_input_ids).loss


        # gen_summaries: 使用模型生成摘要文本，並解碼為可讀格式。
        '''
        summary_model.generate(X_input_ids)：這行程式碼使用模型來生成摘要
        summary_tokenizer.batch_decode：這個方法將模型生成的 ID 序列轉換回人類可讀的文本。
        '''
        gen_summaries = summary_tokenizer.batch_decode(summary_model.generate(X_input_ids), skip_special_tokens=True)  # 生成摘要

        # 獲取新聞的情感標籤
        '''
        get_sentiment_labels(sentiment_model, news)：
          使用情感分析模型 sentiment_model 對新聞文本 news 進行分類，獲取情感標籤(根據你定義的情感標籤映射 (SENTIMENT_LABELS_MAP))。
        .to(DEVICE)：將獲取的情感標籤張量移動到指定的設備上（DEVICE），這樣可以在訓練過程中使用 GPU 加速計算。
        '''
        news_sentiment_labels = get_sentiment_labels(sentiment_model, news).to(DEVICE)

        # 獲取摘要的情感概率
        '''
        get_sentiment_probs(sentiment_model, gen_summaries)：
          用情感分析模型 sentiment_model 對生成的摘要 gen_summaries 進行分類，獲取情感概率分數(各個情感標籤的概率。
        .to(DEVICE)：將獲取的情感概率分數張量移動到指定的設備上（DEVICE），這樣可以在訓練過程中使用 GPU 加速計算。
        '''
        '''get_sentiment_probs輸出是機率，不符CrossEntropyLoss 的預期輸入（即 logits）'''
        '''為了要符合 CrossEntropyLoss 的預期輸入（即 logits），get_sentiment_logits返回logits'''
        gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        # .clamp(min=1e-9) 是一個張量操作，將 gen_summaries_sentiment_probs 的值限制在至少 1×10−9的範圍內。防止概率值過小（接近於零），以避免在後續計算中出現數值不穩定或錯誤。
        gen_summaries_sentiment_probs = gen_summaries_sentiment_probs.clamp(min=1e-9)
        gen_summaries_sentiment_logits = torch.log(gen_summaries_sentiment_probs)

        # 加上情感匹配損失
        # 找到新聞情感的對應摘要機率，帶入損失函數做計算
        # 原本loss生成摘要與新聞的損失
        # loss_fcn基於"生成摘要的情感分布"與"原始新聞文本的情感標籤"之間的差異。
        '''輸入方式是機率，不符CrossEntropyLoss 的預期輸入（即 logits）
        loss = loss + loss_fcn(gen_summaries_sentiment_probs, news_sentiment_labels)'''
        loss = loss + loss_fcn(gen_summaries_sentiment_logits, news_sentiment_labels)

        '''
        每個訓練批次中梯度清除、反向傳播和模型參數更新，並且記錄和顯示訓練過程中的損失值。
        '''
        # 清除上一個批次的梯度為0
        optimizer.zero_grad()

        # 反向傳播計算梯度(鏈式法則)，基於損失值 loss 計算模型所有參數的梯度
        # 這些梯度將存儲在每個參數的 grad 屬性中。
        loss.backward()

        # 更新模型參數
        '''
        程式碼預設是Adam，通常有較好的收斂性能，並且對於不同問題的參數設定不太敏感
        '''
        optimizer.step()  # 更新參數

        # 將當前批次的損失值添加到總損失列表中
        # loss.item() 會將損失張量轉換為標量值，當前批次的損失值
        total_training_loss.append(loss.item())

        # 更新進度條後綴，顯示當前批次的損失和平均損失
        pbar.set_postfix_str('Batch Loss: {:.6f}, Average Loss: {:.6f}'.format(loss.item(), sum(total_training_loss) / len(total_training_loss)))



**有調整loss權重的訓練過程**

In [None]:
def train_step(epoch):
    global prev_summary_loss, prev_sentiment_loss
    total_training_loss = []
    total_summary_loss = []
    total_sentiment_loss = []

    summary_model.train()
    pbar = tqdm(train_dataloader)
    pbar.set_description_str('[Epoch {}] Training'.format(epoch))

    for news, summaries in pbar:
        inputs = data_collator(get_model_inputs(summary_tokenizer, news))
        X_input_ids = inputs['input_ids'].to(DEVICE)
        X_attention_mask = inputs['attention_mask'].to(DEVICE)

        labels = data_collator(get_model_inputs(summary_tokenizer, summaries, is_summaries=True))
        y_input_ids = labels['input_ids'].to(DEVICE)

        summary_loss = summary_model(input_ids=X_input_ids,
                                    attention_mask=X_attention_mask,
                                    labels=y_input_ids).loss

        gen_summaries = summary_tokenizer.batch_decode(summary_model.generate(X_input_ids), skip_special_tokens=True)

        if any(not gs for gs in gen_summaries):
            print(f"Empty generated summary found: {gen_summaries}")

        news_sentiment_labels = get_sentiment_labels(sentiment_model, news).to(DEVICE)
        gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        # .clamp(min=1e-9) 是一個張量操作，將 gen_summaries_sentiment_probs 的值限制在至少 1×10−9的範圍內。防止概率值過小（接近於零），以避免在後續計算中出現數值不穩定或錯誤。
        gen_summaries_sentiment_probs = gen_summaries_sentiment_probs.clamp(min=1e-9)
        gen_summaries_sentiment_logits = torch.log(gen_summaries_sentiment_probs)
        sentiment_loss = loss_fcn(gen_summaries_sentiment_logits, news_sentiment_labels)


        total_loss = summary_loss_weight * summary_loss + sentiment_loss_weight * sentiment_loss


        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        total_training_loss.append(total_loss.item())
        total_summary_loss.append(summary_loss.item())
        total_sentiment_loss.append(sentiment_loss.item())
        pbar.set_postfix_str('Batch Loss: {:.6f}, Average Loss: {:.6f}'.format(
            total_loss.item(), sum(total_training_loss) / len(total_training_loss)))

    # 計算摘要損失的平均值
    avg_summary_loss = sum(total_summary_loss) / len(total_summary_loss)
    # 計算情感損失的平均值
    avg_sentiment_loss = sum(total_sentiment_loss) / len(total_sentiment_loss)

**3. 標準化損失和動態調整權重**

In [None]:
def train_step(epoch, avg_summary_loss, avg_sentiment_loss):
    global prev_summary_loss, prev_sentiment_loss
    total_training_loss = []
    total_summary_loss = []
    total_sentiment_loss = []
    summary_model.train()
    pbar = tqdm(train_dataloader)
    pbar.set_description_str('[Epoch {}] Training'.format(epoch))

    for news, summaries in pbar:
        if news is None or summaries is None:
            print("Error: News or Summaries is None")
            continue
        inputs = data_collator(get_model_inputs(summary_tokenizer, news))
        X_input_ids = inputs['input_ids'].to(DEVICE)
        X_attention_mask = inputs['attention_mask'].to(DEVICE)
        labels = data_collator(get_model_inputs(summary_tokenizer, summaries, is_summaries=True))
        y_input_ids = labels['input_ids'].to(DEVICE)

        summary_loss = summary_model(input_ids=X_input_ids, attention_mask=X_attention_mask, labels=y_input_ids).loss
        gen_summaries = summary_tokenizer.batch_decode(summary_model.generate(X_input_ids), skip_special_tokens=True)
        if gen_summaries is None:
            print("Error: Generated summaries is None")
            continue

        news_sentiment_labels = get_sentiment_labels(sentiment_model, news).to(DEVICE)
        gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        sentiment_loss = loss_fcn(gen_summaries_sentiment_probs, news_sentiment_labels)

        # 標準化損失
        standardized_summary_loss = standardize_loss(summary_loss.item(), avg_summary_loss)
        standardized_sentiment_loss = standardize_loss(sentiment_loss.item(), avg_sentiment_loss)

        total_loss = summary_loss_weight * summary_loss + sentiment_loss_weight * sentiment_loss

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        total_training_loss.append(total_loss.item())
        total_summary_loss.append(summary_loss.item())
        total_sentiment_loss.append(sentiment_loss.item())
        pbar.set_postfix_str('Batch Loss: {:.6f}, Average Loss: {:.6f}'.format(total_loss.item(), sum(total_training_loss) / len(total_training_loss)))

    avg_summary_loss = 0.9 * avg_summary_loss + 0.1 * (sum(total_summary_loss) / len(total_summary_loss))
    avg_sentiment_loss = 0.9 * avg_sentiment_loss + 0.1 * (sum(total_sentiment_loss) / len(total_sentiment_loss))
    return avg_summary_loss, avg_sentiment_loss


**4. 滑動平均能夠平滑損失值，減少波動，使得損失值變得更加穩定和可追踪。**

In [None]:
def train_step(epoch, avg_summary_loss, avg_sentiment_loss):
    global prev_summary_loss, prev_sentiment_loss, summary_loss_weight, sentiment_loss_weight
    total_training_loss = []
    total_summary_loss = []
    total_sentiment_loss = []
    same_sentiment_count = 0
    total_count = 0
    summary_model.train()
    pbar = tqdm(train_dataloader)
    pbar.set_description_str('[Epoch {}] Training'.format(epoch))

    for news, summaries in pbar:
        if news is None or summaries is None:
            print("Error: News or Summaries is None")
            continue
        inputs = data_collator(get_model_inputs(summary_tokenizer, news))
        if 'input_ids' not in inputs or 'attention_mask' not in inputs:
            print(f"Error: Invalid inputs from data_collator: {inputs}")
            continue
        X_input_ids = inputs['input_ids'].to(DEVICE)
        X_attention_mask = inputs['attention_mask'].to(DEVICE)

        labels = data_collator(get_model_inputs(summary_tokenizer, summaries, is_summaries=True))
        if 'input_ids' not in labels:
            print(f"Error: Invalid labels from data_collator: {labels}")
            continue
        y_input_ids = labels['input_ids'].to(DEVICE)

        summary_loss = summary_model(input_ids=X_input_ids, attention_mask=X_attention_mask, labels=y_input_ids).loss

        gen_summaries = summary_tokenizer.batch_decode(summary_model.generate(X_input_ids), skip_special_tokens=True)
        if not gen_summaries:
            print("Error: Generated summaries are empty")
            continue

        news_sentiment_labels = get_sentiment_labels(sentiment_model, news).to(DEVICE)
        news_sentiment_probs = labels_to_probs(news_sentiment_labels, len(SENTIMENT_LABELS)).to(DEVICE)
        news_sentiment_logits = probs_to_logits(news_sentiment_probs)

        gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        gen_summaries_sentiment_logits = probs_to_logits(gen_summaries_sentiment_probs).to(DEVICE)

        if news_sentiment_logits.size(0) == 0 or gen_summaries_sentiment_logits.size(0) == 0:
            print("Error: Sentiment labels or logits are empty")
            continue

        sentiment_loss = loss_fcn(gen_summaries_sentiment_logits, news_sentiment_labels)

        # 計算相同情感的比例
        same_sentiment_count += (torch.argmax(news_sentiment_logits, dim=1) == torch.argmax(gen_summaries_sentiment_logits, dim=1)).sum().item()
        total_count += news_sentiment_labels.size(0)

        # 標準化損失
        standardized_summary_loss = standardize_loss(summary_loss.item(), avg_summary_loss)
        standardized_sentiment_loss = standardize_loss(sentiment_loss.item(), avg_sentiment_loss)

        # 確保標準化后的損失是浮點數
        standardized_summary_loss = torch.tensor(float(standardized_summary_loss), dtype=torch.float32, requires_grad=True).to(DEVICE)
        standardized_sentiment_loss = torch.tensor(float(standardized_sentiment_loss), dtype=torch.float32, requires_grad=True).to(DEVICE)

        total_loss = summary_loss_weight * standardized_summary_loss + sentiment_loss_weight * standardized_sentiment_loss

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        total_training_loss.append(total_loss.item())
        total_summary_loss.append(summary_loss.item())
        total_sentiment_loss.append(sentiment_loss.item())

        # 更新進度條的後綴信息，顯示當前批次損失和平均損失
        pbar.set_postfix_str('Batch Loss: {:.6f}, Avg Loss: {:.6f}'.format(total_loss.item(), sum(total_training_loss) / len(total_training_loss)))

    avg_summary_loss = 0.9 * avg_summary_loss + 0.1 * (sum(total_summary_loss) / len(total_summary_loss))
    avg_sentiment_loss = 0.9 * avg_sentiment_loss + 0.1 * (sum(total_sentiment_loss) / len(total_sentiment_loss))
    same_sentiment_rate = same_sentiment_count / total_count
    print(f"Same Sentiment Rate: {same_sentiment_rate:.3f}")
    return avg_summary_loss, avg_sentiment_loss, same_sentiment_rate

**5. 直接用logits算loss**

In [22]:
def train_step(epoch, avg_summary_loss, avg_sentiment_loss):
    global prev_summary_loss, prev_sentiment_loss, summary_loss_weight, sentiment_loss_weight
    total_training_loss = []
    total_summary_loss = []
    total_sentiment_loss = []
    same_sentiment_count = 0
    total_count = 0
    summary_model.train()
    pbar = tqdm(train_dataloader)
    pbar.set_description_str('[Epoch {}] Training'.format(epoch))

    for news, summaries in pbar:
        if news is None or summaries is None:
            print("Error: News or Summaries is None")
            continue
        inputs = data_collator(get_model_inputs(summary_tokenizer, news))
        if 'input_ids' not in inputs or 'attention_mask' not in inputs:
            print(f"Error: Invalid inputs from data_collator: {inputs}")
            continue
        X_input_ids = inputs['input_ids'].to(DEVICE)
        X_attention_mask = inputs['attention_mask'].to(DEVICE)

        labels = data_collator(get_model_inputs(summary_tokenizer, summaries, is_summaries=True))
        if 'input_ids' not in labels:
            print(f"Error: Invalid labels from data_collator: {labels}")
            continue
        y_input_ids = labels['input_ids'].to(DEVICE)

        summary_loss = summary_model(input_ids=X_input_ids, attention_mask=X_attention_mask, labels=y_input_ids).loss

        gen_summaries = summary_tokenizer.batch_decode(summary_model.generate(X_input_ids), skip_special_tokens=True)
        if not gen_summaries:
            print("Error: Generated summaries are empty")
            continue

        # 獲取新聞的情緒標籤索引
        news_sentiment_labels = get_sentiment_labels(sentiment_model, news).to(DEVICE)
        # 獲取生成的摘要的情緒logits
        gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        gen_summaries_sentiment_logits = torch.log(gen_summaries_sentiment_probs.clamp(min=1e-9))

        # 確保 logits 沒有問題
        if gen_summaries_sentiment_logits.size(0) == 0 or news_sentiment_labels.size(0) == 0:
            print("Error: Sentiment labels or logits are empty")
            continue

        # 使用 CrossEntropyLoss 計算情緒損失
        sentiment_loss = loss_fcn(gen_summaries_sentiment_logits, news_sentiment_labels)

        # 計算相同情感的比例
        same_sentiment_count += (torch.argmax(gen_summaries_sentiment_logits, dim=1) == news_sentiment_labels).sum().item()
        total_count += news_sentiment_labels.size(0)

        # 標準化損失
        standardized_summary_loss = standardize_loss(summary_loss.item(), avg_summary_loss)
        standardized_sentiment_loss = standardize_loss(sentiment_loss.item(), avg_sentiment_loss)

        # 確保標準化后的損失是浮點數
        standardized_summary_loss = torch.tensor(float(standardized_summary_loss), dtype=torch.float32, requires_grad=True).to(DEVICE)
        standardized_sentiment_loss = torch.tensor(float(standardized_sentiment_loss), dtype=torch.float32, requires_grad=True).to(DEVICE)

        total_loss = summary_loss_weight * standardized_summary_loss + sentiment_loss_weight * standardized_sentiment_loss

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        total_training_loss.append(total_loss.item())
        total_summary_loss.append(summary_loss.item())
        total_sentiment_loss.append(sentiment_loss.item())

        # 更新進度條的後綴信息，顯示當前批次損失和平均損失
        pbar.set_postfix_str('Batch Loss: {:.6f}, Avg Loss: {:.6f}'.format(total_loss.item(), sum(total_training_loss) / len(total_training_loss)))

    avg_summary_loss = 0.9 * avg_summary_loss + 0.1 * (sum(total_summary_loss) / len(total_summary_loss))
    avg_sentiment_loss = 0.9 * avg_sentiment_loss + 0.1 * (sum(total_sentiment_loss) / len(total_sentiment_loss))
    same_sentiment_rate = same_sentiment_count / total_count
    print(f"Same Sentiment Rate: {same_sentiment_rate:.3f}")
    return avg_summary_loss, avg_sentiment_loss, same_sentiment_rate

**執行單個驗證步驟（epoch）。**

*   設置模型為評估模式。
*   遍歷驗證數據集，對每個批次進行處理：
    *   生成模型輸入。
    *   計算損失，包括生成摘要的損失和情感匹配損失。
    *   記錄和打印損失。

主要作用是評估模型在驗證數據集上的性能。

In [10]:
def eval_step(epoch):
    total_vaild_loss = []  # 初始化存儲總驗證損失的列表

    summary_model.eval()  # 設置模型為評估模式

    pbar = tqdm(eval_dataloader)  # 初始化進度條
    # 設置進度條的描述，顯示當前的驗證週期
    pbar.set_description_str('[Epoch {}] Validation'.format(epoch))

    # 驗證數據加載器中的每個批次
    for news, summaries in pbar:
        # 新聞文本轉換為模型輸入格式
        inputs = data_collator(get_model_inputs(summary_tokenizer, news))

        # 模型的輸入數據 ID 和注意力掩碼移動到設備（例如 GPU）。
        X_input_ids = inputs['input_ids'].to(DEVICE)
        X_attention_mask = inputs['attention_mask'].to(DEVICE)  # 移動注意力掩碼到設備

        # 摘要轉換為模型的目標輸入格式，並移動到設備上
        # labels 的輸出張量
          # input_ids：轉換後的摘要文本的標記 ID。
          # attention_mask：填充的位置，指示哪些位置是填充（0），哪些是有效標記（1）。
        labels = data_collator(get_model_inputs(summary_tokenizer, summaries, is_summaries=True))  # 獲取目標數據
        y_input_ids = labels['input_ids'].to(DEVICE)  # 移動目標輸入 ID 到設備

        # 計算"新聞文本與對應摘要"的損失函數。
        loss = summary_model(input_ids=X_input_ids,
                             attention_mask=X_attention_mask,
                             labels=y_input_ids).loss

        # 生成摘要
        gen_summaries = summary_tokenizer.batch_decode(summary_model.generate(X_input_ids), skip_special_tokens=True)

        # 獲取新聞的情感標籤
        news_sentiment_labels = get_sentiment_labels(sentiment_model, news).to(DEVICE)
        # 獲取摘要的情感概率

        '''get_sentiment_probs輸出是機率，不符CrossEntropyLoss 的預期輸入（即 logits）'''
        # gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        '''為了要符合 CrossEntropyLoss 的預期輸入（即 logits），get_sentiment_logits返回logits'''
        # gen_summaries_sentiment_logits = get_sentiment_logits(sentiment_model, gen_summaries).to(DEVICE)
        gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        # .clamp(min=1e-9) 是一個張量操作，將 gen_summaries_sentiment_probs 的值限制在至少 1×10−9的範圍內。防止概率值過小（接近於零），以避免在後續計算中出現數值不穩定或錯誤。
        gen_summaries_sentiment_probs = gen_summaries_sentiment_probs.clamp(min=1e-9)
        gen_summaries_sentiment_logits = torch.log(gen_summaries_sentiment_probs)

        # 加上情感匹配損失
        # 找到新聞情感的對應摘要機率，帶入損失函數做計算
        # 原本loss生成摘要與新聞的損失
        # loss_fcn基於"生成摘要的情感分布"與"原始新聞文本的情感標籤"之間的差異。
        '''輸入方式是機率，不符CrossEntropyLoss 的預期輸入（即 logits）
        loss = loss + loss_fcn(gen_summaries_sentiment_probs, news_sentiment_labels)'''
        loss = loss + loss_fcn(gen_summaries_sentiment_logits, news_sentiment_labels)

        total_vaild_loss.append(loss.item())  # 記錄損失

        # 更新進度條後綴，顯示當前批次的損失和平均損失
        pbar.set_postfix_str('Batch Loss: {:.6f}, Average Loss: {:.6f}'.format(loss.item(), sum(total_vaild_loss) / len(total_vaild_loss)))




**5.**

In [23]:
def eval_step(epoch):
    total_vaild_loss = []  # 初始化存儲總驗證損失的列表

    summary_model.eval()  # 設置模型為評估模式

    pbar = tqdm(eval_dataloader)  # 初始化進度條
    # 設置進度條的描述，顯示當前的驗證週期
    pbar.set_description_str('[Epoch {}] Validation'.format(epoch))

    # 驗證數據加載器中的每個批次
    for news, summaries in pbar:
        # 新聞文本轉換為模型輸入格式
        inputs = data_collator(get_model_inputs(summary_tokenizer, news))

        # 模型的輸入數據 ID 和注意力掩碼移動到設備（例如 GPU）。
        X_input_ids = inputs['input_ids'].to(DEVICE)
        X_attention_mask = inputs['attention_mask'].to(DEVICE)  # 移動注意力掩碼到設備

        # 摘要轉換為模型的目標輸入格式，並移動到設備上
        # labels 的輸出張量
          # input_ids：轉換後的摘要文本的標記 ID。
          # attention_mask：填充的位置，指示哪些位置是填充（0），哪些是有效標記（1）。
        labels = data_collator(get_model_inputs(summary_tokenizer, summaries, is_summaries=True))  # 獲取目標數據
        y_input_ids = labels['input_ids'].to(DEVICE)  # 移動目標輸入 ID 到設備

        # 計算"新聞文本與對應摘要"的損失函數。
        loss = summary_model(input_ids=X_input_ids,
                             attention_mask=X_attention_mask,
                             labels=y_input_ids).loss

        # 生成摘要
        gen_summaries = summary_tokenizer.batch_decode(summary_model.generate(X_input_ids), skip_special_tokens=True)

        # 獲取新聞的情感標籤索引
        news_sentiment_labels = get_sentiment_labels(sentiment_model, news).to(DEVICE)
        # 獲取摘要的情感logits
        gen_summaries_sentiment_probs = get_sentiment_probs(sentiment_model, gen_summaries).to(DEVICE)
        gen_summaries_sentiment_logits = torch.log(gen_summaries_sentiment_probs.clamp(min=1e-9))

        # 確保 logits 沒有問題
        if gen_summaries_sentiment_logits.size(0) == 0 or news_sentiment_labels.size(0) == 0:
            print("Error: Sentiment labels or logits are empty")
            continue

        # 使用 CrossEntropyLoss 計算情緒損失
        sentiment_loss = loss_fcn(gen_summaries_sentiment_logits, news_sentiment_labels)

        # 加上情感匹配損失
        loss = loss + sentiment_loss

        total_vaild_loss.append(loss.item())  # 記錄損失

        # 更新進度條後綴，顯示當前批次的損失和平均損失
        pbar.set_postfix_str('Batch Loss: {:.6f}, Average Loss: {:.6f}'.format(loss.item(), sum(total_vaild_loss) / len(total_vaild_loss)))

**從模型檔案名稱中提取 epoch 編號。**

返回提取出的 epoch 編號（整數）。

In [24]:
# 定義一個函數用來從檔案名中解析出模型訓練的 epoch 數字
def get_model_epoch_num(file_name):
    # 從檔案名分割出 epoch 數字，並轉換成整數型態返回
    '''
    model_epoch_10.pt
    file_name.split('_') ["model", "epoch", "10.pt"]
    file_name.split('_')[-1]  "10.pt"
    file_name.split('_')[-1][:-3]  "10"
    '''
    return int(file_name.split('_')[-1][:-3])

# 用來存儲最新的模型檢查點的字典
ckpt = None

# 檢查模型儲存路徑是否存在，不存在則創建
if not os.path.exists(MODEL_SAVE_PTH):
    os.mkdir(MODEL_SAVE_PTH)

# 檢查指定路徑下是否有檔案存在
if len(os.listdir(MODEL_SAVE_PTH)) != 0:
    latest = os.listdir(MODEL_SAVE_PTH)[0]  # 先預設第一個檔案為最新

    # 遍歷該路徑下的所有檔案
    for file in os.listdir(MODEL_SAVE_PTH):
        # 檢查檔案是否為 PyTorch 的模型檔案（以 'pt' 結尾），且檢查是否比已知的最新檔案新
        # file[-2:]:倒數第二個字符開始一直到字符串的結尾
        if file[-2:] == 'pt' and get_model_epoch_num(file) > get_model_epoch_num(latest):
            latest = file  # 更新最新檔案

    # 加載最新的模型檢查點
    # 模型保存路徑 MODEL_SAVE_PTH 和最新的檔案名 latest 組合成一個完整的文件路徑。
    '''
    latest 是最新的檢查點"檔案名稱"
    MODEL_SAVE_PTH 是保存"檔案的目錄"
    torch.load 函數將檢查點檔案加載到 ckpt 變數中，這是一個包含模型狀態和訓練進度的字典。
    '''
    ckpt = torch.load(os.path.join(MODEL_SAVE_PTH, latest))

start_epoch = 0  # 初始化訓練開始的 epoch 數

# 如果 ckpt 不是 None，即成功加載了模型檢查點
if ckpt != None:
    # 恢復訓練進度，可以從上次訓練結束的地方繼續訓練
    start_epoch = ckpt['epoch']  # 從檢查點讀取目前的 epoch 數
    # load_state_dict 將檢查點中的模型參數(權重、biases)，加載到 summary_model 模型中
    summary_model.load_state_dict(ckpt['model_state_dict'])


KeyboardInterrupt: 

**全部印出來**

In [25]:
# 這裡可以設定epoch要從哪裡開始
# 假設我之前訓練到第5個epoch，那我還要繼續10個range，我從這裏的start_epoch改掉就好
# start_epoch=5, range=10

# 定義一個函數來獲取模型檔案的訓練 epoch 編號
def get_model_epoch_num(file_name):
    # 假設檔案名稱格式為 "model_epoch_XX.pt"，提取最後的數字部分作為 epoch 編號
    return int(file_name.split('_')[-1][:-3])

# 初始化模型檢查點為 None
# 變數 ckpt 是用來存儲最新加載的模型檢查點的。模型檢查點通常包含了訓練過程中的關鍵狀態，
# 例如當前的 epoch 編號、模型的參數（狀態字典）和優化器的狀態等。
ckpt = None

# 如果模型保存路徑不存在，則創建該目錄
if not os.path.exists(MODEL_SAVE_PTH):
    os.mkdir(MODEL_SAVE_PTH)

# 如果模型保存路徑下有檔案
if len(os.listdir(MODEL_SAVE_PTH)) != 0:
    # 初始化 latest變數為_目錄中的第一個檔案
    latest = os.listdir(MODEL_SAVE_PTH)[0]
    print(latest) #state_dict_1.pt
    # 遍歷模型保存路徑中的所有檔案
    for file in os.listdir(MODEL_SAVE_PTH):
        # 檢查檔案是否為模型檔案（假設模型檔案的副檔名為 .pt）
        if file[-2:] == 'pt':
            # 更新 assign 變數為找到的 state_dict_5.pt 檔案
            if file == 'state_dict_1.pt':
                assign = file
            # 如果該檔案的 epoch 編號大於目前的最新檔案，則更新 latest
            if get_model_epoch_num(file) > get_model_epoch_num(latest):
                latest = file

    print(latest) #state_dict_5.pt
    print(assign)
    # 加載最新的模型檢查點
    # torch.save 和 torch.load 來保存和加載模型的參數狀態字典
    #ckpt = torch.load(os.path.join(MODEL_SAVE_PTH, latest))
    ckpt = torch.load(os.path.join(MODEL_SAVE_PTH, assign)) #我要從第5個epoch開始跑看看
    # print(ckpt)

# 初始化開始的 epoch 為 0
start_epoch = 0

# 如果找到了模型檢查點
if ckpt != None:
    # 更新start_epoch 為檢查點中保存的epoch
    start_epoch = ckpt['epoch']
    # 加載檢查點中保存的模型狀態字典
    summary_model.load_state_dict(ckpt['model_state_dict'])

print(start_epoch)

state_dict_1.pt
state_dict_2.pt
state_dict_1.pt
1


In [None]:
# 在訓練開始之前進行一次測試步驟，確定模型的初始性能
# 了解訓練資料集摘要與新聞之間的情緒關係
test_step()

# 循環遍歷每一個訓練周期（epoch）
# 從上次保存的 epoch 開始，進行訓練、驗證和測試。
# start_epoch+1 表示從下一個 epoch 開始
# start_epoch+num_train_epochs+1 確保總共訓練的 epoch 總數正確。
for epoch in range(start_epoch+1, start_epoch+num_train_epochs+1):
    train_step(epoch)  # 執行訓練步驟，傳入當前的 epoch 數，用於訓練模型
    eval_step(epoch)   # 執行評估步驟，通常用於在驗證集上評估模型的性能
    test_step()        # 執行測試步驟，用於在測試集上評估模型的性能

    # 保存當前 epoch 的模型檢查點
    torch.save({
        'epoch': epoch,  # 存儲當前的 epoch 數
        'model_state_dict': summary_model.state_dict()  # 存儲模型的權重和偏置
    }, os.path.join(MODEL_SAVE_PTH, f'state_dict_{epoch}.pt'))  # 檢查點保存的文件名及路徑


  6%|▌         | 4/67 [00:07<02:04,  1.97s/it]


KeyboardInterrupt: 

**有調整loss權重的**

In [21]:
test_step()

for epoch in range(start_epoch+1, start_epoch+num_train_epochs+1):
    summary_loss_weight, sentiment_loss_weight = adjust_weights(prev_summary_loss, prev_sentiment_loss)
    train_step(epoch)
    eval_step(epoch)
    test_step()
    torch.save({
        'epoch': epoch,
        'model_state_dict': summary_model.state_dict()
    }, os.path.join(MODEL_SAVE_PTH, f'state_dict_{epoch}.pt'))


  7%|▋         | 5/67 [00:08<01:34,  1.53s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 16%|█▋        | 11/67 [00:18<01:36,  1.72s/it]


KeyboardInterrupt: 

**3. 4.標準化損失和動態調整權重**

In [None]:
def check_sentiment_distribution(corpus):
    sentiment_labels = []
    for text in tqdm(corpus, desc="Processing corpus for sentiment distribution"):
        labels = get_sentiment_labels(sentiment_model, [text])
        sentiment_labels.extend(labels.tolist())

    sentiment_counts = Counter(sentiment_labels)
    print("Sentiment distribution:")
    for label, count in sentiment_counts.items():
        print(f"{label}: {count}")
    return sentiment_counts

# 在訓練開始之前進行一次測試步驟，確定模型的初始性能
test_step()

# 檢查訓練集中的情感標籤分布
all_news = []
for news, _ in train_dataloader:
    all_news.extend(news)

check_sentiment_distribution(all_news)

# 循環遍歷每一個訓練周期（epoch）
for epoch in range(start_epoch+1, start_epoch+num_train_epochs+1):
    summary_loss_weight, sentiment_loss_weight = adjust_weights(avg_summary_loss, avg_sentiment_loss)
    avg_summary_loss, avg_sentiment_loss = train_step(epoch, avg_summary_loss, avg_sentiment_loss)
    eval_step(epoch)
    test_step()
    torch.save({
        'epoch': epoch,
        'model_state_dict': summary_model.state_dict()
    }, os.path.join(MODEL_SAVE_PTH, f'state_dict_{epoch}.pt'))

100%|██████████| 67/67 [01:37<00:00,  1.45s/it, Same Sentiment Rate: 0.328]
Processing corpus for sentiment distribution: 100%|██████████| 1192/1192 [08:31<00:00,  2.33it/s]


Sentiment distribution:
5: 403
2: 587
3: 65
0: 92
4: 23
1: 20
6: 2


[Epoch 1] Training:   0%|          | 0/149 [00:05<?, ?it/s]


TypeError: object of type 'NoneType' has no len()

In [None]:
for epoch in range(start_epoch+1, start_epoch+num_train_epochs+1):
    summary_loss_weight, sentiment_loss_weight = adjust_weights(avg_summary_loss, avg_sentiment_loss)
    avg_summary_loss, avg_sentiment_loss, same_sentiment_rate = train_step(epoch, avg_summary_loss, avg_sentiment_loss)
    eval_step(epoch)
    test_step()
    torch.save({
        'epoch': epoch,
        'model_state_dict': summary_model.state_dict()
    }, os.path.join(MODEL_SAVE_PTH, f'state_dict_{epoch}.pt'))

[Epoch 1] Training: 100%|██████████| 250/250 [29:16<00:00,  7.02s/it, Batch Loss: 1.573619, Avg Loss: 1.262937]


Same Sentiment Rate: 0.453


[Epoch 1] Validation: 100%|██████████| 13/13 [01:28<00:00,  6.84s/it, Batch Loss: 2.704040, Average Loss: 2.702139]
100%|██████████| 112/112 [02:37<00:00,  1.40s/it, Same Sentiment Rate: 0.482]
[Epoch 2] Training:  62%|██████▏   | 154/250 [17:49<10:49,  6.76s/it, Batch Loss: 9.476467, Avg Loss: 8.897231]

**6. 全部印出來**

In [28]:
# 最後一區有用到 __測試集

# 定義一個函數來在測試集上評估模型
def test_step():
    # 將模型設置為評估模式
    summary_model.eval()

    # 初始化一個列表來存儲測試結果
    result = []

    # 初始化一個列表來存儲測試結果
    result_new = []

    # 初始化進度條
    pbar = tqdm(test_dataloader)

    # 遍歷測試數據集中的每個批次
    for index, (news, _) in enumerate(pbar):
        # 使用 data_collator 函數和分詞器處理新聞文本，生成模型輸入
        summary_model_inputs = data_collator(get_model_inputs(summary_tokenizer, news))['input_ids'].to(DEVICE)
        # 使用模型生成摘要
        summary_model_outputs = summary_model.generate(summary_model_inputs)
        # 將生成的摘要轉換為文本
        summarized_text = summary_tokenizer.batch_decode(summary_model_outputs, skip_special_tokens=True)

        # 打印每個生成的摘要
        print(f'news Text {index + 1}: {news}')

        # 打印每個生成的摘要
        print(f'Summarized Text {index + 1}: {summarized_text}')

                # 將結果添加到結果列表中
        result_new.append({
            'News': news[0],
            'Summarized Text': summarized_text,

        })

        # 使用情緒分析模型獲取新聞文本的情緒標籤
        news_sentiment_label = get_sentiment_labels(sentiment_model, news)[0]
        # 使用情緒分析模型獲取生成摘要的情緒標籤
        summarized_sentiment_label = get_sentiment_labels(sentiment_model, summarized_text)[0]

        # 比較新聞文本和生成摘要的情緒標籤是否相同，並將結果添加到結果列表中
        # result是一個列表來存儲測試結果 如果摘要和原文標籤相同就append1，最後計算1總共的數量
        result.append(1 if news_sentiment_label == summarized_sentiment_label else 0)

        # 如果當前批次是最後一個，更新進度條的後綴信息，顯示相同情緒的比例
        # 所以是1越多越好
        # Same Sentiment Rate: 摘要和原文標籤相同/總共原文數量
        if index == len(test_dataloader) - 1:
            pbar.set_postfix_str('Same Sentiment Rate: {:.3f}'.format(result.count(1) / len(result)))
        # 創建 pandas DataFrame 並顯示表格
    results_df = pd.DataFrame(result_new)
    print(results_df)

    # 將結果匯出成 CSV 文件
    results_df.to_csv('results.csv', index=False)

In [None]:
test_step()

  0%|          | 0/112 [00:00<?, ?it/s]

news Text 1: ['UK house prices dip in November  UK house prices dipped slightly in November, the Office of the Deputy Prime Minister (ODPM) has said.  The average house price fell marginally to Â£180,226, from Â£180,444 in October. Recent evidence has suggested that the UK housing market is slowing after interest rate increases, and economists forecast a drop in prices during 2005. But while the monthly figures may hint at a cooling of the market, annual house price inflation is still strong, up 13.8% in the year to November. Economists, however, forecast that ODPM figures are likely to show a weakening in annual house price growth in coming months. "Overall, the housing market activity is slowing down and that is backed up by the mortgage lending and the mortgage approvals data," said Mark Miller, at HBOS Treasury Services. "The ODPM data is a fairly lagging indicator."  The figures come after the Bank of England said the number of mortgages approved in the UK has fallen to the lowest

  1%|          | 1/112 [00:01<02:42,  1.46s/it]

news Text 2: ['LSE \'sets date for takeover deal\'  The London Stock Exchange (LSE) is planning to announce a preferred takeover by the end of the month, newspaper reports claim.  The Sunday Telegraph said the LSE\'s plan was further evidence it wants to retain tight control over its destiny. Both Deutsche Boerse and rival Euronext held talks with the London market last week over a possible offer. A Â£1.3bn offer from Deutsche Boerse has already been rejected, while Euronext has said it will make an all cash bid. Speculation suggests that Paris-based Euronext has the facilities in place to make a bid of Â£1.4bn, while its German rival may up its bid to the Â£1.5bn mark. Neither has yet tabled a formal bid, but the LSE is expected to hold further talks with the two parties later this week. However, the Sunday Telegraph report added that there are signs that Deutsche Boerse chief executive Werner Seifert is becoming increasingly impatient with the LSE\'s managed bid process.  Despite ins

  2%|▏         | 2/112 [00:02<02:40,  1.46s/it]

news Text 3: ['Harinordoquy suffers France axe  Number eight Imanol Harinordoquy has been dropped from France\'s squad for the Six Nations match with Ireland in Dublin on 12 March.  Harinordoquy was a second-half replacement in last Saturday\'s 24-18 defeat to Wales. Bourgoin lock Pascal Pape, who has recovered from a sprained ankle, returns to the 22-man squad. Wing Cedric Heymans and Ludovic Valbon come in for Aurelien Rougerie and Jean-Philippe Grandclaude.  Rougerie hurt his chest against Wales while Grandclaude was a second-half replacement against both England and Wales. Valbon, capped in last June\'s Tests against the United States and Canada, was a second half replacement in the win over Scotland.  France coach Bernard Laporte said Harinordoquy had been axed after a poor display last weekend. "Imanol has been dropped from the squad because the least I can say is that he didn\'t make a thundering comeback against Wales," said Laporte. "We know the Ireland game will be fast and r

  3%|▎         | 3/112 [00:04<02:52,  1.58s/it]

news Text 4: ['Barclays shares up on merger talk  Shares in UK banking group Barclays have risen on Monday following a weekend press report that it had held merger talks with US bank Wells Fargo.  A tie-up between Barclays and California-based Wells Fargo would create the world\'s fourth biggest bank, valued at $180bn (Â£96bn). Barclays has declined to comment on the report in the Sunday Express, saying it does not respond to market speculation. The two banks reportedly held talks in October and November 2004.  Barclays shares were up 8 pence, or 1.3%, at 605 pence by late morning in London on Monday, making it the second biggest gainer in the FTSE 100 index. UK banking icon Barclays was founded more than 300 years ago; it has operations in over 60 countries and employs 76,200 staff worldwide. Its North American divisions focus on business banking, whereas Wells Fargo operates retail and business banking services from 6,000 branches. In 2003, Barclays reported a 20% rise in pre-tax pro

  4%|▎         | 4/112 [00:05<02:37,  1.46s/it]

news Text 5: ['Campaign \'cold calls\' questioned  Labour and the Conservatives are still telephoning the millions of people who have signed up to make sure they do not get marketing "cold calls".  The parties say they can stick to the rules by ensuring that their calls are not marketing - for instance by asking about people\'s voting intentions. The Lib Dems are asking the watchdog overseeing the rules to stop the calls. The information commissioner\'s office says surveys are allowed but people had to be told if personal data was kept. Telephone call centres are expected to be used as never before by all the three major parties in the run-up to the general election.  But seven million telephone numbers are on the Telephone Preference Service (TPS) lists, which ban unsolicited sales and marketing calls. Both schemes are run by the Direct Marketing Association and backed by EU directives on privacy and electronic communications.  The rules on marketing calls apply as much to politicians

  4%|▍         | 5/112 [00:07<02:37,  1.48s/it]

news Text 6: ['Wolves appoint Hoddle as manager  Glenn Hoddle has been unveiled as the new Wolves manager.  The ex-England coach has been given a six-month contract to succeed Dave Jones, who was sacked after the club\'s poor start to the season. Wolves chairman Rick Hayward said: "We\'re delighted Glenn is here. He has a six-month contract so we can test each other out and see if it works." Hoddle, who will work alongside Stuart Gray, has been out of the game since he was sacked by Spurs in 2003. Gray, who has been caretaker manager, was assistant boss when Hoddle was manager at Southampton. "I\'m delighted to be here," said Hoddle.  "I saw the massive potential that Wolves have got and their desire and amibition to get back into the Premiership parallels my ambitions. "Stuart Gray has done a fantastic job as caretaker manager. We\'ve worked together at Southampton and I\'m delighted to be back with him." Wolves chief executive Jez Moxey defended the decision to give Hoddle a short-te

  5%|▌         | 6/112 [00:08<02:38,  1.50s/it]

news Text 7: ['Hantuchova in Dubai last eight  Daniela Hantuchova moved into the quarter-finals of the Dubai Open, after beating Elene Likhotseva of Russia 7-5 6-4, and now faces Serena Williams.  Australian Open champion Williams survived an early scare to beat Russia\'s Elena Bovina 1-6 6-1 6-4. World number one Lindsay Davenport and Anastasia Myskina also progressed. Davenport defeated China\'s Jie Zheng 6-2 7-5, while French Open champion Myskina sailed through after her opponent Marion Bartoli retired hurt. American Davenport will now face fellow former Wimbledon champion, Conchita Martinez of Spain, who ousted seventh-seeded Nathalie Dechy of France 6-1 6-2. Myskina will face eighth-seed Patty Schnyder from Switzerland, who defeated China\'s Li Na 6-3 7-6 (10-8). The other quarter final pits wild card Sania Mirza of India against Jelena Jankovic of Serbia and Montenegro, who both won on Tuesday.  Before her meeting with Martinez, Davenport believes there is some room for improvem

  6%|▋         | 7/112 [00:10<02:38,  1.51s/it]

news Text 8: ['BAA support ahead of court battle  UK airport operator BAA has reiterated its support for the government\'s aviation expansion plans to airports throughout the country.  The comments come a day ahead of a High Court challenge by residents\' groups and local councils to the government\'s White Paper. The judicial review will centre on government plans for expansion at Heathrow, Stansted and Luton airports. BAA, which operates all three, said it was consulting with local communities. "We are...consulting on voluntary compensation schemes which go beyond our statutory obligations," a BAA spokesman said.  Groups challenging the plans include Stop Stansted Expansion, Heathrow anti-noise campaigners HACAN Clearskies and the London boroughs of Hillingdon and Wandsworth. At Heathrow, Gatwick, Edinburgh and Glasgow airports, BAA launched a series of consultations on blight to properties from the proposed expansion in September 2004, which will close next week. The company is also

  7%|▋         | 8/112 [00:11<02:31,  1.46s/it]

news Text 9: ['\'My memories of Marley...\'  To mark the 60th anniversary of the birth of reggae star Bob Marley, Rob Partridge - Marley\'s former head of press at Island Records - remembers the man behind the legend.  Partridge worked with Marley from 1977 until the Jamaican musician\'s death in 1981.  : "I joined Island Records in 1977 and the first week I was there I worked on his show at the Rainbow Theatre. It was one of the last dates he did in London."  : The album Exodus came out in 1977 and that provided five hits and confirmed his global superstar status. "By 1979 he was the biggest touring attraction in the world. I remember going to see dates in Milan and Turin and they were enormous concerts."  : Bob was one of the most mesmeric people I\'ve ever had the privilege to work with. "He must have had an iron will to succeed. Bob was a very driven individual. You realised from the start there was a manifest destiny within him that he believed in. He didn\'t suffer fools gladly. 

  8%|▊         | 9/112 [00:13<02:40,  1.56s/it]

news Text 10: ['Labour trio \'had vote-rig factory\'  Three Labour councillors in Birmingham were caught operating a "vote-rigging factory", an Election Court has heard.  Police found the trio handling unsealed postal ballots in a deserted warehouse in the city during a late-night raid in June 2004, the hearing was told. The votes were later counted towards that month\'s English local elections. The men, elected to the Aston ward, deny collecting votes fraudulently. The judge presiding has indicated the whole postal voting system is under scrutiny. Deputy High Court Judge Richard Mawrey, QC told the hearing at the Birmingham and Midlands Institute the case could have potentially serious consequences for any forthcoming General Election.  The special Election Court, the first in living memory to hear allegations of vote-rigging, opened in Birmingham last month. The case against Muhammad Afzal, Mohammed Islam and Mohammed Kazi is being brought by local Liberal Democrat supporters. They c

  9%|▉         | 10/112 [00:15<02:33,  1.51s/it]

news Text 11: ['US to rule on Yukos refuge call  Yukos has said a US bankruptcy court will decide whether to block Russia\'s impending auction of its main production arm on Thursday.  The Russian oil firm has filed for bankruptcy protection in the US in an attempt to halt the forced sale. However, Judge Letitia Clark said the hearing would continue on Thursday when arguments in the case would be heard. Russian authorities are due to auction off Yuganskneftegas on 19 December to pay a huge tax bill sent to Yukos.  Russian prosecutors are forcing the sale of the firm\'s most lucrative asset Yuganskneftegas to help pay a $27bn (Â£14bn) back tax bill, which they claim is owed by Yukos.  Filing for bankruptcy protection in the US was "a last resort to preserve the rights of our shareholders, employees and customers," said Yukos chief executive Steven Theede. The company added it had opted to take action through American courts as US bankruptcy law gives worldwide jurisdiction over a debtor 

 10%|▉         | 11/112 [00:16<02:39,  1.58s/it]

news Text 12: ["Ray DVD beats box office takings  Oscar-nominated film biopic Ray has surpassed its US box office takings with a combined tally of $80m (Â£43m) from DVD and video sales and rentals.  Ray's success on DVD outstripped its $74m (Â£40m) US box office total, earning more than $40m (Â£22m) on the first day of the DVD's release alone. Ray has been nominated in six Oscar categories including best film and best actor for Jamie Foxx. The film recounts the life of blues singer Ray Charles, who died in 2004. In its first week on home entertainment release the film was the number one selling DVD, with the limited edition version coming in at number 11. Sony horror film The Grudge, starring Michelle Gellar, was the US' second best-selling DVD, with Jennifer Lopez and Richard Gere's romantic comedy Shall We Dance? at number three. Foxx's critically acclaimed performance as Ray has already earned him a Screen Actors Guild Award for best actor, as well as a prestigious Golden Globe. Ray

 11%|█         | 12/112 [00:17<02:25,  1.46s/it]

news Text 13: ['Adriano\'s Chelsea link rejected  Adriano\'s agent Gilmar Rinaldi has insisted that he has had no contact with Chelsea over the striker.  Chelsea were reported to have made inquiries about Inter Milan\'s 22-year-old Brazilian star. Rinaldi told BBC Sport from Rio de Janeiro: "I can assure you that Chelsea have had no dealings whatsoever with either me or Adriano. "Parma and Real Madrid are interested but there\'s nothing new there. Their interest has been known for some time." Adriano has scored 14 goals in 20 Serie A appearances this season. And Chelsea boss Jose Mourinho had claimed that he was in Milan talking to Adriano on the day he is alleged to have held a clandestine meeting with Arsenal defender Ashley Cole. Mourinho said he was "just practising my Portuguese with him because I don\'t need strikers". Rinaldi told BBC Sport: "I have to say that nobody from Chelsea or any other London club has contacted me. "If they want to, that\'s fine. I can tell them what the

 12%|█▏        | 13/112 [00:19<02:16,  1.38s/it]

news Text 14: ['Weak end-of-year sales hit Next  Next has said its annual profit will be Â£5m lower than previously expected because its end-of-year clearance sale has proved disappointing.  "Clearance rates in our end-of-season sale have been below our expectations," the company said. The High Street retailer said it now expected to report annual profits of between Â£415m and Â£425m ($779m-798m). Next\'s shares fell more than 3% following the release of the trading statement.  Next chief executive Simon Wolfson admitted that festive sales were "below where we would expect a normal Christmas to be", but said sales should still top analyst expectations.  Among areas where Next could have done better, Mr Wolfson said menswear ranges were "a little bit too similar to the previous year". Mr Wolfson also said that disappointing pre-Christmas sales were "more to do with the fact that we went in with too much stock rather than (the fact that) demand wasn\'t there for the stock". Next\'s like-

 12%|█▎        | 14/112 [00:20<02:16,  1.39s/it]

news Text 15: ["Howl helps boost Japan's cinemas  Japan's box office received a 3.8% boost last year, with ticket sales worth 211bn yen (Â£1.08bn).  The surge was led by animated movie Howl's Moving Castle, which took 20bn yen (Â£102m) to become the biggest film in Japan in 2004. It is expected to match the 30.7bn yen (Â£157m) record of Hayao Miyazaki's previous film Spirited Away. Japan Motion Picture Producers figures showed that 170 million cinema admissions were made in Japan in 2004. The Last Samurai, starring Tom Cruise, was the biggest foreign movie hit in Japan last year, taking 13.8bn yen (Â£70.7m).  It was followed by Harry Potter and the Prisoner of Azkaban, Finding Nemo and The Lord of the Rings: The Return of the King. The second highest-grossing Japanese film was romantic drama Crying Out Love in the Centre of the World, followed by Be With You and Pocket Monsters Advanced Generation. Japanese films accounted for 37.5% of Japan's box office total last year, with foreign f

 13%|█▎        | 15/112 [00:21<02:09,  1.34s/it]

news Text 16: ["Ailing EuroDisney vows turnaround  EuroDisney, the European home of Mickey Mouse and friends, has said it will sell 253m euros (Â£175m; $328m) of new shares as it looks to avoid insolvency.  The sale is the last part of a plan to restructure 2.4bn euros-worth of debts. Despite struggling since it was opened in 1992, EuroDisney has recently made progress in turning its business around and ticket sales have picked up. However, analysts still question whether it attracts enough visitors to stay open, even with the restructuring.  EuroDisney remains Europe's largest single tourist attraction, attracting some 12.4 million visitors annually. A new attraction - Walt Disney Studios - has recently opened its site near Paris. The company's currently traded stock tumbled in Paris on the latest news, shedding 15% to 22 euro cents. EuroDisney will sell the new shares priced at 9 euros cents each. The US Disney Corporation and Saudi Arabian prince Al-Walid bin Talal, the firm's two m

 14%|█▍        | 16/112 [00:23<02:07,  1.33s/it]

news Text 17: ['Microsoft launches its own search  Microsoft has unveiled the finished version of its home-grown search engine.  The now formally launched MSN search site takes the training wheels off the test version unveiled in November 2003. The revamped engine indexes more pages than before, can give direct answers to factual questions, and features tools to help people create detailed queries. Microsoft faces challenges establishing itself as a serious search site because of the intense competition for queries.  Google still reigns supreme as the site people turn to most often when they go online to answer a query, keep up with news or search for images. But in the last year Google has faced greater competition than ever for users as old rivals, such as Yahoo and Microsoft, and new entrants such as Amazon and Blinkx, try to grab some of the searching audience for themselves. This renewed interest has come about because of the realisation that many of the things people do online be

 15%|█▌        | 17/112 [00:24<02:09,  1.36s/it]

news Text 18: ['Rock group Korn\'s guitarist quits  The guitarist with US rock band Korn has quit the music business, saying he made the decision after experiencing a religious awakening.  Brian \'Head\' Welch told a radio station in California that his bandmates respected his decision to leave. A replacement guitarist has yet to be named by Korn, who are currently at work on their eighth studio album. Welch added that he would appear at a church in Bakersfield to explain how he "got to this place in life". The remaining members of Korn, who are known for their hardcore brand of rock, said they hoped Welch "finds the happiness he is looking for".  The 34-year-old made reference to the band\'s aggressive brand of music and its young fans in his parting statement. "Anger is a good thing, and if kids want to listen to Korn, good, but there\'s happiness after the anger," he told his local radio station in Bakersfield. "I\'m going to show it through my actions, how much I love my fans," add

 16%|█▌        | 18/112 [00:25<02:05,  1.34s/it]

news Text 19: ["Troubled Marsh under SEC scrutiny  The US stock market regulator is investigating troubled insurance broker Marsh & McLennan's shareholder transactions, the firm has said.  The Securities and Exchange Commission has asked for information about transactions involving holders of 5% or more of the firm's shares. Marsh has said it is co-operating fully with the SEC investigation. Marsh is also the focus of an inquiry the New York attorney-general into whether insurers rigged the market. Since that inquiry was launched in October, Marsh has replaced its chief executive and held a boardroom shake-out to meet criticism by lessening the number of company executives on the board. Prosecutors allege that Marsh - the world's biggest insurance broker - and other US insurance firms may have fixed bids for corporate cover. This is the issue at the heart of the inquiry by New York's top law officer, Eliot Spitzer, and a separate prosecution of five insurers by the State of California.

 17%|█▋        | 19/112 [00:27<02:03,  1.33s/it]

news Text 20: ['Campaigners attack MTV \'sleaze\'  MTV has been criticised for "incessant sleaze" by television indecency campaigners in the US.  The Parents Television Council (PTC), which monitors violence and sex on TV, said the cable music channel offered the "cheapest form" of programming. The group is at the forefront of a vociferous campaign to clean up American television. But a spokeswoman for MTV said it was "unfair and inaccurate" to single out MTV for criticism.  The PTC monitored MTV\'s output for 171 hours from 20 March to 27 March 2004, during the channel\'s Spring Break coverage. In its report - MTV Smut Peddlers: Targeting Kids with Sex, Drugs and Alcohol - the PTC said it witnessed 3,056 flashes of nudity or sexual situations and 2,881 verbal references to sex. Brent Bozell, PTC president and conservative activist said: "MTV is blatantly selling raunchy sex to kids. "Compared to broadcast television programmes aimed at adults, MTV\'s programming contains substantially

 18%|█▊        | 20/112 [00:28<02:02,  1.33s/it]

news Text 21: ['Australia rates at four year high  Australia is raising its benchmark interest rate to its highest level in four years despite signs of a slowdown in the country\'s economy.  The Reserve Bank of Australia lifted interest rates 0.25% to 5.5%, their first upwards move in more than a year. However, shortly after the Bank made its decision, new figures showed a fall in economic growth in the last quarter. The Bank said it had acted to curb inflation but the move was criticised by some analysts.  The rate hike was the first since December 2003 and had been well-flagged in advance. However, opposition parties and some analysts said the move was ill-timed given data showing the Australian economy grew just 0.1% between October and December and 1.5% on an annual basis.  The figures, representing a decline from the 0.2% growth in GDP seen between July and September, were below market expectations. Consumer spending remains strong, however, and the Bank is concerned about growing

 19%|█▉        | 21/112 [00:29<02:06,  1.39s/it]

news Text 22: ['Almagro continues Spanish surge  Unseeded Nicolas Almagro became the fifth Spaniard to reach the last eight at the Buenos Aires Open, ousting eighth seed Mariano Zabaleta.  He showed admirable resolve to win a rain-affected match 6-7 6-4 6-4. Compatriot and seventh seed Rafael Nadal also reached the last eight, beating Italian Potito Starace 6-1 6-3. Nadal, playing in the outdoor clay event for the first time, hit some powerful forehands to oust Starace in a match delayed over an hour by rain. "It\'s always a problem to have to stop for rain but one gets used to it," said Spanish teenager Nadal. "Luckily, I was able to keep my pace going throughout the match." He will now play Gaston Gaudio, who beat unseeded Brazilian Flavio Saretta 6-3 6-2 in the day\'s late match.']
Summarized Text 22: ['Unseeded Nicolas Almagro becomes the fifth Spaniard to reach the last eight at the Buenos Aires Open. Compatriot and seventh seed Rafael Nadal also reached the lastEight, beating Ita

 20%|█▉        | 22/112 [00:31<02:04,  1.38s/it]

news Text 23: ['India power shares jump on debut  Shares in India\'s largest power producer, National Thermal Power Corp (NTPC) have risen 13% on their stock market debut.  The government\'s partial sell-off of NTPC is part of a controversial programme to privatise state-run firms. The 865 million share offer, a mix of new shares and sales by the government, raised 54bn rupees($1.2bn). It was India\'s second $1bn stock debut in three months, coming after the flotation by software firm Tata. The share offer was eleven times oversubscribed. "It is a good investment bet," said Suhas Naik, an investment analyst from ING Mutual Fund. "Power needs in India are set to rise and NTPC will benefit from that." Analysts say the success of the NTPC flotation would encourage the government to reduce stakes in more power companies. NTPC has said it will use the money from the share sale to feed the growing needs of the country\'s energy-starved economy. The firm is the largest utility company in Indi

 21%|██        | 23/112 [00:32<01:56,  1.31s/it]

news Text 24: ['Stars pay tribute to actor Davis  Hollywood stars including Spike Lee, Burt Reynolds and Oscar nominee Alan Alda have paid tribute to actor Ossie Davis at a funeral in New York.  Veteran star Ossie Davis, a well-known civil rights activist, died in Miami at the age of 87 on 4 February 2005. Friends and family, including actress Ruby Dee his wife of 56 years, gathered at the Riverside Church on Saturday. Also present at the service was former US president Bill Clinton and singer Harry Belafonte, who gave the eulogy. "He would have been a very good president of the United States," said Mr Clinton. "Like most of you here, he gave more to me than I gave to him."  The 87-year-old was found dead last weekend in his hotel room in Florida, where he was making a film. Police said that he appeared to have died of natural causes. Davis made his acting debut in 1950 in No Way Out starring Sidney Poiter. He frequently collaborated with director Spike Lee, starring in seven Lee films

 21%|██▏       | 24/112 [00:33<02:00,  1.37s/it]

news Text 25: ["Buyers snap up Jet Airways' shares  Investors have snapped up shares in Jet Airways, India's biggest airline, following the launch of its much anticipated initial public offer (IPO).  The IPO for 17.3 million shares was fully sold within 10 minutes of opening, on Friday. Analysts expect Jet to raise at least 16.4bn rupees ($375m; Â£198m) from the offering. Interest in Jet's IPO has been fuelled by hopes for robust growth in India's air travel market.  The share offer, representing about 20% of Jet's equity, was oversubscribed, news agency Reuters reported. Jet, which was founded by London-based travel agent Naresh Goyal, plans to use the cash to buy new planes and cut its debt. The company has grown rapidly since it launched operations in 1993, overtaking state-owned flag carrier Indian Airlines. However, it faces stiff competition from rivals and low-cost carriers. Jet's IPO is the first in a series of expected share offers from Indian companies this year, as they move

 22%|██▏       | 25/112 [00:35<01:57,  1.35s/it]

news Text 26: ['Dibaba breaks 5,000m world record  Ethiopia\'s Tirunesh Dibaba set a new world record in winning the women\'s 5,000m at the Boston Indoor Games.  Dibaba won in 14 minutes 32.93 seconds to erase the previous world indoor mark of 14:39.29 set by another Ethiopian, Berhane Adera, in Stuttgart last year. But compatriot Kenenisa Bekele\'s record hopes were dashed when he miscounted his laps in the men\'s 3,000m and staged his sprint finish a lap too soon. Ireland\'s Alistair Cragg won in 7:39.89 as Bekele battled to second in 7:41.42. "I didn\'t want to sit back and get out-kicked," said Cragg. "So I kept on the pace. The plan was to go with 500m to go no matter what, but when Bekele made the mistake that was it. The race was mine." Sweden\'s Carolina Kluft, the Olympic heptathlon champion, and Slovenia\'s Jolanda Ceplak had winning performances, too. Kluft took the long jump at 6.63m, while Ceplak easily won the women\'s 800m in 2:01.52.']
Summarized Text 26: ["Tirunesh Dib

 23%|██▎       | 26/112 [00:36<01:54,  1.33s/it]

news Text 27: ['Glasgow hosts tsunami benefit gig  The top names in Scottish music are taking part in a benefit concert in aid of the victims of the Asian tsunami.  All 10,000 tickets for Saturday\'s concert, featuring Franz Ferdinand, Belle and Sebastian and Travis, at Glasgow\'s SECC sold out in 36 hours. Mull Historical Society, Deacon Blue, Idlewild, Texas, Mogwai and Teenage Fanclub are among the other acts performing at the concert. Organisers hope to raise at least Â£250,000 from the show.  It follows a Cardiff gig starring Eric Clapton, Keane and Jools Holland, which raised more than Â£1.25m. And it is taking place on the same night as a tsunami benefit show in Bristol, which will see Massive Attack and Portishead share a stage for the first time. Colin MacIntyre, of Mull Historical Society, was playing another gig on the same day but said he was determined to make the Glasgow benefit. He said: "I think we were all affected by seeing the reports coming from the Far East. "We al

 24%|██▍       | 27/112 [00:37<01:49,  1.29s/it]

news Text 28: ['Hewitt fights back to reach final  Lleyton Hewitt kept his dream of an Australian Open title alive with a four-set win over Andy Roddick in Friday\'s second semi-final.  The home favourite will face Marat Safin in Sunday\'s final after coming through 3-6 7-6 (7-3) 7-6 (7-4) 6-1. Hewitt fought back from a set down and trailed in both tie-breaks but would not be denied, thrilling the Melbourne crowd with a typically battling effort. He is aiming to be the first Australian winner since Mark Edmondson in 1976. Hewitt is the first Australian to make the final since Pat Cash lost to Mats Wilander in 1988, but faces a huge challenge against Safin - the conqueror of Roger Federer. After needing five sets in his last two matches there was reason to think Hewitt might struggle for fitness. He certainly made a sluggish start, dropping his opening service game, and Roddick dominated with his huge serve as he took the first set.  After 12 tense games in the second, the key moment ca

 25%|██▌       | 28/112 [00:39<01:52,  1.34s/it]

news Text 29: ['Crucial decision on super-casinos  A decision on whether to allow Westminster to legislate on super-casinos is set to be made by the Scottish Parliament.  The government has plans for up to eight Las Vegas style resorts in the UK, one of which is likely to be in Glasgow. Scottish ministers insist they will still have the final say on whether a super-casino will be built in Scotland. But opposition parties say that will not happen in practice. The vote is due to be taken on Wednesday and is expected to be close.  The Scottish Executive believes that the legislation should be handled by Westminster. The new law will control internet gambling for the first time and is aimed at preventing children from becoming involved. A super-casino in Glasgow could be located at Ibrox or the Scottish Exhibition and Conference Centre. The new gambling bill going through Westminster will allow casino complexes to open to the public, have live entertainment and large numbers of fruit machi

 26%|██▌       | 29/112 [00:40<01:48,  1.31s/it]

news Text 30: ['Tigers wary of Farrell \'gamble\'  Leicester say they will not be rushed into making a bid for Andy Farrell should the Great Britain rugby league captain decide to switch codes.  "We and anybody else involved in the process are still some way away from going to the next stage," Tigers boss John Wells told BBC Radio Leicester. "At the moment, there are still a lot of unknowns about Andy Farrell, not least his medical situation. "Whoever does take him on is going to take a big, big gamble." Farrell, who has had persistent knee problems, had an operation on his knee five weeks ago and is expected to be out for another three months. Leicester and Saracens are believed to head the list of rugby union clubs interested in signing Farrell if he decides to move to the 15-man game.  If he does move across to union, Wells believes he would better off playing in the backs, at least initially. "I\'m sure he could make the step between league and union by being involved in the centre

 27%|██▋       | 30/112 [00:41<01:44,  1.28s/it]

news Text 31: ['Redknapp poised for Saints  Southampton are set to unveil Harry Redknapp as their new manager at a news conference at 1500 GMT on Wednesday.  The former Portsmouth boss replaces Steve Wigley, who has been relieved of first-team duties after just one win in 14 league games in charge. Redknapp, 57, quit his Fratton Park position on 24 November and vowed: "I will not go down the road - no chance." Pompey coach Kevin Bond is poised to join Redknapp, who will be Saints\' third boss of the season. Redknapp\'s first game in charge will be at home to Middlesbrough on Saturday. Portsmouth chairman Milan Mandaric said he was "disappointed" by the news and claimed Redknapp had been in talks with Southampton for "some time".  "It would appear that negotiations over this have been going on for some time," Mandaric said on Portsmouth\'s official website. "I am surprised and a little shocked that the chairman of Southampton has not picked up the phone and kept me informed." According 

 28%|██▊       | 31/112 [00:43<01:46,  1.31s/it]

news Text 32: ['Blair and Blunkett Sheffield trip  Tony Blair is to join Home Secretary David Blunkett in a visit to Sheffield on Thursday.  Mr Blunkett\'s conduct is being looked at to establish whether he abused his position in relation to his ex-lover. The Parliamentary standards watchdog is looking at his decision to give Kimberly Quinn free rail tickets. He is also being investigated over the visa application of Mrs Quinn\'s ex-nanny. The visit to Sheffield will be seen as a show of unity by Mr Blair.  On Wednesday during Prime Minister\'s Questions, Tory leader Michael Howard went on the offensive over comments Mr Blunkett is alleged to have made in a new biography. He is understood to have made a series of criticisms about his Cabinet colleagues from the prime minister down. Mr Howard said Mr Blunkett had complained he had inherited a "giant mess" when he took over at the Home Office from Jack Straw, now foreign secretary. The Tory leader went on: "He doesn\'t stop there: he thi

 29%|██▊       | 32/112 [00:44<01:55,  1.44s/it]

news Text 33: ['More power to the people says HP  The digital revolution is focused on letting people tell and share their own stories, according to Carly Fiorina, chief of technology giant Hewlett Packard.  The job of firms such as HP now, she said in a speech at the Consumer Electronics Show (CES), was to ensure digital and physical worlds fully converged. She said the goal for 2005 was to make people the centre of technology. CES showcases 50,000 new gadgets that will be hitting the shelves in 2005. The tech-fest, the largest of its kind in the world, runs from 6 to 9 January. "The digital revolution is about the democratisation of technology and the experiences it makes possible," she told delegates. "Revolution has always been about giving power to the people." She added: "The real story of the digital revolution is not just new products, but the millions of experiences made possible and stories that millions can tell." Part of giving people more control has been about the freeing

 29%|██▉       | 33/112 [00:46<01:59,  1.51s/it]

news Text 34: ['Hundreds vie for best film Oscar  A total of 267 films are eligible for the best film Oscar but only five will be chosen to go forward as nominees.  The Academy of Motion Picture, Arts and Sciences has sent out the first ballot papers with the full list of films vying for recognition. Among those expected to receive nominations are The Aviator, Million Dollar Baby and Sideways. Academy members will now vote for their favourites before the final nominees are announced on 25 January.  To be eligible for nomination a film must have been shown in a commercial theatre for seven consecutive days before the deadline of 31 December. Director Martin Scorsese\'s The Aviator, starring Leonardo DiCaprio went on general release on Christmas Day in the US, ensuring it just made the deadline. Studios have already begun lobbying voters, taking out full page adverts in trade publications such as Variety urging them to remember particular films when it comes to choosing what to back. Oth

 30%|███       | 34/112 [00:47<01:51,  1.43s/it]

news Text 35: ['\'Strong dollar\' call halts slide  The US dollar\'s slide against the euro and yen has halted after US Treasury Secretary John Snow said a strong dollar was "in America\'s interest".  But analysts said any gains are likely to be short-lived as problems with the US economy were still significant. They also pointed out that positive comments apart, President George W Bush\'s administration had done little to stop the dollar\'s slide. A weak dollar helps boost exports and narrow the current account deficit. The dollar was trading at $1.2944 against the euro at 2100GMT, still close to the $1.3006 record level set on 10 November. Against the Japanese yen, it was trading at 105.28 yen, after hitting a seven-month low of 105.17 earlier in the day.  Policy makers in Europe have called the dollar\'s slide "brutal" and have blamed the strength of the euro for dampening economic growth. However, it is unclear whether ministers would issue a declaration aimed at curbing the euro\'

 31%|███▏      | 35/112 [00:49<01:56,  1.51s/it]

news Text 36: ['EU-US seeking deal on air dispute  The EU and US have agreed to begin talks on ending subsidies given to aircraft makers, EU Trade Commissioner Peter Mandelson has announced.  Both sides hope to reach a negotiated deal over state aid received by European aircraft maker Airbus and its US rival Boeing, Mr Mandelson said. Airbus and Boeing accuse each other of benefiting from illegal subsidies. Mr Mandelson said the EU and US hoped to avoid having to resolve the dispute at the World Trade Organisation (WTO).  "With this agreement the EU and US have confirmed their willingness to resolve the dispute which has arisen between them," Mr Mandelson said. "I hope our negotiations in the next three months will lead to an agreement ending subsidies to development and production of large civil aircraft." Last year, the US terminated an agreement with the EU, reached in 1992, which limits the subsidies countries can hand over to civil aircraft makers. The US filed a complaint against

 32%|███▏      | 36/112 [00:50<01:49,  1.45s/it]

news Text 37: ['WorldCom bosses\' $54m payout  Ten former directors at WorldCom have agreed to pay $54m (Â£28.85m), including $18m from their own pockets, to settle a class action lawsuit, reports say.  James Wareham, a lawyer representing one of the directors, told Reuters the 10 had agreed to pay those who lost billions when the firm collapsed. The remaining $36m will be paid by the directors\' insurers. But, a spokesman for the prosecutor, New York State Comptroller Alan Hevesi, said no formal agreement had been made.  Corporate governance experts said that if the directors do dip into their own pockets for the settlement, it will set a new standard for the accountability of bosses, when the firms they oversee face problems.  "Directors very rarely pay," said Charles Elson, chairman of the Center for Corporate Governance at the University of Delaware. He added that the settlement "sends a pretty strong shockwave through the director world". A formal agreement on the payout is expect

 33%|███▎      | 37/112 [00:52<01:49,  1.46s/it]

news Text 38: ['Hewitt falls to Dent  Lleyton Hewitt suffered a shock defeat to Taylor Dent in the quarter-finals of the Australian Hardcourt Championships in Adelaide on Friday.  The top seed was a strong favourite for the title but went down 7-6 (7-4) 6-3 to the American. Dent will face Juan Ignacio Chela next after the fourth seed was too strong for Jurgen Melzer. Olivier Rochus beat third seed Nicolas Kiefer 6-7 (4-7) 7-6 (8-6) 7-5 and will take on second seed Joachim Johansson. The Swede reached the last four by beating compatriot Thomas Enqvist 6-3 4-6 6-1. "I felt like I was striking the ball much better," said Johansson. "I felt like I had a lot of break chances, I didn\'t take care of them all, but I broke him four times and he only broke me once. "I felt that was the key to get up in the set early."']
Summarized Text 38: ['Lleyton Hewitt beaten in quarter-finals of Australian Hardcourt Championships. Top seed beaten 7-6 (7-4) 6-3 by American Taylor Dent. Dent will face Juan I

 34%|███▍      | 38/112 [00:53<01:47,  1.46s/it]

news Text 39: ['Apple laptop is \'greatest gadget\'  The Apple Powerbook 100 has been chosen as the greatest gadget of all time, by US magazine Mobile PC.  The 1991 laptop was chosen because it was one of the first "lightweight" portable computers and helped define the layout of all future notebook PCs. The magazine has compiled an all-time top 100 list of gadgets, which includes the Sony Walkman at number three and the 1956 Zenith remote control at two. Gadgets needed moving parts and/or electronics to warrant inclusion. The magazine staff compiled the list and specified that gadgets also needed to be a "self-contained apparatus that can be used on its own, not a subset of another device".  "In general we included only items that were potentially mobile," said the magazine.  "In the end, we tried to get to the heart of what really makes a gadget a gadget," it concluded. The oldest "gadget" in the top 100 is the abacus, which the magazine dates at 190 A.D., and put in 60th place. Other

 35%|███▍      | 39/112 [00:55<01:58,  1.62s/it]

news Text 40: ['Strachan turns down Pompey  Former Southampton manager Gordon Strachan has rejected the chance to become Portsmouth\'s new boss.  The Scot was Pompey chairman Milan Mandaric\'s first choice to replace Harry Redknapp, who left Fratton Park for rivals Saints earlier in December. "I think it\'s a fantastic job for anybody apart from somebody who has just been the Southampton manager," Strachan told the BBC. Club director Terry Brady held initial talks with Strachan on Saturday. The former Scotland international added that joining Southampton\'s local rivals would not be a wise move. "It\'s got everything going for it but I\'ve got too many memories of the other side and I don\'t want to sour those memories," he said. "Everything\'s right - it\'s 10 minutes away, there are good players there, a good set-up, a good atmosphere at the ground. "There\'s lots to do but it\'s not right for somebody who has just been the Southampton manager." Since Redknapp\'s departure, executive

 36%|███▌      | 40/112 [00:56<01:49,  1.53s/it]

news Text 41: ['Wi-fi web reaches farmers in Peru  A network of community computer centres, linked by wireless technology, is providing a helping hand for poor farmers in Peru.  The pilot scheme in the Huaral Valley, 80 kilometres north of the capital Lima, aims to offer the 6,000-strong community up-to-date information on agricultural market prices and trends. The Agricultural Information Project for Farmers of the Chancay-Huaral Valley also provides vital links between local organisations in charge of water irrigation, enabling them to coordinate their actions. More than 13,000 rural inhabitants, as well as 18,000 students in the region, will also benefit from the telecoms infrastructure.  The 14 telecentres uses only free open source software and affordable computer equipment. The network has been three years in the making and was officially inaugurated in September.  The non-government organisation, Cepes (Peruvian Centre for Social Studies) led the $200,000 project, also backed by

 37%|███▋      | 41/112 [00:58<01:51,  1.57s/it]

news Text 42: ['UK needs tax cuts, Tories insist  A major change of direction is needed in Britain if it is to prosper, the shadow chancellor said as the Tory Party spring conference began.  Oliver Letwin said the UK could not compete with other countries without the Â£4bn tax cuts he was promising. Tory co-chairman Liam Fox had opened the forum in Brighton with an attack on Labour\'s record and party leader Michael Howard is due to speak later. Tony Blair has said Conservative policies would cause economic failure. But Mr Letwin said Britain had fallen from fourth to 11th in the international economic competitiveness league.  "Can this country compete, can this country prosper, unless we do something about the burden of regulation and tax on our economy?" he said. "If we are going to take on the great challenges, the challenges like those posed by the Chinese and the Indians, we have got to do something about getting down the burden of regulation and getting down the burden of tax," h

 38%|███▊      | 42/112 [00:59<01:44,  1.49s/it]

news Text 43: ['Mild winter drives US oil down 6%  US oil prices have fallen by 6%, driven down by forecasts of a mild winter in the densely populated northeast.  Light crude oil futures fell $2.86 to $41.32 a barrel on the New York Mercantile Exchange (Nymex), and have now lost $4 in five days. Nonetheless, US crude is still 30% more expensive than at the beginning of 2004, boosted by growing demand and bottlenecks at refineries. Traders ignored the possible effects of Asia\'s tidal waves on global supplies.  Instead, the focus is now on US consumption, which is heavily influenced in the short term by the weather. "With the revised milder temperatures... I\'m more inclined to think we\'ll push lower and test the $40-40.25 range," said John Brady of ABN AMRO. "The market definitely feels to be on the defensive." Statistics released last week showed that stockpiles of oil products in the US had risen, an indication that severe supply disruptions may not arise this winter, barring any se

 38%|███▊      | 43/112 [01:01<01:38,  1.42s/it]

news Text 44: ['Edu describes tunnel fracas  Arsenal\'s Edu has lifted the lid on the scenes that followed Manchester United\'s win over the Gunners.  The Brazilian confirmed tempers had flared but could shed no light on reports that food was thrown at United boss Sir Alex Ferguson. "I saw people being pulled apart, people pushing, pointing and shouting," he told Uefa\'s official website. "The United players were trying to wind us up about the result but I didn\'t see any soup being thrown at anyone." However, Edu tried to play down the incidents, adding: "There was nothing that I haven\'t seen in Brazilian derbies. "Derby matches in Brazil are worse. I like to play in games like this with this intense rivalry." But Edu was highly critical of the ferocity of some of United\'s challenges during the game, particularly on Jose Antonio Reyes. "I think we were a lot fairer in the tackles than United," he said. "Reyes was being kicked all over the park - they were beating up the boy and Gary

 39%|███▉      | 44/112 [01:02<01:32,  1.35s/it]

news Text 45: ['Robots learn \'robotiquette\' rules  Robots are learning lessons on "robotiquette" - how to behave socially - so they can mix better with humans.  By playing games, like pass-the-parcel, a University of Hertfordshire team is finding out how future robot companions should react in social situations. The study\'s findings will eventually help humans develop a code of social behaviour in human-robot interaction. The work is part of the European Cogniron robotics project, and was on show at London\'s Science Museum.  "We are assuming a situation in which a useful human companion robot already exists," said Professor Kerstin Dautenhahn, project leader at Hertfordshire. "Our mission is to look at how such a robot should be programmed to respect personal spaces of humans."  The research also focuses on human perception of robots, including how they should look, and how a robot can learn new skills by imitating a human demonstrator. "Without such studies, you will build robots 

 40%|████      | 45/112 [01:03<01:30,  1.35s/it]

news Text 46: ['Podcasts mark rise of DIY radio  An Apple iPod or other digital music players can hold anything up to 10,000 songs, which is a lot of space to fill.  But more and more iPod owners are filling that space with audio content created by an unpredictable assortment of producers. It is called "podcasting" and its strongest proponent is former MTV host and VJ (video jockey) Adam Curry. Podcasting takes its name from the Apple iPod, although you do not need an iPod to create one or to listen to a podcast. A podcast is basically an internet-based radio show which podcasters create, usually in the comfort of their own home. They need only a microphone, a PC, and some editing software. They then upload their shows to the internet and others can download and listen to them, all for free. Using technology based on XML computer code and RSS - Really Simple Syndication - listeners can subscribe to podcasts collected automatically in a bit of software, which Mr Curry has pioneered. The

 41%|████      | 46/112 [01:05<01:38,  1.49s/it]

news Text 47: ['Apple sues to stop product leaks  Computer firm Apple has issued a lawsuit to prevent online leaks of information about future products.  The lawsuit, against an unidentified individual, comes just weeks before the MacWorld conference in San Francisco, used to showcase new products. The complaint said an "unidentified individual... has recently misappropriated and disseminated confidential information". The lawsuit was filed with the Santa Clara California Superior Court. Apple is famously secretive about its future product launches while Apple users are equally famous for speculating about new technology from the company. Fans have speculated in recent weeks about the possibility of a new type of iPod being announced at the MacWorld conference.  Apple said in the seven-page complaint, filed on 13 December, that it did not know the "true names or capacities, whether individual, associate, corporate or otherwise," of the defendants. The company said it would amend the co

 42%|████▏     | 47/112 [01:06<01:34,  1.45s/it]

news Text 48: ["Chinese exports rise 25% in 2004  Exports from China leapt during 2004 over the previous year as the country continued to show breakneck growth.  The spurt put China's trade surplus - a sore point with some of its trading partners - at a six-year high. It may also increase pressure on China to relax the peg joining its currency, the yuan, with the weakening dollar. The figures released by the Ministry of Commerce come as China's tax chief confirmed that growth had topped 9% in 2004 for the second year in a row. State Administration of Taxation head Xie Xuren said a tightening of controls on tax evasion had combined with the rapid expansion to produce a 25.7% rise in tax revenues to 2.572 trillion yuan ($311bn; Â£165bn).  According to the Ministry of Commerce, China's exports totalled $63.8bn in December, taking the annual total up 35.4% to $593.4bn. With imports rising a similar amount, the deficit rose to $43.4bn. The increased tax take comes despite healthy tax rebate

 43%|████▎     | 48/112 [01:08<01:29,  1.40s/it]

news Text 49: ['French boss to leave EADS  The French co-head of European defence and aerospace group EADS Philippe Camus is to leave his post.  Mr Camus said in a statement that he has accepted the invitation to return full-time to the Lagardere group, which owns 30% of EADS. "I will give up my role as soon as the board of directors asks me to do so," he said. Airbus head Noel Forgeard is now set to replace Mr Camus, bringing the company\'s power struggle to an end. Fighting between Mr Camus and Mr Forgeard has hit the headlines in France and analysts feared that this fighting could destabilise the defence and aerospace group. French finance minister Herve Gaymard is on record as saying that he "deplored" the infighting at the company. The company should now be able put this dispute behind it, with the departure of Mr Camus and with the clear support given to Mr Forgeard by the Lagardere group, the main French shareholder of EADS. The other main shareholders of EADS are the French gov

 44%|████▍     | 49/112 [01:09<01:29,  1.41s/it]

news Text 50: ['Text message record smashed  UK mobile owners continue to break records with their text messaging, with latest figures showing that 26 billion texts were sent in total in 2004.  The figures collected by the Mobile Data Association (MDA) showed that 2.4 billion were fired off in December alone, the highest monthly total ever. That was 26% more than in December 2003. The records even surpassed the MDA\'s own predictions, it said. Every day 78 million messages are sent and there are no signs of a slow down. Before December\'s bumper text record, the previous highest monthly total was in October 2004, when 2.3 billion were sent. Text messaging is set to smash more records in 2005 too, said the MDA, with forecasts suggesting a total of 30 billion for the year.  Even though mobiles are becoming increasingly sophisticated with much more multimedia applications, texting is still one of the most useful functions of mobiles. People are using SMS to do much more too. Booking cinem

 45%|████▍     | 50/112 [01:11<01:27,  1.40s/it]

news Text 51: ['Godzilla gets Hollywood fame star  Movie monster Godzilla has received a star on Hollywood\'s Walk of Fame, honouring both his 50th birthday and the launch of his 28th film.  An actor dressed as the giant creature breathed smoke over photographers on Monday as Godzilla received the 2,271st star on Hollywood Boulevard. "Godzilla should thank you for this historical and monumental star," said Final Wars producer Shogo Tomiyama. "But unfortunately, he cannot speak English," he added. Hollywood\'s honorary mayor, Johnny Grant, said: "I do hereby proclaim this Godzilla Day in Hollywood.  "He\'s loose, he\'s wild, and I\'m getting the hell out of here," he added. The premiere of Godzilla: Final Wars at Grauman\'s Chinese Theatre followed the ceremony on Hollywood Boulevard. The monster was joined by co-stars including Japanese pop star and actor Masahiro Matsuoka. Director Ryuhei Kitamura said it may not be Godzilla\'s final outing, as it has been billed. "That\'s what the pr

 46%|████▌     | 51/112 [01:12<01:21,  1.34s/it]

news Text 52: ['Christmas sales worst since 1981  UK retail sales fell in December, failing to meet expectations and making it by some counts the worst Christmas since 1981.  Retail sales dropped by 1% on the month in December, after a 0.6% rise in November, the Office for National Statistics (ONS) said. The ONS revised the annual 2004 rate of growth down from the 5.9% estimated in November to 3.2%. A number of retailers have already reported poor figures for December. Clothing retailers and non-specialist stores were the worst hit with only internet retailers showing any significant growth, according to the ONS.  The last time retailers endured a tougher Christmas was 23 years previously, when sales plunged 1.7%.  The ONS echoed an earlier caution from Bank of England governor Mervyn King not to read too much into the poor December figures. Some analysts put a positive gloss on the figures, pointing out that the non-seasonally-adjusted figures showed a performance comparable with 2003

 46%|████▋     | 52/112 [01:13<01:18,  1.31s/it]

news Text 53: ['\'No-one can define new hunt ban\'  The new law banning hunting with dogs is "so poorly drafted" no-one can define the offence, pro-hunt MPs say.  The accusation came after it emerged a Devon man had been told he could use his four dogs to "chase away unwanted animals" from his farm. Because he did not intend to kill deer or foxes it was not hunting. Lib Dem MP Lembit Opik said ministers had invented a new category of hunting - chasing away - and asked how police were supposed to interpret the rules.  North Devon landowner Giles Bradshaw was put in touch with the Middle Way Group, of which Mr Opik is a co-chairman, after he had been in contact with the rural affairs ministry, Defra. He had asked whether his technique of using his four dogs to frighten off deer and foxes would be outlawed under the Hunting Act. Mr Bradshaw was initially told it was an offence - prompting him to complain. The Middle Way group also said Mr Bradshaw would be put in a position where he would

 47%|████▋     | 53/112 [01:14<01:20,  1.36s/it]

news Text 54: ['Soaring oil \'hits world economy\'  The soaring cost of oil has hit global economic growth, although world\'s major economies should weather the storm of price rises, according to the OECD.  In its latest bi-annual report, the OECD cut its growth predictions for the world\'s main industrialised regions. US growth would reach 4.4% in 2004, but fall to 3.3% next year from a previous estimate of 3.7%, the OECD said. However, the Paris-based economics think tank said it believed the global economy could still regain momentum.  Forecasts for Japanese growth were also scaled back to 4.0% from 4.4% this year and 2.1% from 2.8% in 2005. But the outlook was worst for the 12-member eurozone bloc, with already sluggish growth forecasts slipping to 1.8% from 2.0% this year and 1.9% from 2.4% in 2005, the OECD said. Overall, the report forecast total growth of 3.6% in 2004 for the 30 member countries of the OECD, slipping to 2.9% next year before recovering to 3.1% in 2006. "There a

 48%|████▊     | 54/112 [01:16<01:21,  1.40s/it]

news Text 55: ['Hillbillies singer Scoggins dies  Country and Western musician Jerry Scoggins has died in Los Angeles at the age of 93, his family has said.  Scoggins was best remembered for singing the theme tune to popular US TV show The Beverly Hillbillies. The Texan-born singer approached the producers of the programme with theme tune The Ballad of Jed Clampett for the pilot which was screened in 1962. The show, which told the story of a poor man striking oil and moving to Beverly Hills, ran until 1971.  Scoggins\' daugher Jane Kelly Misel said that her father never tired of the song and would sing it at least once a day. "He\'d sing it at birthdays and anniversaries and variety shows. He never stopped performing it," she said. When a film version of The Beverly Hillbillies was made in 1993, Scoggins came out of retirement to perform the theme tune. Scoggins sang the lyrics while bluegrass stars Lester Flatt and Earl Scruggs played guitar and banjo.']
Summarized Text 55: ['Jerry Sc

 49%|████▉     | 55/112 [01:17<01:18,  1.37s/it]

news Text 56: ['Kilroy unveils immigration policy  Ex-chatshow host Robert Kilroy-Silk has attacked UK policy on immigration saying Britain\'s open door approach is hitting low wage "indigenous" workers.  The Veritas leader said the only people to benefit from immigrants from places like Poland were employers, landlords, members of the \'metropolitan elite\'. The MEP said his party would only admit foreigners who were required because they had specific skills to offer. And he argued asylum cost Â£2bn a year for 14,000 successful applicants.  Mr Kilroy-Silk said that worked out at Â£143,000 per successful asylum seeker. He said Veritas wanted to grant an amnesty for all those in Britain claiming asylum and who have children and deport everyone else. Britain should take its fair share of asylum seekers under the United Nations Convention on Human Rights, he argued. And Mr Kilroy-Silk said he wanted to spend an extra Â£500m a year to help provide for refugees abroad.']
Summarized Text 56:

 50%|█████     | 56/112 [01:19<01:15,  1.35s/it]

news Text 57: ['Candidate resigns over BNP link  A prospective candidate for the UK Independence Party (UKIP) has resigned after admitting a "brief attachment" to the British National Party(BNP).  Nicholas Betts-Green, who had been selected to fight the Suffolk Coastal seat, quit after reports in a newspaper that he attended a BNP meeting. The former teacher confirmed he had attended the meeting but said that was the only contact he had with the group. Mr Betts-Green resigned after being questioned by the party\'s leadership. A UKIP spokesman said Mr Betts-Green\'s resignation followed disclosures in the East Anglian Daily Times last month about his attendance at a BNP meeting. "He did once attend a BNP meeting. He did not like what he saw and heard and will take no further part of it," the spokesman added. A meeting of Suffolk Coastal UKIP members is due to be held next week to discuss a replacement. Mr Betts-Green, of Woodbridge, Suffolk, has also resigned as UKIP\'s branch chairman.

 51%|█████     | 57/112 [01:20<01:12,  1.33s/it]

news Text 58: ['Asylum children to face returns  The UK government is planning to return asylum seeker children without parents to Albania.  The trial scheme, which could start in weeks, may be extended to apply to children from other countries. Children\'s charities have reacted with alarm, saying the policy amounts to forcible removal and may not guarantee the safety of those affected. But the Home Office says it may be in the children\'s best interests if it reunites them with their communities.  The pilot, included in the government\'s five-year immigration plan, aims to return unaccompanied asylum-seeking children from Albania who have failed in their asylum claims.  Since 2002, at least 9,000 under-18s have arrived in the UK to seek asylum without other family members. These children automatically become the responsibility of social services. Up to now, ministers have held back from final removal orders against unaccompanied children until after they are legally adults at 18. At 

 52%|█████▏    | 58/112 [01:21<01:16,  1.42s/it]

news Text 59: ['France Telecom gets Orange boost  Strong growth in subscriptions to mobile phone network Orange has helped boost profits at owner France Telecom.  Orange added more than five million new customers in 2004, leading to a 10% increase in its revenues. Increased take-up of broadband telecoms services also boosted France Telecom\'s profits, which showed a 5.5% rise to 18.3bn euros ($23.4bn; Â£12.5bn). France Telecom is to spend 578m euros on buying out minority shareholders in data services provider Equant.  France Telecom, one of the world\'s largest telecoms and internet service providers, saw its full-year sales rise 2.2% to 47.2bn euros in 2004.  Orange enjoyed strong growth outside France and the United Kingdom - its core markets - swelling its subscriber base to 5.4 million. France Telecom\'s broadband customers also increased, rising to 5.1 million across Europe by the end of the year. The firm said it had met its main strategic objectives of growing its individual bu

 53%|█████▎    | 59/112 [01:23<01:15,  1.42s/it]

news Text 60: ["Survey confirms property slowdown  Government figures have confirmed a widely reported slowdown of the UK's housing market in late 2004.  House prices were 11.8% higher on the year in the last quarter of 2004, down from 16.3% in the July-to-September quarter, the Land Registry said. The average house price in England and Wales was Â£182,920, down from Â£187,971 in July-September. The volume of sales between October and December dropped by nearly a quarter from the same period in 2003. The government figures are the first official confirmation of falls in the market at the end of 2004. Land Registry figures are less up to date than those of banks and building societies, since they record completions not mortgage approvals. However, the figures are viewed as the most accurate measure of house prices as they include all property transactions, including cash sales.  The cost of buying a home fell in seven out of 10 regions between the third and fourth quarters of 2004.  The

 54%|█████▎    | 60/112 [01:24<01:14,  1.43s/it]

news Text 61: ['Vodafone appoints new Japan boss  Vodafone has drafted in its UK chief executive William Morrow to take charge of its troubled Japanese operation.  Mr Morrow will succeed Shiro Tsuda as president of Vodafone KK, Japan\'s number three mobile operator, in April. Mr Tsuda, who will become chairman, was appointed president only two months ago but the business has struggled since then, losing customers in January. Vodafone had pinned its hopes on the launch of its 3G phones in November but demand for them has been slow.  While it has more than 15 million customers in Japan, Vodafone has found it difficult to satisfy Japan\'s technologically demanding mobile users. It suffered a net loss of more than 58,000 customers in January, its second monthly reverse in the last year. "Vodafone is going to need to put a lot of money into Japan if it wants to rebuild the business," Tetsuro Tsusaka, a telecoms analyst with Deutsche Bank, told Reuters. "I do not know if it will be worth it 

 54%|█████▍    | 61/112 [01:26<01:09,  1.37s/it]

news Text 62: ['UK\'s National Gallery in the pink  The National Gallery, home to some of the UK\'s greatest artworks, has seen a big jump in visitor numbers.  Five million visitors made the London gallery - which houses treasures like Raphael\'s Madonna of the Pinks - the UK\'s most visited museum in 2004. It recorded a 13.8% rise in numbers and was the country\'s second most visited tourist attraction, behind Blackpool Pleasure Beach. Charles Saumarez Smith, the gallery\'s director, said he was "delighted". He said the number of visitors through the doors had boosted figures to pre-11 September 2001 levels. Mr Saumarez Smith added that the pedestrianisation of Trafalgar Square, where the gallery is located, and strong temporary collections throughout 2004 had led to the strong performance.  "Our 2004 exhibition programme of El Greco, Russian Landscape in the Age of Tolstoy and Raphael: From Urbino to Rome was particularly strong and exceeded all targets," he said. "The exceptional qu

 55%|█████▌    | 62/112 [01:27<01:08,  1.36s/it]

news Text 63: ['Versace art portfolio up for sale  The art collection of murdered fashion designer Gianni Versace could fetch up to Â£9m ($17m) when it is auctioned in New York and London later this year.  Among the pictures for sale are works by Roy Lichtenstein, Andy Warhol and Henri Matisse. The collection was housed at Versace\'s six-storey New York townhouse. The 51-year-old designer was shot outside his Florida home in 1997 by suspected serial killer Andrew Cunanan, who later killed himself. The auction, at Sotheby\'s, will feature 45 contemporary, impressionist and 19th Century paintings. One of the highlights of the sale is Roy Lichtenstein\'s Blue Nude which has been given an estimate of Â£1.8m ($3.4m).  Tobias Meyer, Sotheby\'s worldwide head of contemporary art, said: "This collection reflects Mr Versace\'s wide-ranging taste and impeccable eye, and many of the works were commissioned directly from the artists. "Outstanding later examples from champions of the Pop movement, 

 56%|█████▋    | 63/112 [01:28<01:07,  1.39s/it]

news Text 64: ['G7 backs Africa debt relief plan  G7 finance ministers have backed plans to write off up to 100% of the debts of some of the world\'s poorest countries.  UK chancellor Gordon Brown said the London meeting of the world\'s seven richest nations would be remembered as "the 100% debt relief summit". Some 37 countries could benefit after a case-by-case review by bodies including the World Bank and the IMF, he said. But the US says it cannot support Mr Brown\'s International Finance Facility to boost aid to developing countries. BBC correspondents said the meeting had produced some movement towards the UK\'s ambitions, but much work was needed. Mr Brown said it was a major breakthrough for the international organisations to offer up to 100% multilateral debt relief - "the vast bulk" of money owed by the poorest countries.  "We could be at the beginning of the final stage of the process where the debts that were owed by the poorest countries, built up over 20 or 30 years, debt

 57%|█████▋    | 64/112 [01:30<01:14,  1.55s/it]

news Text 65: ['Lennon brands Rangers favourites  Celtic\'s Neil Lennon admits Rangers could be considered "slight favourites" for the Old Firm CIS Cup clash, but insists his side can still win.  Lennon concedes Rangers are in good form at the moment, but they have failed to beat Celtic in their last seven meetings. "Rangers are on the up and have been on a good run in recent weeks," he said. "But it\'s a game we believe we\'re capable of winning if we play our best," he told the Evening Times. "All the boys are looking forward to it because they are brilliant games to be involved in. "Without playing at the top of our game, we have still been winning matches. "At the minute, we are at the top of the league and still in with a chance of staying in Europe, so I don\'t think it is the crisis people have been trying to make out. "Of course, it is a concern when you are losing goals, because we have been notorious for being a team that is hard to beat and keeping clean sheets, but hopefull

 58%|█████▊    | 65/112 [01:31<01:07,  1.43s/it]

news Text 66: ['Elvis fans hold birthday bash  Elvis fans around the world have been marking the legendary singer\'s 70th birthday on Saturday.  A three-day Elvis convention took place in Blackpool, England, over the weekend with the aim of finding the best European Elvis impersonator. His Graceland, Tennessee, home was the focus for US celebrations with four days of events including a concert by the Memphis Symphony Orchestra. Elvis\' single Jailhouse Rock became the UK\'s number one on Sunday. Fans in France celebrated with a tribute concert by Elvis cover bands and a special exhibition of memorabilia is on display in Bonn, Germany.  Jailhouse Rock is now the 999th number one single in UK pop history. Record company SonyBMG are releasing Elvis\' 18 number one singles at the rate of one a week in Britain, complete with original artwork and a collector\'s box. Hit single One Night will follow next week - with the chance of becoming the 1,000th number one as interest surrounding Elvis\'

 59%|█████▉    | 66/112 [01:33<01:03,  1.38s/it]

news Text 67: ['US consumer confidence up  Consumers\' confidence in the state of the US economy is at its highest for five months and they are optimistic about 2005, an influential survey says.  The feel-good factor among US consumers rose in December for the first time since July according to new data. The Conference Board survey of 5,000 households pointed to renewed optimism about job creation and economic growth. US retailers have reported strong sales over the past 10 days after a slow start to the crucial festive season.  According to figures also released on Tuesday, sales in shopping malls in the week to 25 December were 4.3% higher than in 2003 following a last minute rush. Wal-Mart, the largest US retailer, has said its December sales are expected to be better than previously forecast because of strong post-Christmas sales.  It is expecting annual sales growth of between 1% and 3% for the month. Consumer confidence figures are considered a key economic indicator because cons

 60%|█████▉    | 67/112 [01:34<01:00,  1.35s/it]

news Text 68: ["Ukraine steel sell-off 'illegal'  The controversial sell-off of a Ukrainian steel mill to a relative of the former president was illegal, a court has ruled.  The mill, Krivorizhstal, was sold in June 2004 for $800m (Â£424m) - well below other offers. President Viktor Yushchenko, elected in December, is planning to revisit many of Ukraine's recent privatisations. Krivorizhstal is one of dozens of firms which he says were sold cheaply to friends of the previous administration.  On Wednesday, Prime Minister Yulia Tymoshenko said as many as 3,000 firms could be included on the list of firms whose sale was being reviewed.  Mr Yushchenko had previously said the list would be limited to 30-40 enterprises. More than 90,000 businesses in all, from massive corporations to tiny shopfronts, have been sold off since 1992, as the command economy built up when Ukraine was part of the Soviet Union was dismantled. Analysts have suggested that the government needs to avoid the impression

 61%|██████    | 68/112 [01:35<00:58,  1.32s/it]

news Text 69: ['Tutu\'s Guantanamo release call  Archbishop Desmond Tutu has called for the release of the remaining inmates at Guantanamo Bay and terror suspects detained without trial in the UK.  His comments follow news that all four Britons held by the US in the Cuban camp will be freed within weeks. The South African archbishop said detentions without trial were "unacceptable" and "distressing". Twelve foreign nationals are being held indefinitely without trial in the UK under anti-terror laws. Referring to the detentions in Cuba, Archbishop Tutu told BBC News: "It is utterly unacceptable. "The rule of law is in order to ensure that those who have power don\'t use their power arbitrarily and every person retains their human rights until you have proven conclusively that so-and-so is in fact guilty."  Moazzam Begg, from Birmingham, and Martin Mubanga, Richard Belmar and Feroz Abbasi, from London, have been held by the US at Guantanamo Bay for almost three years. On Tuesday Foreign 

 62%|██████▏   | 69/112 [01:37<01:00,  1.41s/it]

news Text 70: ['Fit-again Betsen in France squad  France have brought flanker Serge Betsen back into their squad to face England at Twickenham on Sunday.  But the player, who missed the victory over Scotland through injury, must attend a disciplinary hearing on Wednesday after being cited by Wasps. "Serge has a good case so we are confident he will play," said France coach Bernard Laporte. The inexperienced Nicolas Mas, Jimmy Marlu and Jean-Philippe Grandclaude are also included in a 22-man squad. The trio have been called up after Pieter de Villiers, Ludovic Valbon and Aurelien Rougerie all picked up injuries in France\'s 16-9 win on Saturday.  Laporte said he was confident that Betsen would be cleared by the panel investigating his alleged trip that broke Wasps centre Stuart Abbott\'s leg. "If he was to be suspended, we would call up Imanol Harinordoquy or Thomas Lievremont," said Laporte, who has dropped Patrick Tabacco. "We missed Serge badly against Scotland. He has now recovered 

 62%|██████▎   | 70/112 [01:38<00:59,  1.41s/it]

news Text 71: ['Strike threat over pension plans  Millions of public service workers could strike if ministers scrap their final salary pension scheme and make them work longer, warn union leaders.  The Cabinet Office has confirmed it is reviewing the current pension system, prompting unions representing 4.5m workers to threaten united action. They believe the plans include raising the mandatory retirement age for public service workers from 60 to 65. The government says unions will be consulted before any changes are made.  It is thought the proposed overhaul, due on Thursday, could mean pensions could be based on a "career average" salary. For each year served, staff currently get one eightieth of their highest salary in the final three years. Ministers will be anxious to avoid mass strike action in the lead-up to the next general election, which is widely expected next May. In a statement on Sunday, the Cabinet Office said it was reviewing the Civil Service Pension Scheme, and hoped

 63%|██████▎   | 71/112 [01:40<00:58,  1.42s/it]

news Text 72: ['Brown hits back in Blair rift row  Gordon Brown has criticised a union leader who said conflict between himself and Tony Blair was harming the workings of government.  Jonathan Baume, of the top civil servants\' union, spoke of "competing agendas" between Mr Brown and Mr Blair. But the chancellor said Mr Baume was never at meetings between himself and the prime minister so could not judge. He said the union leader was trying to block civil service reform which threatened his members\' jobs. It suited the purpose of Mr Baume\'s union, the First Division Association, to suggest there were two agendas battling against each other because the union was trying to resist the planned reforms, Mr Brown told BBC Radio 4\'s Today programme.  Under the plans, unveiled in the Gershon report, some 84,000 civil servants jobs will be axed or changed and the savings ploughed back into frontline services. Mr Brown said: "To be honest I don\'t think you can rely on his [Mr Baume\'s] judge

 64%|██████▍   | 72/112 [01:41<00:56,  1.42s/it]

news Text 73: ['Solutions to net security fears  Fake bank e-mails, or phishing, and stories about ID theft are damaging the potential of using the net for online commerce, say e-business experts.  Trust in online security is falling as a result. Almost 70% of those asked in a poll said that net firms are not doing enough to protect people. The survey of more than 1,000 people reported that 43% were not willing to hand over personal information online. It is worrying for shopaholics and firms who want to exploit the net. More people are becoming aware of online security issues but they have little confidence that companies are doing enough to counter the threats, said security firm RSA, which carried out the poll. An estimated 12 million Britons now use the net as a way of managing their financial affairs. Security experts say that scare stories and the vulnerabilities dogging e-commerce and e-banking are being taken seriously - by banks in particular.  "I don\'t think the threat is ov

 65%|██████▌   | 73/112 [01:43<01:00,  1.54s/it]

news Text 74: ['Beckham relief as Real go through  David Beckham expressed his relief at Real Madrid\'s passage to the Champions League knockout phase.  After Real\'s 3-0 win at Roma, the England skipper admitted another season of under-achievement would not be tolerated at the Bernabeu stadium. Beckham said: "It\'s expected of Madrid to get through, but it\'s a relief for the club and players to have won. "We lost momentum last season but we cannot afford to to go another season without winning anything." Real\'s finish as runners-up in their Champions League group means they cannot face his old club Manchester United in the next round. But Real could be drawn against other Premiership hopefuls, Arsenal or Chelsea, who won their respective groups. "It\'s going to be great whoever we play, even if we don\'t get either of the two English teams."']
Summarized Text 74: ['Beckham relief as Real go through  Champions League knockout phase. England skipper admitted another season of under-ac

 66%|██████▌   | 74/112 [01:44<00:53,  1.40s/it]

news Text 75: ['Cash gives way to flexible friend  Spending on credit and debit cards has overtaken cash spending in the UK for the first time.  The moment that plastic finally toppled cash happened at 10.38am on Wednesday, according to the Association for Payment Clearing Services (Apacs) Apacs chose school teacher Helen Carroll, from Portsmouth, to make the historic transaction. The switch over took place as she paid for her groceries in the supermarket chain Tesco\'s Cromwell Road branch.  Mrs Carroll was born in the same year that plastic cards first appeared in the UK. "I pay for most things with my debit card, with occasional purchases on one of my credit cards," said Mrs Carroll, who teaches at Peel Common Infants School in Gosport.  Spending patterns for the year and estimates for December led Apacs to conclude that 10.38am was the time that plastic would finally rule the roost. Shoppers in the UK are expected to put Â£269bn on plastic cards during the whole of 2004, compared w

 67%|██████▋   | 75/112 [01:45<00:50,  1.38s/it]

news Text 76: ["O'Driscoll out of Scotland game  Ireland captain Brian O'Driscoll has been ruled out of Saturday's RBS Six Nations clash against Scotland.  O'Driscoll was originally named in the starting line-up but has failed to recover from the hamstring injury he picked up in the win over Italy. His replacement will be named after training on Friday morning. Fellow centre Gordon D'Arcy is also struggling with a hamstring injury and he will undergo a fitness test on Friday to see if he can play.  Kevin Maggs would be an obvious replacement at centre while Shane Horgan could also be moved from wing. Ulster wing Tommy Bowe could also be asked to travel with the squad to Scotland as a precautionary measure. The only other change to the Ireland side sees Wasps flanker Johnny O'Connor replacing Denis Leamy. O'Connor will be winning his third cap after making his debut in the victory over South Africa last November.  : Murphy, Horgan, TBC, D'Arcy, Hickie, O'Gara, Stringer, Corrigan, Byrne,

 68%|██████▊   | 76/112 [01:47<00:49,  1.37s/it]

news Text 77: ["Man Utd through after Exeter test  Manchester United avoided an FA Cup upset by edging past Exeter City in their third round replay.  Cristiano Ronaldo scored the opener, slipping the ball between Paul Jones' legs after just nine minutes. United wasted a host of chances to make it safe as Jones made some great saves, but Wayne Rooney put the tie beyond doubt late on with a cool finish. Exeter had chances of their own, Sean Devine twice volleying wide and Andrew Taylor forcing Tim Howard to save. United boss Sir Alex Ferguson was taking few chances after their 0-0 draw in the first game and he handed starts to Paul Scholes and Ryan as well as Ronaldo and Rooney. Exeter began brightly with Devine and Steve Flack seeing plenty of the ball, but it did not take United long to assert their authority and the hosts soon found themselves a goal down. Scholes played a lovely pass in to Ronaldo on the left-hand side of the six-yard box and the Portuguese winger slid the ball betwe

 69%|██████▉   | 77/112 [01:48<00:49,  1.42s/it]

news Text 78: ['Casino Royale is next Bond movie  Casino Royale, author Ian Fleming\'s first James Bond book, is to be the next Bond film, with Goldeneye director Martin Campbell behind the camera.  It will be the 21st James Bond film to hit the big screen, and speculation has been rife over who will play the lead. Casino Royale was turned into a spoof spy movie by John Huston in 1967, with David Niven in the lead role. Pierce Brosnan led the past four Bond films but said producers axed him after offering him the chance to return. Among the favourites to take over the coveted role are Scottish actor Dougray Scott, Oscar nominee Clive Owen and Australian star Hugh Jackman. Producers say no decision has yet been made on who will become the seventh actor, including Niven, to play Bond on film. Kill Bill director Quentin Tarantino had talked of wanting to take on the Casino Royale project, and said he had spoken to Brosnan about it.  Shooting on Casino Royale is expected to begin once Camp

 70%|██████▉   | 78/112 [01:50<00:48,  1.43s/it]

news Text 79: ['Feta cheese battle reaches court  A row over whether only Greece should be allowed to label its cheese feta has reached the European Court of Justice.  The Danish and German governments are challenging a European Commission ruling which said Greece should have sole rights to use the name. The Commission\'s decision gave the same legal protection to feta as to Italian Parma ham and French Champagne. But critics of the judgement say feta is a generic term, with the cheese produced widely outside Greece.  The Commission\'s controversial 2002 ruling gave "protected designation of origin" status to feta cheese made in Greece, effectively restricting the use of the feta name to producers there.  From 2007 onwards, Greek firms will have the exclusive use of the feta label and producers elsewhere in Europe must find another name to describe their products. The German and Danish governments argue that feta does not relate to a specific geographical area and that their firms have