# 从 huggingface 下载 ag_news 数据集

In [1]:
# Route traffic through a proxy so that huggingface can be reached.
# NOTE(review): 127.0.0.1:7890 presumably points at a proxy running on the
# author's machine — confirm or adjust before running this cell elsewhere.
%env ALL_PROXY=http://127.0.0.1:7890
%env HTTP_PROXY=http://127.0.0.1:7890
%env HTTPS_PROXY=http://127.0.0.1:7890

env: ALL_PROXY=http://127.0.0.1:7890
env: HTTP_PROXY=http://127.0.0.1:7890
env: HTTPS_PROXY=http://127.0.0.1:7890


In [2]:
import os
import pandas as pd

# Relative path of each split's parquet shard inside the hub dataset repo.
splits = {
    "train": "data/train-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
}

# Root of the ag_news dataset on the Hugging Face hub (hf:// filesystem URL).
hf_dataset_root = "hf://datasets/fancyzhx/ag_news/"

# Local directory that receives the raw copies of both splits.
output_dir = "../../data/ag_news/raw/"
# exist_ok=True already makes this a no-op when the directory exists, so a
# separate os.path.exists() check is unnecessary.
os.makedirs(output_dir, exist_ok=True)

# Load both splits straight from the hub.
train_df = pd.read_parquet(hf_dataset_root + splits["train"])
test_df = pd.read_parquet(hf_dataset_root + splits["test"])

# Save each split under its original shard file name. The name is derived from
# `splits` so the local file name cannot drift from the remote one.
train_df.to_parquet(os.path.join(output_dir, os.path.basename(splits["train"])))
test_df.to_parquet(os.path.join(output_dir, os.path.basename(splits["test"])))

train_df, test_df


(                                                     text  label
 0       Wall St. Bears Claw Back Into the Black (Reute...      2
 1       Carlyle Looks Toward Commercial Aerospace (Reu...      2
 2       Oil and Economy Cloud Stocks' Outlook (Reuters...      2
 3       Iraq Halts Oil Exports from Main Southern Pipe...      2
 4       Oil prices soar to all-time record, posing new...      2
 ...                                                   ...    ...
 119995  Pakistan's Musharraf Says Won't Quit as Army C...      0
 119996  Renteria signing a top-shelf deal Red Sox gene...      1
 119997  Saban not going to Dolphins yet The Miami Dolp...      1
 119998  Today's NFL games PITTSBURGH at NY GIANTS Time...      1
 119999  Nets get Carter from Raptors INDIANAPOLIS -- A...      1
 
 [120000 rows x 2 columns],
                                                    text  label
 0     Fears for T N pension after talks Unions repre...      2
 1     The Race is On: Second Private Team Sets La

# 预处理 ag_news 数据集

In [3]:
import pandas as pd

# Read the raw training parquet from disk into a DataFrame and display it.
raw_train_path = "../../data/ag_news/raw/train-00000-of-00001.parquet"
full_train_df = pd.read_parquet(raw_train_path)
full_train_df

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2
...,...,...
119995,Pakistan's Musharraf Says Won't Quit as Army C...,0
119996,Renteria signing a top-shelf deal Red Sox gene...,1
119997,Saban not going to Dolphins yet The Miami Dolp...,1
119998,Today's NFL games PITTSBURGH at NY GIANTS Time...,1


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the full training frame as a validation set.
# The split is stratified on the "label" column and seeded (random_state=42)
# so that re-running the cell yields the same partition.
stratify_labels = full_train_df["label"]
train_df, val_df = train_test_split(
    full_train_df,
    stratify=stratify_labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Imported here because this section otherwise imports only pandas; without it
# the cell depends on an earlier cell having imported os.
import os

# Directory that receives the processed splits.
output_dir = "../../data/ag_news/processed/"
# exist_ok=True already tolerates an existing directory, so no separate
# os.path.exists() check is needed.
os.makedirs(output_dir, exist_ok=True)

# Build the target paths from output_dir instead of repeating the literal
# directory, so the directory created above is the one written to.
train_df.to_parquet(os.path.join(output_dir, "train.parquet"))
val_df.to_parquet(os.path.join(output_dir, "val.parquet"))

In [7]:
# Display the training split (row labels are the original indices of full_train_df).
train_df

Unnamed: 0,text,label
51238,"Clijsters Unsure About Latest Injury, Says Hew...",1
32395,Earthquake Swarm Shakes Eastern Sierra (AP) AP...,3
54318,Issues related to war in Iraq help define elec...,0
108419,NCAA probation for little engine that could RO...,1
419,eBay Buys 25 of CraigList eBay Buys 25 of crai...,3
...,...,...
19819,Vote keeps Lebanon on road to Damascus INNER B...,0
1265,"Celtics Sign Gugliotta, Wait on Payton (AP) AP...",1
2851,Compuware Asks Court To Sanction IBM In Softwa...,3
7479,Toys 'R' Us Swings to 2Q Profit (AP) AP - Trou...,2


In [8]:
# Display the validation split (row labels are the original indices of full_train_df).
val_df

Unnamed: 0,text,label
112486,Wall Street's Designs on '05? A Merger Boom Af...,2
109163,Dodgers sign Ledee to two-year deal The Los An...,1
6638,Olympians pursuit of marketing gold begins lon...,2
3996,New Device: Flying Robot Seiko Epson hopes the...,2
39522,Spanish laureate 'was informer' Prize-winning ...,0
...,...,...
105468,"Let the Bidding Begin Wasting little time, Oma...",1
90241,Babson finishes strong to reach NCAA tourney F...,1
17201,Casagrande and Golbano unfit for Vuelta Italia...,1
70437,Gordon repaved Martinsville poses new challen...,1


In [9]:
# Read the raw test parquet from disk into a DataFrame and display it.
raw_test_path = "../../data/ag_news/raw/test-00000-of-00001.parquet"
test_df = pd.read_parquet(raw_test_path)
test_df

Unnamed: 0,text,label
0,Fears for T N pension after talks Unions repre...,2
1,The Race is On: Second Private Team Sets Launc...,3
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3
...,...,...
7595,Around the world Ukrainian presidential candid...,0
7596,Void is filled with Clement With the supply of...,1
7597,Martinez leaves bitter Like Roger Clemens did ...,1
7598,5 of arthritis patients in Singapore take Bext...,2


In [10]:
# Write the test split to the processed data directory.
processed_test_path = "../../data/ag_news/processed/test.parquet"
test_df.to_parquet(processed_test_path)