In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# ====== Configuration ======
input_csv = "all-data.csv"   # change this to your own path
out_dir = "split_data"
test_size = 0.2              # fraction held out for the test split (80/20)
seed = 42

os.makedirs(out_dir, exist_ok=True)

# ====== Load ======
# The CSV is read as header-less, so the column names are supplied explicitly.
df = pd.read_csv(
    input_csv,
    header=None,
    names=["label", "text"],
    encoding="latin1",
)

# ====== Clean ======
# Drop missing values BEFORE casting to str: astype(str) can turn NaN into the
# literal string "nan", after which dropna() no longer sees anything to drop
# and rows with a missing label/text would leak into the splits.
df = df.dropna(subset=["label", "text"])

# Strip surrounding whitespace; assign() builds a new frame, so no
# chained-assignment warning is raised on the filtered data.
df = df.assign(
    label=df["label"].astype(str).str.strip(),
    text=df["text"].astype(str).str.strip(),
)

# Remove rows whose label or text is empty once stripped.
df = df[(df["label"] != "") & (df["text"] != "")].reset_index(drop=True)

# ====== Stratified split ======
# Stratifying on the label keeps the class proportions of train and test close
# to each other.
train_df, test_df = train_test_split(
    df, test_size=test_size, random_state=seed, stratify=df["label"]
)

# Optional: shuffle each split again and give it a fresh 0..n-1 index.
train_df, test_df = (
    part.sample(frac=1, random_state=seed).reset_index(drop=True)
    for part in (train_df, test_df)
)

# ====== Save ======
def _export_splits(train, test, directory):
    """Write the train and test frames to *directory* in three formats."""

    def target(filename):
        return os.path.join(directory, filename)

    # 1) CSV (most widely compatible)
    train.to_csv(target("train.csv"), index=False, encoding="utf-8")
    test.to_csv(target("test.csv"), index=False, encoding="utf-8")

    # 2) JSONL (common in NLP: one sample per line, easy to stream)
    jsonl_options = dict(orient="records", lines=True, force_ascii=False)
    train.to_json(target("train.jsonl"), **jsonl_options)
    test.to_json(target("test.jsonl"), **jsonl_options)

    # 3) Parquet (recommended: faster, smaller, typed; convenient to load
    #    with huggingface/datasets)
    train.to_parquet(target("train.parquet"), index=False)
    test.to_parquet(target("test.parquet"), index=False)


_export_splits(train_df, test_df, out_dir)

def _print_summary(full, train, test):
    """Print the row counts and the per-split normalized label distribution."""
    print("Done!")
    print("Total:", len(full))
    print("Train:", len(train), " Test:", len(test))
    for heading, part in (
        ("Label dist (train):\n", train),
        ("Label dist (test):\n", test),
    ):
        print(heading, part["label"].value_counts(normalize=True))


_print_summary(df, train_df, test_df)


Done!
Total: 4846
Train: 3876  Test: 970
Label dist (train):
 label
neutral     0.594169
positive    0.281218
negative    0.124613
Name: proportion, dtype: float64
Label dist (test):
 label
neutral     0.593814
positive    0.281443
negative    0.124742
Name: proportion, dtype: float64
