In [3]:
import pandas as pd
import json
import pathlib


def to_chatml(row, system_prompt="You are an expert fortune-telling advisor."):
    """Convert one question/answer record into a chat-format training example.

    Args:
        row: Mapping-like record (a dict or a DataFrame row) that can be
            indexed by the keys "question" and "answer".
        system_prompt: Text of the system message. Defaults to the
            fortune-telling advisor persona, so existing single-argument
            calls such as ``df.apply(to_chatml, axis=1)`` behave as before.

    Returns:
        A dict with a single "messages" key whose value is the list of
        system, user and assistant messages, in that order.

    Raises:
        KeyError: If ``row`` has no "question" or "answer" entry (this is
            what dicts and pandas Series raise on a missing key).
    """
    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": row["question"]},
            {"role": "assistant", "content": row["answer"]},
        ]
    }


# Load the question/answer CSVs; the "test" CSV is used as the validation split.
train = pd.read_csv("data/fortune_train.csv")
val = pd.read_csv("data/fortune_test.csv")

# NOTE(review): the JSONL files below are written to the working directory,
# not to data/, so this mkdir does not guard any of the writes — confirm intent.
pathlib.Path("data").mkdir(exist_ok=True)

# Shared to_json options: one JSON object per line, non-ASCII text kept verbatim.
jsonl_options = dict(orient="records", lines=True, force_ascii=False)

# Convert every row to a chat example, then dump each split as JSON Lines.
train_records = train.apply(to_chatml, axis=1)
train_records.to_json("fortune_train.jsonl", **jsonl_options)

val_records = val.apply(to_chatml, axis=1)
val_records.to_json("fortune_val.jsonl", **jsonl_options)

print("✅ JSONL files saved. Ready for upload.")

✅ JSONL files saved. Ready for upload.


In [2]:
# (rows, columns) of the training and validation frames, shown as the cell output.
tuple(frame.shape for frame in (train, val))

((165, 3), (42, 3))

# Format the dataset 

In [3]:
import pandas as pd
import json

def format_for_training(df):
    """
    Turn every row of ``df`` into a three-message chat conversation.

    Each conversation is a list of ``{"role", "content"}`` dicts: a fixed
    system prompt, a user message embedding the row's ``raw_text``, and an
    assistant message holding the ``voice_description`` value parsed from the
    row's JSON-encoded ``rewrite_response``.

    Returns a list with one conversation per row, in row order.
    """
    system_prompt = (
        "You are an expert voice and performance analyst. "
        "Your role is to provide detailed voice descriptions for various types of content, "
        "including movies, commercials, news broadcasts, narratives, and other media. "
        "you only return with one valid json attribute(voice_description), "
        "which describes the appropriate voice characteristics, emotional tone, pacing, "
        "and delivery style that would effectively convey the intended message and impact "
        "in 20 words max."
    )

    def build_conversation(record):
        # The user turn quotes the raw content, followed by a blank line and the request.
        raw_text = record['raw_text']
        user_prompt = (
            f'Content: "{raw_text}"\n\n'
            'Please provide a detailed voice description (20 words max) '
            'for delivering this content.'
        )
        # The stored response is a JSON string; only its description field is the target.
        parsed_response = json.loads(record['rewrite_response'])
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": parsed_response['voice_description']},
        ]

    return [build_conversation(record) for _, record in df.iterrows()]

# Train/Test Split (stratified by `type`)


In [4]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# pathlib is needed below to create the output directory before writing.
import pathlib

# Read the CSV file
df = pd.read_csv('../synthsized_dataset/combined_dataset.csv')

# Stratify by content type so both subsets keep the original type proportions
train_df, test_df = train_test_split(
    df,
    test_size=0.3,  # 30% for test set
    stratify=df['type'],
    random_state=42  # for reproducibility
)

# Convert each split into lists of chat conversations
train_formatted = format_for_training(train_df)
test_formatted = format_for_training(test_df)

# Create the output directory first: open(..., 'w') does not create missing
# parent directories, so without this the cell fails on a fresh checkout.
pathlib.Path('data').mkdir(parents=True, exist_ok=True)

# Save both splits as JSONL: one {"messages": [...]} object per line
for output_path, conversations in (
    ('data/voice_train.jsonl', train_formatted),
    ('data/voice_test.jsonl', test_formatted),
):
    # json.dumps escapes non-ASCII by default; the encoding is still pinned
    # so the files do not depend on the platform's default.
    with open(output_path, 'w', encoding='utf-8') as f:
        for conversation in conversations:
            f.write(json.dumps({"messages": conversation}) + '\n')

# Print distribution statistics
print("\nData Distribution:")
print(f"Total samples: {len(df)}")
print(f"Training samples: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
print(f"Test samples: {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")

# Per-type proportions, to confirm the stratification held in each subset
for label, frame in (
    ("original data", df),
    ("training set", train_df),
    ("test set", test_df),
):
    print(f"\nType distribution in {label}:")
    print(frame['type'].value_counts(normalize=True))


Data Distribution:
Total samples: 400
Training samples: 280 (70.0%)
Test samples: 120 (30.0%)

Type distribution in original data:
type
commercial    0.25
horror        0.25
movie         0.25
news          0.25
Name: proportion, dtype: float64

Type distribution in training set:
type
commercial    0.25
news          0.25
horror        0.25
movie         0.25
Name: proportion, dtype: float64

Type distribution in test set:
type
news          0.25
horror        0.25
commercial    0.25
movie         0.25
Name: proportion, dtype: float64
