In [1]:
import os

# Notebook parameters; later cells read these names directly.
input_path = "./data/sample_input.json"  # JSON file holding the base records
outdir = "./data"                        # directory that receives the generated files
seed = 42                                # passed to random.seed(); None skips seeding

# Create the output directory up front; exist_ok makes re-running the cell harmless.
os.makedirs(outdir, exist_ok=True)


In [2]:
import json, csv, random, os
from collections import OrderedDict, defaultdict

# Path setup (assumes input_path and outdir were already defined in an earlier cell).
os.makedirs(outdir, exist_ok=True)
# Seed the random module only when a seed was supplied; with seed=None the
# generator is left untouched.
if seed is not None:
    random.seed(seed)

# Field names of one record, in the order used for record keys and CSV columns.
KEYS = [
    "location", "decibel", "goal", "playlist", "mood",
    "bpm", "energy", "vocal", "genre",
]

def rec(location, decibel, goal, playlist, mood, bpm, energy, vocal, genre):
    """Return one record as an OrderedDict whose keys follow KEYS order.

    The positional arguments correspond one-to-one with the entries of KEYS.
    """
    values = (location, decibel, goal, playlist, mood, bpm, energy, vocal, genre)
    return OrderedDict(zip(KEYS, values))

# Load the base records from the input JSON file.
with open(input_path, "r", encoding="utf-8") as source:
    raw = json.load(source)

# Project every input item onto KEYS (in KEYS order); a missing field becomes None
# and fields not listed in KEYS are dropped.
records = [
    OrderedDict((field, item.get(field)) for field in KEYS)
    for item in raw
]

# Hand-written edge-case records using decibel values 30 and 95.
# Each tuple follows the argument order of rec().
edge_cases = [
    rec(*fields)
    for fields in (
        ("library", 30, "focus", "Soft Jazz Study", "calm", 74, 0.40, "instrumental", "jazz"),
        ("home", 30, "sleep", "Rain On Window", "soothing", 55, 0.20, "instrumental", "rain"),
        ("subway", 95, "active", "Electronic Focus Boost", "energetic", 120, 0.80, "with_vocal", "electronic"),
        ("outdoor", 95, "relax", "Brown Noise Mask", "noise-masking", 55, 0.35, "instrumental", "brown_noise"),
    )
]
records += edge_cases

# Curated playlists for each (location, goal) pair.
# Every entry is a tuple laid out as (playlist, mood, bpm, energy, vocal, genre).
seed_playlists = {
    ("library", "focus"): [
        ("Soft Jazz Study", "calm", 76, 0.40, "instrumental", "jazz"),
        ("Classical Focus", "calm", 72, 0.38, "instrumental", "classical"),
    ],
    ("library", "reading"): [
        ("Gentle Piano Reading", "soft", 58, 0.30, "instrumental", "piano"),
        ("Acoustic Study", "soft", 62, 0.34, "with_vocal", "acoustic"),
    ],
    ("cafe", "focus"): [
        ("Deep Work Lo-Fi", "calm", 82, 0.47, "instrumental", "lofi"),
        ("Minimal Tech Focus", "neutral", 90, 0.55, "instrumental", "minimal_techno"),
    ],
    ("cafe", "relax"): [
        ("Cafe Ambient", "chill", 66, 0.36, "instrumental", "ambient"),
        ("Smooth Bossa Break", "chill", 92, 0.50, "with_vocal", "bossa_nova"),
    ],
    ("office", "active"): [
        ("Flowstate EDM", "upbeat", 112, 0.68, "with_vocal", "electronic"),
        ("Productive Pop", "energetic", 118, 0.75, "with_vocal", "pop"),
    ],
    ("home", "sleep"): [
        ("White Noise Night", "soothing", 55, 0.20, "instrumental", "white_noise"),
        ("Rain On Window", "soothing", 55, 0.22, "instrumental", "rain"),
    ],
    ("subway", "meditate"): [
        ("Brown Noise Mask", "noise-masking", 55, 0.30, "instrumental", "brown_noise"),
        ("Airplane Cabin", "noise-masking", 55, 0.32, "instrumental", "pink_noise"),
    ],
    ("outdoor", "relax"): [
        ("Nature Calm", "calm", 60, 0.30, "instrumental", "nature"),
        ("Acoustic Chill", "chill", 72, 0.42, "with_vocal", "acoustic"),
    ],
}

# Two decibel readings per location, stored as a list so they can be picked by
# position when the curated playlists are turned into records.
decibel_by_loc = dict(
    library=[32, 45],
    cafe=[60, 74],
    office=[50, 63],
    home=[42, 56],
    subway=[81, 90],
    outdoor=[58, 72],
)

# Turn every curated playlist into a record. The n-th playlist of a
# (location, goal) pair receives the n-th decibel reading of its location;
# positions past the end reuse the last reading, and a location with no
# entry in decibel_by_loc falls back to [60, 70].
for (place, purpose), entries in seed_playlists.items():
    levels = decibel_by_loc.get(place, [60, 70])
    for position, (title, feel, tempo, intensity, vocals, style) in enumerate(entries):
        level = levels[min(position, len(levels) - 1)]
        records.append(rec(place, level, purpose, title, feel, tempo, intensity, vocals, style))

# Generic playlists used for the "neutral" goal; each tuple is laid out as
# (playlist, mood, bpm, energy, vocal, genre).
fallback_playlist = [
    ("Neutral Soundscape", "neutral", 62, 0.33, "instrumental", "ambient"),
    ("Ambient Essentials", "neutral", 60, 0.32, "instrumental", "ambient"),
]
locations = ["cafe", "library", "subway", "office", "outdoor", "home"]

# One "neutral" record per (location, decibel) combination, each paired with a
# randomly chosen fallback playlist. Locations vary slowest and decibels fastest,
# so random.choice is always consumed in the same order.
records.extend(
    rec(place, level, "neutral", *random.choice(fallback_playlist))
    for place in locations
    for level in [40, 65, 85]
)

# Drop exact duplicates (all KEYS fields equal), keeping the first occurrence
# and preserving the original order of `records`.
seen = set()
unique_records = []
for record in records:
    fingerprint = tuple(record[field] for field in KEYS)
    if fingerprint in seen:
        continue
    seen.add(fingerprint)
    unique_records.append(record)

# Sort priority for locations and goals; values not listed here sort last (999).
order_loc = {v: i for i, v in enumerate(["cafe", "library", "subway", "office", "outdoor", "home"])}
order_goal = {v: i for i, v in enumerate(["focus", "relax", "meditate", "sleep", "active", "reading", "neutral"])}


def _decibel_sort_key(value):
    """Return a sortable key for a record's decibel value.

    Present values sort by int(value). None -- which the loading step stores
    for input records that lack a "decibel" field -- sorts after every present
    value instead of raising TypeError inside int().
    """
    if value is None:
        return (1, 0)
    return (0, int(value))


# Order by location priority, then goal priority, then decibel (ascending).
unique_records.sort(key=lambda x: (
    order_loc.get(x["location"], 999),
    order_goal.get(x["goal"], 999),
    _decibel_sort_key(x["decibel"]),
))

# Destination files inside the output directory.
out_json = os.path.join(outdir, "sample_input_v2.json")
out_csv = os.path.join(outdir, "sample_input_v2.csv")

# JSON export: keep non-ASCII characters as-is and pretty-print with 2 spaces.
with open(out_json, "w", encoding="utf-8") as json_file:
    json.dump(unique_records, json_file, ensure_ascii=False, indent=2)

# CSV export: header row first, then one row per record in KEYS column order.
# newline="" leaves line-ending handling to the csv module.
with open(out_csv, "w", encoding="utf-8", newline="") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=KEYS)
    writer.writeheader()
    for record in unique_records:
        writer.writerow(record)

# Report how many unique records were written.
print(f"[완료] 레코드 수: {len(unique_records)}")


[완료] 레코드 수: 88


In [1]:
!pip install pandas





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
# Read the generated CSV back in (same path the export step writes with outdir="./data").
df = pd.read_csv("./data/sample_input_v2.csv")
# Last expression of the cell: show the first rows as a quick sanity check.
df.head()


Unnamed: 0,location,decibel,goal,playlist,mood,bpm,energy,vocal,genre
0,cafe,60,focus,Deep Work Lo-Fi,calm,82,0.47,instrumental,lofi
1,cafe,64,focus,Lo-Fi Beats,calm,80,0.45,instrumental,lofi
2,cafe,74,focus,Minimal Tech Focus,neutral,90,0.55,instrumental,minimal_techno
3,cafe,60,relax,Cafe Ambient,chill,66,0.36,instrumental,ambient
4,cafe,61,relax,Ambient Journey,chill,65,0.35,instrumental,ambient
