In [2]:
import pandas as pd
import json

# Folder holding the untouched input files; later cells read from it too.
DATA_DIR = "data/raw"

# Two CSV tables: one carries a `split` value per id, the other an
# `emotion` label per id.
data_id = pd.read_csv(filepath_or_buffer=f"{DATA_DIR}/data_identification.csv")
emotion = pd.read_csv(filepath_or_buffer=f"{DATA_DIR}/emotion.csv")

# Text preview of the first five rows of each table; the extra "\n"
# argument leaves a blank line between the two previews.
print(data_id.head(5), "\n")
print(emotion.head(5))


         id  split
0  0x61fc95   test
1  0x35663e  train
2  0xc78afe  train
3  0x90089c  train
4  0xaba820   test 

         id emotion
0  0x35663e     joy
1  0xc78afe    fear
2  0x90089c     joy
3  0x2ffb63     joy
4  0x989146     joy


In [3]:
# Load the raw posts dump. The encoding is pinned to UTF-8 because the post
# text contains non-ASCII characters (emoji, for instance); without it the
# file would be decoded with the platform's locale encoding, which can fail
# or garble text on machines whose default is not UTF-8.
with open(f"{DATA_DIR}/final_posts.json", "r", encoding="utf-8") as f:
    raw_posts = json.load(f)

# Extract post_id and text from the nested structure: every item keeps its
# post under item["root"]["_source"]["post"]; `post_id` is renamed to `id`
# so it lines up with the key used by the CSV tables.
posts = [
    {"id": post["post_id"], "text": post["text"]}
    for post in (item["root"]["_source"]["post"] for item in raw_posts)
]

# Naming the columns explicitly keeps `id` and `text` present even when the
# dump is empty, so the later merge on "id" still has its key column.
posts_df = pd.DataFrame(posts, columns=["id", "text"])
posts_df.head()


Unnamed: 0,id,text
0,0x61fc95,"We got the ranch, loaded our guns and sat up t..."
1,0x35663e,I bet there is an army of married couples who ...
2,0xc78afe,This could only end badly.
3,0x90089c,My sister squeezed a lime in her milk when she...
4,0xaba820,and that got my head bobbing a little bit.


In [4]:
# Enrich every post with its train/test split and then with its emotion
# label. Both steps are left joins on "id", so every post row is kept and
# a post with no matching label ends up with a missing `emotion`.
df = (
    posts_df
    .merge(data_id, how="left", on="id")
    .merge(emotion, how="left", on="id")
)

df.head()


Unnamed: 0,id,text,split,emotion
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",test,
1,0x35663e,I bet there is an army of married couples who ...,train,joy
2,0xc78afe,This could only end badly.,train,fear
3,0x90089c,My sister squeezed a lime in her milk when she...,train,joy
4,0xaba820,and that got my head bobbing a little bit.,test,


In [5]:
# Partition the merged frame on its `split` column. Each part is copied so
# it owns its data independently of `df`.
train_df = df.loc[df["split"].eq("train")].copy()
test_df = df.loc[df["split"].eq("test")].copy()

# Report (rows, columns) for each part, then preview the training rows.
print("Train:", train_df.shape)
print("Test :", test_df.shape)

train_df.head()


Train: (47890, 4)
Test : (16281, 4)


Unnamed: 0,id,text,split,emotion
1,0x35663e,I bet there is an army of married couples who ...,train,joy
2,0xc78afe,This could only end badly.,train,fear
3,0x90089c,My sister squeezed a lime in her milk when she...,train,joy
7,0x2ffb63,Thank you so much❤️,train,joy
9,0x989146,Stinks because ive been in this program for a ...,train,joy


In [6]:
# Write both splits to data/processed, creating the folder when it is missing.
import os

os.makedirs("data/processed", exist_ok=True)

# index=False keeps the DataFrame index out of the written files.
for frame, destination in (
    (train_df, "data/processed/train.csv"),
    (test_df, "data/processed/test.csv"),
):
    frame.to_csv(destination, index=False)

# One print call, one item per line.
print(
    "Saved:",
    " → data/processed/train.csv",
    " → data/processed/test.csv",
    sep="\n",
)


Saved:
 → data/processed/train.csv
 → data/processed/test.csv
