In [2]:
import pandas as pd

# Load the three raw inputs, then list each one's columns to see what we have.
data_split = pd.read_csv("data_identification.csv")
emotion = pd.read_csv("emotion.csv")
posts = pd.read_json("final_posts.json")

# One print call, one line per source; `!s` renders the Index exactly as print() would.
print(
    f"data_identification: {data_split.columns!s}",
    f"emotion: {emotion.columns!s}",
    f"final_posts: {posts.columns!s}",
    sep="\n",
)


data_identification: Index(['id', 'split'], dtype='object')
emotion: Index(['id', 'emotion'], dtype='object')
final_posts: Index(['root'], dtype='object')


In [3]:
# Preview the first rows of the raw JSON frame. As the output below shows, it has a
# single 'root' column whose cells are nested dicts, so it needs flattening before use.
posts.head()

Unnamed: 0,root
0,"{'_type': 'post', '_source': {'post': {'post_i..."
1,"{'_type': 'post', '_source': {'post': {'post_i..."
2,"{'_type': 'post', '_source': {'post': {'post_i..."
3,"{'_type': 'post', '_source': {'post': {'post_i..."
4,"{'_type': 'post', '_source': {'post': {'post_i..."


In [4]:
import pandas as pd
import json

def _first_present(mapping, candidate_keys, error_message):
    """Return the value of the first key from ``candidate_keys`` found in ``mapping``.

    Parameters
    ----------
    mapping : dict
        The object to look the keys up in.
    candidate_keys : iterable of str
        Keys to try, in priority order.
    error_message : str
        Message for the ``ValueError`` raised when no candidate key is present.

    Raises
    ------
    ValueError
        If none of ``candidate_keys`` is a key of ``mapping``.
    """
    for key in candidate_keys:
        if key in mapping:
            return mapping[key]
    raise ValueError(error_message)


# Candidate field names, tried in this order (the sample row printed below uses
# 'post_id' and 'text').
ID_KEYS = ("id", "post_id", "_id")
TEXT_KEYS = ("text", "post", "content")  # some datasets store the body under "post"

# Read data_identification.csv: columns id, split
data_split = pd.read_csv("data_identification.csv")

# Read emotion.csv: columns id, emotion
emotion = pd.read_csv("emotion.csv")

# read_json gives a single 'root' column of nested dicts, so the fields are pulled
# out by hand below instead of being used directly as DataFrame columns.
raw_json = pd.read_json("final_posts.json")

print("raw_json columns:", raw_json.columns)
print("one row sample:", raw_json.iloc[0]["root"])

# Extract id & text from the nested structure: item["_source"]["post"][<field>]
ids = []
texts = []

for item in raw_json["root"]:
    post_obj = item["_source"]["post"]

    # Resolve both fields before appending so the two lists stay the same length
    # even if a lookup raises part-way through.
    pid = _first_present(post_obj, ID_KEYS, "找不到 id 欄位，請貼出 print(item) 給我")
    ptext = _first_present(post_obj, TEXT_KEYS, "找不到 text 欄位，請貼出 print(item) 給我")

    ids.append(pid)
    texts.append(ptext)

# Build the flat posts DataFrame
posts = pd.DataFrame({
    "id": ids,
    "text": texts
})

print("Extracted posts DataFrame:")
print(posts.head())

# ===== Merge =====
# Left joins keep every id listed in data_split; rows without a label get NaN emotion.
df = data_split.merge(posts, on="id", how="left")
df = df.merge(emotion, on="id", how="left")   # emotion is NaN for test rows

train_df = df[df["split"] == "train"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

# Print each label on its own line: putting it on the same line as the frame shifts
# the header row and misaligns it with the data columns.
print("train_df")
print(train_df.head())
print("test_df")
print(test_df.head())


raw_json columns: Index(['root'], dtype='object')
one row sample: {'_type': 'post', '_source': {'post': {'post_id': '0x61fc95', 'text': 'We got the ranch, loaded our guns and sat up till sunrise.', 'hashtags': []}}}
Extracted posts DataFrame:
         id                                               text
0  0x61fc95  We got the ranch, loaded our guns and sat up t...
1  0x35663e  I bet there is an army of married couples who ...
2  0xc78afe                         This could only end badly.
3  0x90089c  My sister squeezed a lime in her milk when she...
4  0xaba820         and that got my head bobbing a little bit.
train_df          id  split                                               text emotion
0  0x35663e  train  I bet there is an army of married couples who ...     joy
1  0xc78afe  train                         This could only end badly.    fear
2  0x90089c  train  My sister squeezed a lime in her milk when she...     joy
3  0x2ffb63  train                                Thank yo

In [5]:
# 合併成一個大 df
df = data_split.merge(posts, on="id", how="left")
df = df.merge(emotion, on="id", how="left")   # test 沒有 emotion → NaN

train_df = df[df["split"] == "train"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

print(train_df.head())
print(test_df.head())


         id  split                                               text emotion
0  0x35663e  train  I bet there is an army of married couples who ...     joy
1  0xc78afe  train                         This could only end badly.    fear
2  0x90089c  train  My sister squeezed a lime in her milk when she...     joy
3  0x2ffb63  train                                Thank you so much❤️     joy
4  0x989146  train  Stinks because ive been in this program for a ...     joy
         id split                                               text emotion
0  0x61fc95  test  We got the ranch, loaded our guns and sat up t...     NaN
1  0xaba820  test         and that got my head bobbing a little bit.     NaN
2  0x66e44d  test                Same. Glad it's not just out store.     NaN
3  0xc03cf5  test  Like always i will wait and see thanks for the...     NaN
4  0x02f65a  test  There's a bit of room between "not loving sub-...     NaN


In [6]:
!pip install google-genai


