In [5]:
import pandas as pd
from pathlib import Path

# 1. Combining labeled batches

In [15]:
# === Config ===
DATA_DIR = Path(".")  # change this if your CSVs are elsewhere
ALL_BATCHES = range(1, 18)  # labels_batch1.csv ... labels_batch17.csv


def batch_path(batch_no):
    """Return the path of ``labels_batch<batch_no>.csv`` inside DATA_DIR."""
    return DATA_DIR / f"labels_batch{batch_no}.csv"


def read_batch(batch_no):
    """Load one labeled batch CSV from DATA_DIR into a DataFrame."""
    return pd.read_csv(batch_path(batch_no))


def sort_by_review_id(frame, column="review_id"):
    """Return ``frame`` with its rows ordered by ``column``.

    Values that parse as numbers come first, in numeric order; the remaining
    values follow in string order, and missing values go last. Ties are broken
    on the *string form* of each value, so a column that mixes numbers and
    strings is never compared element-to-element (which can raise TypeError).
    The sort keys live in a separate frame, so no helper column is added to
    ``frame`` and an existing column can never be clobbered.
    """
    values = frame[column]
    keys = pd.DataFrame(
        {
            "numeric": pd.to_numeric(values, errors="coerce").to_numpy(),
            # Keep missing ids missing so na_position="last" still applies to them.
            "text": values.astype(str).where(values.notna().to_numpy()).to_numpy(),
        }
    )
    # `keys` has a fresh 0..n-1 index, so the sorted index is a list of row positions.
    order = keys.sort_values(by=["numeric", "text"], na_position="last").index
    return frame.iloc[order.to_numpy()]


# Fail before touching any file if a batch is missing, and report all of them at once.
missing_files = [batch_path(i).name for i in ALL_BATCHES if not batch_path(i).exists()]
if missing_files:
    raise FileNotFoundError(
        f"Missing label batch files in {DATA_DIR.resolve()}: {missing_files}"
    )

# === 1) Combine labels_batch1.csv to labels_batch11.csv ===
batches_1_11 = [read_batch(i) for i in range(1, 12)]
combined_1_11 = pd.concat(batches_1_11, ignore_index=True)
# Optional: save intermediate result if you want
# (uncomment the next line if you'd like a standalone file for 1–11)
# combined_1_11.to_csv(DATA_DIR / "labels_batches_01_11.csv", index=False)

# === 2) Sort labels_batch14.csv by review_id and overwrite it ===
path_14 = batch_path(14)
df14 = sort_by_review_id(read_batch(14))

# Overwrite the original labels_batch14.csv with the sorted version
df14.to_csv(path_14, index=False)

# === 3) Combine batches 12–17 with the previous combined dataframe ===
df12 = read_batch(12)
df13 = read_batch(13)
# The sorted frame is already in memory; no need to read the file back.
df14_sorted = df14.reset_index(drop=True)
df_15 = read_batch(15)
df_16 = read_batch(16)
df_17 = read_batch(17)

# NOTE(review): batches 15–17 are placed before 12–14. That order is kept
# on purpose so the row order of the output file does not change — confirm
# whether a strictly ascending batch order is wanted instead.
final_combined = pd.concat(
    [combined_1_11, df_15, df_16, df_17, df12, df13, df14_sorted],
    ignore_index=True,
)

# === Save final combined file ===
final_path = DATA_DIR / "labels_batches_01_17.csv"
final_combined.to_csv(final_path, index=False)

print(f"✅ Done. Final combined file saved to: {final_path}")

✅ Done. Final combined file saved to: labels_batches_01_17.csv


In [16]:
# Requires `final_combined` from the cell above.
# Retain only the rows whose 'comprehensive_review' text includes the literal
# marker "[Review"; rows with a missing value count as non-matching.
has_review_marker = final_combined["comprehensive_review"].str.contains(r"\[Review", na=False)
final_combined = final_combined.loc[has_review_marker]

# Persist the filtered labels under their own file name
final_path = "labels_batches_01_17_filtered.csv"
final_combined.to_csv(final_path, index=False)

print(f"✅ Filtered file saved to: {final_path}")

✅ Filtered file saved to: labels_batches_01_17_filtered.csv


# 2. Inner join with full set of review predictors (all_combined_reviews.csv)

In [18]:
# === Build the predictor table and give it a 1-based review_id ===
all_reviews = pd.read_csv("./data/all_combined_reviews.csv").reset_index(drop=True)
# review_id is the row position counted from 1 instead of 0
all_reviews["review_id"] = all_reviews.index + 1

# Keep a copy of the predictors that includes the generated id
all_reviews.to_csv("all_combined_reviews_with_id.csv", index=False)

# === Attach the labels ===
# The filtered labels are always re-read from disk here, replacing whatever
# `final_combined` is still in memory.
final_combined = pd.read_csv("labels_batches_01_17_filtered.csv")

# Cast the join key to int on both sides so the dtypes match
for frame in (final_combined, all_reviews):
    frame["review_id"] = frame["review_id"].astype(int)

# Inner join: only review_ids present in both tables survive
merged = all_reviews.merge(final_combined, on="review_id", how="inner")

# Write the joined table to disk
merged.to_csv("all_reviews_with_labels.csv", index=False)

print("✅ Done. Merged file saved to all_reviews_with_labels.csv")

✅ Done. Merged file saved to all_reviews_with_labels.csv
