In [9]:
import pandas as pd

# Read the cleaned votes table
df = pd.read_csv("votes_clean.csv")

# Distinct study_question values, in order of first appearance
# (a missing value shows up as a single NaN entry)
unique_questions = pd.unique(df["study_question"])

# List every distinct value on its own bullet line
print("🧠 Unique study_question values:")
for question in unique_questions:
    print("- {}".format(question))

🧠 Unique study_question values:
- wealthier
- more beautiful
- livelier
- more depressing
- safer
- more boring
- nan


In [5]:
import pandas as pd

# Load CSV
df = pd.read_csv("votes_clean.csv")

# Stringify the image IDs once per column (a missing ID becomes the string
# "nan"), mirroring the str(row[...]) conversion that used to run per row.
left_ids = df["left"].map(str)
right_ids = df["right"].map(str)

# Create unordered pair key: a frozenset ignores which side each image was on.
# Built from the zipped columns rather than a row-wise DataFrame.apply, which
# is very slow on large frames and misbehaves when the frame is empty.
df["pair_key"] = pd.Series(
    [frozenset(pair) for pair in zip(left_ids, right_ids)],
    index=df.index,
    dtype=object,
)

# Count occurrences
pair_counts = df["pair_key"].value_counts()
repeating_pairs = pair_counts[pair_counts > 1]

# Filter repeated rows (pairs that were voted on more than once)
is_repeated = df["pair_key"].isin(repeating_pairs.index)
duplicates_df = df[is_repeated].copy()

# Normalize left/right: the smaller ID string goes to left_sorted and the
# larger one to right_sorted, so both presentation orders share one key.
dup_left = left_ids[is_repeated]
dup_right = right_ids[is_repeated]
swapped = dup_left > dup_right
duplicates_df["left_sorted"] = dup_left.mask(swapped, dup_right)
duplicates_df["right_sorted"] = dup_right.mask(swapped, dup_left)

# NOTE(review): "choice" presumably refers to the ORIGINAL left/right position.
# Rows whose IDs were swapped by the sort keep their raw label, so any
# left/right tally per sorted pair may mix both orientations. Confirm whether
# the label should be flipped for swapped rows.

# Group and summarize
def summarize_group(group):
    """Summarize the votes of one group of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to summarize; must provide the "choice" and "study_question"
        columns.

    Returns
    -------
    pandas.Series
        Entries "left", "right" and "equal" (number of rows whose "choice"
        equals that exact label), "total" (number of rows in the group) and
        "study_questions" (list of distinct "study_question" values in order
        of first appearance).
    """
    votes = group["choice"]
    # Labels are compared verbatim: no case or whitespace normalization here.
    summary = {label: (votes == label).sum() for label in ("left", "right", "equal")}
    summary["total"] = len(group)
    summary["study_questions"] = list(group["study_question"].unique())
    return pd.Series(summary)

# One summary row per unordered image pair, most-voted pairs first.
# The columns summarize_group reads are selected explicitly before .apply so
# the function is not run on the grouping columns: recent pandas versions
# emit a deprecation warning for that. The result is unchanged because
# summarize_group only uses "choice", "study_question" and the row count.
summary_df = (
    duplicates_df
    .groupby(["left_sorted", "right_sorted"])[["choice", "study_question"]]
    .apply(summarize_group)
    .reset_index()
    .sort_values("total", ascending=False)
)

# Save to CSV
summary_df.to_csv("repeated_pairs_vote_summary.csv", index=False)
print("✅ Updated summary with 'equal' saved to 'repeated_pairs_vote_summary.csv'")

✅ Updated summary with 'equal' saved to 'repeated_pairs_vote_summary.csv'


  .apply(summarize_group)


In [6]:
import pandas as pd

# Load CSV
df = pd.read_csv("votes_clean.csv")

# Normalize the choice labels once, so the 'equal' filter and the left/right
# counts below agree on casing and surrounding whitespace. Previously only the
# filter was normalized, so a label such as " Left" passed the filter and was
# counted in "total" but in neither left_num nor right_num.
df["choice"] = df["choice"].str.strip().str.lower()

# Remove rows where choice is 'equal'
df = df[df["choice"] != "equal"].copy()

# Normalize left/right (unordered): the smaller ID string goes to left_sorted,
# the larger to right_sorted. Done column-wise rather than with a row-wise
# apply that builds one pd.Series per row (very slow, and fails on an empty
# frame). A missing ID becomes the string "nan", as with str(row[...]).
left_ids = df["left"].map(str)
right_ids = df["right"].map(str)
swapped = left_ids > right_ids
df["left_sorted"] = left_ids.mask(swapped, right_ids)
df["right_sorted"] = right_ids.mask(swapped, left_ids)

# NOTE(review): "choice" presumably refers to the ORIGINAL left/right position;
# rows swapped by the sort keep their raw label, so left_num/right_num may mix
# both orientations of a pair. Confirm whether swapped rows should be flipped.

# Group by (left, right, study_question).
# groupby drops rows whose study_question is missing (pandas default dropna=True).
# 0/1 indicator columns let the built-in "sum" replace per-group Python lambdas.
grouped = (
    df.assign(
        _is_left=df["choice"].eq("left").astype(int),
        _is_right=df["choice"].eq("right").astype(int),
    )
    .groupby(["left_sorted", "right_sorted", "study_question"])
    .agg(
        left_num=("_is_left", "sum"),
        right_num=("_is_right", "sum"),
        total=("choice", "count"),  # counts non-missing choice values
    )
    .reset_index()
)

# Keep only duplicated (total > 1), rename the ID columns for output and sort
# by total; chained so nothing is mutated in place.
duplicates_only = (
    grouped[grouped["total"] > 1]
    .rename(columns={
        "left_sorted": "left_image_ID",
        "right_sorted": "right_image_ID",
    })
    .sort_values("total", ascending=False)
)

# Save
duplicates_only.to_csv("repeated_pair_summary_filtered.csv", index=False)
print("✅ Saved duplicates only to 'repeated_pair_summary_filtered.csv'")


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [8]:
import pandas as pd

# Load CSV
df = pd.read_csv("votes_clean.csv")

# The unordered image pair and the study question to inspect
target_ids = {"513e5dc3fdc9f0358700aeab", "5140d960fdc9f04926003bb4"}
target_question = "livelier"

# Rows asking the target question (case- and whitespace-insensitive)
question_mask = df["study_question"].str.strip().str.lower() == target_question.lower()

# Rows whose {left, right} IDs equal the target set, in either order.
# Built from the zipped columns instead of a row-wise DataFrame.apply, which
# is slow and returns a DataFrame (not a boolean Series) on an empty frame.
pair_mask = pd.Series(
    [{left_id, right_id} == target_ids
     for left_id, right_id in zip(df["left"].map(str), df["right"].map(str))],
    index=df.index,
    dtype=bool,
)

filtered_df = df[question_mask & pair_mask]


# Show results
print(filtered_df)

# Optional: save to file
filtered_df.to_csv("filtered_livelier_pair.csv", index=False)

        Unnamed: 0             place_id_left            place_id_right  \
322589      322590  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
323306      323307  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
323401      323402  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
323403      323404  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
323437      323438  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
...            ...                       ...                       ...   
355356      355357  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
355480      355481  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
356011      356012  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
356344      356345  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   
356744      356745  50e6fcabd7c3df413b000aa6  50e74a36d7c3df413b001488   

                        study_id                      left  \
322589  50f62c41a84ea7c5fdd2e454  513e5dc3fdc9f03

In [10]:
import pandas as pd

# Read the per-pair vote summary
df = pd.read_csv("repeated_pair_summary_filtered.csv")

# Keep only the pairs where one side strictly out-polled the other, then
# rebuild 'total' as the sum of the two vote counts
has_winner = df["left_num"].ne(df["right_num"])
df = df.loc[has_winner].assign(
    total=lambda kept: kept["left_num"] + kept["right_num"]
)

# Write the tie-free summary to its own file
df.to_csv("repeated_pair_summary_filtered_no_ties.csv", index=False)
print("✅ Saved to 'repeated_pair_summary_filtered_no_ties.csv' with updated totals")


✅ Saved to 'repeated_pair_summary_filtered_no_ties.csv' with updated totals


In [18]:
import pandas as pd

# Load data
summary_df = pd.read_csv("repeated_pair_summary_filtered_no_ties.csv")
votes_df = pd.read_csv("votes_clean.csv")

# Normalize and prep votes_clean
votes_df["left"] = votes_df["left"].astype(str)
votes_df["right"] = votes_df["right"].astype(str)
votes_df["choice"] = votes_df["choice"].str.strip().str.lower()
votes_df["study_question"] = votes_df["study_question"].str.strip().str.lower()

# Unordered pair key "<smaller id>,<larger id>". Computed column-wise; gives
# the same string as ",".join(sorted([left, right])) without a slow row-wise apply.
votes_swapped = votes_df["left"] > votes_df["right"]
votes_df["pair_key"] = (
    votes_df["left"].mask(votes_swapped, votes_df["right"])
    + ","
    + votes_df["right"].mask(votes_swapped, votes_df["left"])
)

# Normalize summary_df the same way so the keys line up
summary_df["left_image_ID"] = summary_df["left_image_ID"].astype(str)
summary_df["right_image_ID"] = summary_df["right_image_ID"].astype(str)
summary_df["study_question"] = summary_df["study_question"].str.strip().str.lower()
summary_swapped = summary_df["left_image_ID"] > summary_df["right_image_ID"]
summary_df["pair_key"] = (
    summary_df["left_image_ID"].mask(summary_swapped, summary_df["right_image_ID"])
    + ","
    + summary_df["right_image_ID"].mask(summary_swapped, summary_df["left_image_ID"])
)

# Majority label per pair: "left" when left_num is strictly larger, otherwise
# "right". The input file is expected to have ties already removed (see its
# name); a tie would fall into "right" here.
summary_df["choice"] = (
    (summary_df["left_num"] > summary_df["right_num"])
    .map({True: "left", False: "right"})
)

# Merge and keep left_num, right_num: every vote that matches the pair, the
# question and the majority label is attached to its summary row. With
# how="left", a summary row without any matching vote is kept with NaN in the
# vote-only columns.
merged = pd.merge(
    summary_df[["pair_key", "study_question", "choice", "left_num", "right_num"]],
    votes_df,
    how="left",
    on=["pair_key", "study_question", "choice"]
)

# Drop duplicates by pair_key + study_question (keeps the first matching vote
# as the representative row)
merged = merged.drop_duplicates(subset=["pair_key", "study_question"])

# Save all original votes_clean columns (plus the derived pair_key) + left_num and right_num
final_cols = list(votes_df.columns) + ["left_num", "right_num"]
final_df = merged[final_cols]

final_df.to_csv("representative_votes_clean_format.csv", index=False)
print("✅ Saved with left_num/right_num to 'representative_votes_clean_format.csv'")


✅ Saved with left_num/right_num to 'representative_votes_clean_format.csv'
