In [2]:
import pandas as pd
from pathlib import Path

# --- configuration ---
# Site label embedded in every input/output file name; edit to target another crawl.
site = "nytimes"
# Directory that holds the input CSVs.
folder = Path("out_nytimes")
# The three input CSVs share the naming scheme "<kind>_<site>.csv" inside `folder`.
fetch_file, visit_file, urls_file = (
    folder / f"{kind}_{site}.csv" for kind in ("fetch", "visit", "urls")
)

# --- load ---
# Parse fetch_*.csv, visit_*.csv and urls_*.csv (in that order) into DataFrames.
fetch, visit, urls = (
    pd.read_csv(csv_path) for csv_path in (fetch_file, visit_file, urls_file)
)

# --- fetch statistics ---
# Every row of the fetch table counts as one attempted fetch.
fetch_attempted = fetch.shape[0]
# A fetch succeeded when its Status lies in the inclusive range 200-299.
_status_is_2xx = fetch["Status"].between(200, 299)
fetch_succeeded = int(_status_is_2xx.sum())
# Whatever is not a 2xx row is reported as failed or aborted.
fetch_failed = fetch_attempted - fetch_succeeded

# Number of rows per distinct status code, ordered by code.
status_counts = fetch["Status"].value_counts().sort_index()

# --- outgoing URLs ---
_url_col = urls["URL"]
_indicator_col = urls["Indicator"]

# Row count of the URL table, duplicates included.
total_urls = urls.shape[0]
# Distinct URLs overall, then split by the Indicator flag ("OK" / "N_OK").
unique_total = _url_col.nunique()
unique_in = _url_col[_indicator_col.eq("OK")].nunique()
unique_out = _url_col[_indicator_col.eq("N_OK")].nunique()

# --- file size buckets ---
# Buckets are half-open [low, high) with edges at 1, 10, 100 and 1024 multiples of 1024.
# NOTE(review): assumes the Size column is in bytes — confirm against the crawler output.
sizes = visit["Size"]
_KB = 1024
_MB = 1024 * _KB
size_lt_1kb = sizes.lt(_KB).sum()
size_1_to_10kb = (sizes.ge(_KB) & sizes.lt(10 * _KB)).sum()
size_10_to_100kb = (sizes.ge(10 * _KB) & sizes.lt(100 * _KB)).sum()
size_100kb_to_1mb = (sizes.ge(100 * _KB) & sizes.lt(_MB)).sum()
size_ge_1mb = sizes.ge(_MB).sum()

# --- content types ---
# Number of visited rows per distinct Content-Type value, ordered alphabetically.
_content_type_col = visit["Content-Type"]
content_types = _content_type_col.value_counts().sort_index()

# --- build report text ---
# The report is assembled as a list of lines: fixed sections are written as
# list literals, and the two variable-length tables (status codes, content
# types) are appended from their value-count Series.
lines = [
    "Name: Tianyu Zhang",
    "USC ID: XXXXXXXX",
    f"News site crawled: {site}",
    "",
    "Fetch Statistics",
    "=================",
    f"# fetches attempted: {fetch_attempted}",
    f"# fetches succeeded: {fetch_succeeded}",
    f"# fetches failed or aborted: {fetch_failed}",
    "",
    "Outgoing URLs",
    "==============",
    f"Total URLs extracted: {total_urls}",
    f"# unique URLs extracted: {unique_total}",
    f"# unique URLs within News Site: {unique_in}",
    f"# unique URLs outside News Site: {unique_out}",
    "",
    "Status Codes",
    "============",
]
# One "<code>: <count>" line per status code.
lines.extend(f"{code}: {count}" for code, count in status_counts.items())
lines += [
    "",
    "File Sizes",
    "==========",
    f"< 1KB: {size_lt_1kb}",
    f"1KB ~ <10KB: {size_1_to_10kb}",
    f"10KB ~ <100KB: {size_10_to_100kb}",
    f"100KB ~ <1MB: {size_100kb_to_1mb}",
    f">= 1MB: {size_ge_1mb}",
    "",
    "Content Types",
    "==============",
]
# One "<content type>: <count>" line per content type.
lines.extend(f"{ctype}: {count}" for ctype, count in content_types.items())

# --- save ---
# The report is written next to the input CSVs as CrawlReport_<site>.txt,
# newline-joined with no trailing newline, encoded as UTF-8.
report_path = folder / f"CrawlReport_{site}.txt"
_report_text = "\n".join(lines)
with report_path.open("w", encoding="utf-8") as _report_fh:
    _report_fh.write(_report_text)
print(f"✅ Report written to: {report_path.resolve()}")


✅ Report written to: C:\Users\Tianyu Zhang\Desktop\USC_assignment\2025_F\572\HW2\out_nytimes\CrawlReport_nytimes.txt
