In [27]:
import json
import requests
import boto3
from datetime import datetime, timedelta

USERNAME = "etnav"
S3_WIKI_BUCKET = f"{USERNAME}-wikidata"

DATE_PARAM = "2023-11-24"

s3 = boto3.client("s3")


In [28]:
# Step 1: Parse the date parameter

target_date = datetime.strptime(DATE_PARAM, "%Y-%m-%d").date()

print("Target date:", target_date)


Target date: 2023-11-24


In [30]:
# Step 2: Build Wikimedia Top Pageviews API URL

url = (
    "https://wikimedia.org/api/rest_v1/metrics/pageviews/top/"
    f"en.wikipedia/all-access/"
    f"{target_date.year}/{target_date.month:02d}/{target_date.day:02d}"
)

print("API URL:", url)


API URL: https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/2023/11/24


In [31]:
headers = {
    "User-Agent": "ECBS5147-DataEngineering-Student (etnav@student.ceu.edu)"
}

response = requests.get(url, headers=headers)
response.raise_for_status()

data = response.json()



In [32]:
print("Keys in response:", data.keys())


Keys in response: dict_keys(['items'])


In [33]:
# Step 4: Transform API response into flat records

articles = data["items"][0]["articles"]

records = []
retrieved_at = datetime.utcnow().isoformat()

for article in articles:
    records.append(
        {
            "title": article["article"],
            "views": article["views"],
            "rank": article["rank"],
            "date": DATE_PARAM,
            "retrieved_at": retrieved_at,
        }
    )

print("Number of records:", len(records))
print("First record:", records[0])


Number of records: 1000
First record: {'title': 'Main_Page', 'views': 4342819, 'rank': 1, 'date': '2023-11-24', 'retrieved_at': '2025-12-23T20:10:01.650793'}


  retrieved_at = datetime.utcnow().isoformat()


In [34]:
# Step 5: Write JSON Lines file locally

local_file = f"raw-views-{DATE_PARAM}.json"

with open(local_file, "w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")

print("Wrote file:", local_file)


Wrote file: raw-views-2023-11-24.json


In [35]:
# Step 6: Upload file to S3

s3_key = f"raw-views/raw-views-{DATE_PARAM}.json"

s3.upload_file(
    local_file,
    S3_WIKI_BUCKET,
    s3_key,
)

print(f"Uploaded to s3://{S3_WIKI_BUCKET}/{s3_key}")


Uploaded to s3://etnav-wikidata/raw-views/raw-views-2023-11-24.json


In [36]:
# Test Lab: Verify file exists in S3

expected_key = f"raw-views/raw-views-{DATE_PARAM}.json"

try:
    s3.head_object(Bucket=S3_WIKI_BUCKET, Key=expected_key)
    print(f"File uploaded successfully to s3://{S3_WIKI_BUCKET}/{expected_key}")
except Exception as e:
    print(f"File not found at s3://{S3_WIKI_BUCKET}/{expected_key}")
    raise


File uploaded successfully to s3://etnav-wikidata/raw-views/raw-views-2023-11-24.json


In [37]:
print("Number of records:", len(records))


Number of records: 1000
