In [None]:
import pickle
from collections import Counter
from pathlib import Path

# 1. Define paths
pv_path = 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2021/2021-08/pageviews-202108-user.bz2'
p = Path(pv_path)
pv_name = p.name
pv_temp = f'{p.stem}-4dedup.txt'
pv_clean = 'pageviews.pkl'  # We will save it as pageviews.pkl directly

# 2. Download the file (approx 2.3GB)
print("Downloading raw pageview data... (this may take 1-2 mins)")
!wget -N $pv_path

# 3. Process the file using shell commands
# This filters for "en.wikipedia", extracts the ID and Count, and saves to a temp text file
print("Filtering and processing data...")
!bzcat $pv_name | grep "^en\.wikipedia" | cut -d' ' -f3,5 | grep -P "^\d+\s\d+$" > $pv_temp

# 4. Read the text file and create the dictionary
print("Building the Python dictionary...")
wid2pv = Counter()
with open(pv_temp, 'rt') as f:
    for line in f:
        parts = line.split(' ')
        try:
            doc_id = int(parts[0])
            views = int(parts[1])
            wid2pv.update({doc_id: views})
        except ValueError:
            continue

# 5. Save the dictionary to a pickle file
print(f"Saving to {pv_clean}...")
with open(pv_clean, 'wb') as f:
    pickle.dump(wid2pv, f)

print(f"Done! Created {pv_clean} with {len(wid2pv)} entries.")

Downloading raw pageview data... (this may take 1-2 mins)
--2026-01-04 16:08:10--  https://dumps.wikimedia.org/other/pageview_complete/monthly/2021/2021-08/pageviews-202108-user.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.71, 2620:0:861:3:208:80:154:71
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.71|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2503235912 (2.3G) [application/octet-stream]
Saving to: ‘pageviews-202108-user.bz2’


2026-01-04 16:18:03 (4.03 MB/s) - ‘pageviews-202108-user.bz2’ saved [2503235912/2503235912]

Filtering and processing data...
Building the Python dictionary...
Saving to pageviews.pkl...
Done! Created pageviews.pkl with 10771932 entries.


In [None]:
# Sanity-check the saved pickle: show a few entries and look up one known ID.
from itertools import islice

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('pageviews.pkl', 'rb') as f:
    data = pickle.load(f)

# Print first 5 items
# islice stops after five items instead of copying every entry into a list first.
print("Sample pageviews:", list(islice(data.items(), 5)))
# Check a specific popular page if you know the ID (e.g., Hello Kitty 54295)
# .get() returns None rather than raising when the ID is absent.
print("Views for ID 54295:", data.get(54295))

Sample pageviews: [(5878274, 52), (7712754, 28713), (3632887, 528), (600744, 4285), (59804426, 61650)]
Views for ID 54295: 36835


In [None]:
from google.colab import auth
from google.cloud import storage

# Step 1: authenticate the Colab user before talking to Cloud Storage
auth.authenticate_user()

# Step 2: project and bucket to inspect
project_id = 'eyalinforetrievalproject'  # GCP project ID
bucket_name = 'eyalir1'                  # bucket to search

client = storage.Client(project=project_id)
bucket = client.bucket(bucket_name)

print(f"Searching for files with 'pageview' in bucket '{bucket_name}'...")

# Step 3: walk every object in the bucket and report names containing "pageview"
blobs = client.list_blobs(bucket_name)
found_any = False

for blob in blobs:
    blob_name = blob.name
    # Case-insensitive substring test; anything that does not match is skipped.
    if "pageview" not in blob_name.lower():
        continue
    print(f" - Found match: {blob_name}")
    found_any = True

# Tell the user explicitly when the scan produced no matches.
if not found_any:
    print(" - No files containing 'pageview' were found.")

Searching for files with 'pageview' in bucket 'eyalir1'...
 - Found match: pageviews.pkl


In [None]:
from google.colab import auth
from google.cloud import storage

# Step 1: authenticate the Colab user before talking to Cloud Storage
auth.authenticate_user()

# Step 2: upload the pickle to the bucket
project_id = 'eyalinforetrievalproject'  # GCP project ID
bucket_name = 'eyalir1'                  # destination bucket

# The file keeps the same name locally and as the object name in the bucket.
pickle_name = 'pageviews.pkl'

client = storage.Client(project=project_id)
bucket = client.bucket(bucket_name)
blob = bucket.blob(pickle_name)

print("Uploading pageviews.pkl to bucket...")
blob.upload_from_filename(pickle_name)
print("Upload complete!")

Uploading pageviews.pkl to bucket...
Upload complete!
