In [1]:
from google.colab import auth
auth.authenticate_user()

project_id = 'eyalinforetrievalproject' #GCP Project ID
!gcloud config set project {project_id}


!gsutil cp gs://eyalir1/inverted_index_gcp.py . # downloading the inverted_index_gcp.py


!mkdir -p postings_gcp_body # creating the directory locally


print("Downloading posting files")
!gsutil -m cp -r gs://eyalir1/postings_gcp_body/* postings_gcp_body/ # downloading the index and all .bin files

print("Download complete.")

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey

Copying gs://eyalir1/inverted_index_gcp.py...
/ [1 files][  7.9 KiB/  7.9 KiB]                                                
Operation completed over 1 objects/7.9 KiB.                                      
Downloading posting files... this may take 2-3 minutes...
Copying gs://eyalir1/postings_gcp_body/0_000.bin...
Copying gs://eyalir1/postings_gcp_body/0_002.bin...
Copying gs://eyalir1/postings_gcp_body/0_004.bin...
Copying gs://eyalir1/postings_gcp_body/0_001.bin...
Copying gs://eyalir1/postings_gcp_body/0_003.bin...
Copying gs://eyalir1/postings_gcp_body/0_007.bin...
Copying gs://eyalir1/postings_gcp_body/0_006.bin...
Copying gs://eyalir1/postings_gcp_body/0_005.bin...
Copying gs://eyalir1/postings_gcp_body/0_008.bin...
Copying gs://eyalir1/postings_gcp_body/0_009.bin...
Copying gs://eyalir1/postings_gcp_body/0_010.bin...
Copying gs://eyalir1/postings_gcp_body/0_011.bin...
Copying gs://eya

In [3]:
import sys
from collections import defaultdict
import math
import pickle
from inverted_index_gcp import InvertedIndex


def compute_doc_norms(postings, df, corpus_size):
    """Compute the Euclidean norm of every document's tf-idf vector.

    Each (term, document) pair is weighted as tf * log10(corpus_size / df[term]);
    a document's norm is the square root of the sum of its squared weights.

    Args:
        postings: iterable of (term, posting_list) pairs, where posting_list is
            an iterable of (doc_id, tf) pairs.
        df: mapping from term to its document frequency.
        corpus_size: total number of documents in the corpus.

    Returns:
        A defaultdict(float) mapping doc_id to its norm. Only documents that
        appear in at least one posting list get an entry.

    Raises:
        ZeroDivisionError: if a term's document frequency is 0.
    """
    norms = defaultdict(float)
    for term, posting_list in postings:
        # math.log10 is usually more accurate than math.log(x, 10), which is
        # evaluated as log(x) / log(10) and can be off by a rounding error.
        idf = math.log10(corpus_size / df[term])
        idf_squared = idf ** 2
        for doc_id, tf in posting_list:
            norms[doc_id] += (tf ** 2) * idf_squared  # accumulate squared weights

    # Values are replaced in place; no keys are added, so iterating is safe.
    for doc_id in norms:
        norms[doc_id] = math.sqrt(norms[doc_id])
    return norms


print("Loading index")  # load the existing index (it is only read here)
inverted_body = InvertedIndex.read_index('postings_gcp_body', 'index_body')
N = 6348910  # corpus size, taken from the run that built the inverted indexes

print("Calculating norms")
# NOTE(review): '.' is the base directory passed to posting_lists_iter; presumably
# the stored posting locations already carry the postings_gcp_body/ prefix — confirm.
doc_norms = compute_doc_norms(inverted_body.posting_lists_iter('.'), inverted_body.df, N)

print(f"Calculated norms for {len(doc_norms)} documents.")

# Persist as a plain dict so the pickle holds {doc_id: norm} without the
# defaultdict wrapper.
with open('norms.pkl', 'wb') as f:
    pickle.dump(dict(doc_norms), f)

print("Success! Created norms.pkl")

Loading index...
Calculating norms...
Calculated norms for 6347136 documents.
Success! Created norms.pkl


In [4]:
# Upload norms.pkl (written to the working directory by the norms cell) to the
# root of the eyalir1 bucket.
!gsutil cp norms.pkl gs://eyalir1/ #uploading to the bucket

Copying file://norms.pkl [Content-Type=application/octet-stream]...
|
Operation completed over 1 objects/84.7 MiB.                                     
