In [1]:
import requests
import json
from IPython.display import Markdown, display
import pandas as pd
import numpy as np
import re


# Prepare Data set

**Steps:**
*   Call Wikipedia API to fetch the data (2025 Women's Cricket Woldcup)
*   Extract the data from the response
*   Store it into Data Frame





In [2]:
def fetch_wikipedia_extract(title):
    API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'explaintext': 1,
        'redirects': 1
    }
    headers = {
        'User-Agent': 'ColabNotebook/1.0 (your_email@example.com)'
    }

    print(f"Attempting to fetch data for: '{title}'")

    try:
        response = requests.get(API_ENDPOINT, params=params, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        return f"Error fetching data: Request Exception: {e}"
    try:
        data = response.json()
        pages = data.get('query', {}).get('pages', {})
        if not pages:
            return "Error: 'pages' section missing in API response."

        page_id = next(iter(pages))
        page = pages[page_id]

        if page_id == '-1':
            return f"Error: Article not found for title '{title}'."

        if 'extract' in page and page['extract']:
            return page['extract']
        else:
            return f"Error: Article found, but no extract content available."
    except Exception as e:
        return f"Error during data processing: {e}"

In [3]:
def parse_extract_to_dataframe(text):
    normalized_text = re.sub(r'\n\s*\n', '\n\n', text).strip()
    sections = re.split(r'\n\n(.*?)\n{1,2}', normalized_text, maxsplit=0, flags=re.DOTALL)

    data_rows = []

    # 1. Handle the Introduction (always the first element)
    intro_content = sections[0].strip()
    if intro_content:
        data_rows.append({'Section Title': 'Introduction', 'Content': intro_content})

    # 2. Handle subsequent sections (Title, Content pairs)
    for i in range(1, len(sections) - 1, 2):
        title = sections[i].strip()
        content = sections[i+1].strip()

        if title and content:
            # Use the first line of the captured title as the official section title
            cleaned_title = title.split('\n')[0].strip()
            data_rows.append({'Section Title': cleaned_title, 'Content': content})

    # If parsing failed, return the whole content as one row
    if not data_rows and text.strip():
        return pd.DataFrame([{'Section Title': 'Full Content (Unparsed)', 'Content': text.strip()}])

    return pd.DataFrame(data_rows)

In [4]:
article_title = "2025 Women's Cricket World Cup"
result = fetch_wikipedia_extract(article_title)
if result.startswith("Error"):
  print(f"STATUS: FAILED")
  print(result)
else:
  print(f"STATUS: SUCCESS")
  wiki_data = parse_extract_to_dataframe(result)

Attempting to fetch data for: '2025 Women's Cricket World Cup'
STATUS: SUCCESS


In [5]:
wiki_data.head()

Unnamed: 0,Section Title,Content
0,Introduction,The 2025 ICC Women's Cricket World Cup was the...
1,== Background ==,=== Neutral venue arrangements ===\nOn 19 Dece...
2,=== Marketing ===,"Before the commencement of the tournament, the..."
3,=== Opening ceremony ===,The opening ceremony was held in Assam Cricket...
4,== Qualification ==,"The West Indies, semi-finalists at the precedi..."


# RAG using Open AI

**Steps**


*   Prerequsite - Data, Open AI key.
*   Embed the data (based on open AI embedding)
*   Calculate the distance between question and the embedded data using cosine similarity.
*   Based on the distance, take n data as context
*   Create a prompt - Prompt should have context and question
*   Call OpenAI API







In [6]:
!pip install openai



In [7]:
import openai

In [8]:
#Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Read Open AI key
with open('/content/drive/MyDrive/Secrets/openai_api_key.txt', "r") as f:
  openai.api_key = ' '.join(f.readlines())

#Choose GPT moodel
gpt_model='gpt-4o-mini'
embedding_model = 'text-embedding-3-small'

In [10]:
def call_llm(prompt):
  try:
    response = openai.chat.completions.create(model=gpt_model,
                                              messages = [
                                                  {"role": "system", "content": "You are an expert assistant"},
                                                  {"role": "assistant", "content": "You read the input and answer to the question."},
                                                  {"role": "user", "content": prompt}
                                              ],
                                              temperature=0.1
                                              )
    return response.choices[0].message.content.strip()
  except Exception as e:
    return str(e)

In [11]:
query = 'Who won the ICC Women\'s World Cup 2025?'

Just calling the OpenAI API yields an irrelevant answer because the GPT-4o-mini model I used was released before 2025, and therefore only provides the **most probable answer, not the relevant or latest data**.

In [12]:
display(Markdown(call_llm(query)))

As of my last knowledge update in October 2023, the ICC Women's World Cup 2025 has not yet taken place, so there is no winner to report. The tournament is scheduled to be held in 2025. Please check the latest sources for updates on the event.

## Embeddings

In [13]:
def get_embedding(text, model=embedding_model):
   text = text.replace("\n", " ")
   return openai.embeddings.create(input = [text], model=model).data[0].embedding

In [14]:
df_openai = wiki_data.copy()
df_openai['embedded_text'] = df_openai['Content'].apply(get_embedding)

In [15]:
df_openai.head()

Unnamed: 0,Section Title,Content,embedded_text
0,Introduction,The 2025 ICC Women's Cricket World Cup was the...,"[0.012802551500499249, 0.018018405884504318, 0..."
1,== Background ==,=== Neutral venue arrangements ===\nOn 19 Dece...,"[-0.026573259383440018, -0.03169959783554077, ..."
2,=== Marketing ===,"Before the commencement of the tournament, the...","[0.03740307688713074, 0.011801231652498245, 0...."
3,=== Opening ceremony ===,The opening ceremony was held in Assam Cricket...,"[0.0656605139374733, -0.01241258718073368, 0.0..."
4,== Qualification ==,"The West Indies, semi-finalists at the precedi...","[-0.017495879903435707, -0.02090868167579174, ..."


In [16]:
def distances_from_embeddings(embedding1, embeddings2, distance_metric="cosine"):
    distances = []
    for embedding2 in embeddings2:
        # Compute cosine distance between embeddings
        distance = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
        distances.append(distance)
    return distances

In [17]:
def get_rows_sorted_by_relevance(question, df):
    """
    Function that takes in a question string and a dataframe containing
    rows of text and associated embeddings, and returns that dataframe
    sorted from least to most relevant for that question
    """
    # Get embeddings for the question text
    question_embedding = get_embedding(question)

    # Compute cosine distances between question embedding and embeddings in the DataFrame
    distances = distances_from_embeddings(question_embedding, df["embedded_text"].values)

    # Make a copy of the DataFrame and add a "distances" column
    df_copy = df.copy()
    df_copy["distances"] = distances

    # Sort the copied DataFrame by the distances and return it
    # (shorter distance = more relevant so we sort in ascending order)
    df_copy.sort_values("distances", ascending=False, inplace=True)
    return df_copy

In [18]:
get_rows_sorted_by_relevance(query, df_openai)

Unnamed: 0,Section Title,Content,embedded_text,distances
0,Introduction,The 2025 ICC Women's Cricket World Cup was the...,"[0.012802551500499249, 0.018018405884504318, 0...",0.719651
6,== Match officials ==,"On 11 September 2025, the ICC appointed the of...","[0.00013656189548783004, -0.006997834425419569...",0.632663
13,=== Fixtures ===,The International Cricket Council (ICC) announ...,"[0.04063339903950691, -0.019651418551802635, 0...",0.617708
22,=== Team of the tournament ===,The ICC announced the team of the tournament o...,"[0.001762663945555687, -0.051479026675224304, ...",0.605815
5,== Venues ==,It was originally planned that matches would b...,"[0.009937715716660023, -0.025449691340327263, ...",0.47847
4,== Qualification ==,"The West Indies, semi-finalists at the precedi...","[-0.017495879903435707, -0.02090868167579174, ...",0.478264
2,=== Marketing ===,"Before the commencement of the tournament, the...","[0.03740307688713074, 0.011801231652498245, 0....",0.44109
14,{{Single-innings cricket match,| round = Match 24\n| date = 23 October 2025\...,"[0.005433392245322466, 0.029778052121400833, 0...",0.438922
1,== Background ==,=== Neutral venue arrangements ===\nOn 19 Dece...,"[-0.026573259383440018, -0.03169959783554077, ...",0.417422
16,Pratika Rawal (Ind) became the joint-fastest c...,India's 340 was their highest innings total in...,"[0.05958739295601845, 0.020799176767468452, 0....",0.406376


In [28]:
def create_prompt(question, df):
  prompt_template = """Answer the question based on the context below.\n\nContext:\n{}.\n\nQuestion:\n{}"""
  context = get_rows_sorted_by_relevance(query, df)['Content'].values[:3]
  return prompt_template.format("\n".join(context), question)

In [29]:
prompt = create_prompt(query, df_openai)
print(prompt)

Answer the question based on the context below.

Context:
The 2025 ICC Women's Cricket World Cup was the 13th edition of Women's Cricket World Cup. India hosted the World Cup for the fourth time, after the 1978, 1997 and 2013 editions, with the tournament held from 30 September to 2 November 2025. This was the last time the tournament had eight teams. India became champions after defeating South Africa in the final, securing their maiden World Cup title. Australia where the defending champions, but were knocked out in the Semi-final by eventual champions India.
On 11 September 2025, the ICC appointed the officials for the tournament. This was the first time that whole panel was lead by female officials.
The International Cricket Council (ICC) announced the schedule of the tournament on 16 June 2025. The revised schedule was announced on 22 August 2025..

Question:
Who won the ICC Women's World Cup 2025?


In [30]:
llm_response = call_llm(prompt)

In [31]:
display(Markdown(llm_response))

India won the ICC Women's World Cup 2025.