**Scraping the data from the given website, converting it to CSV, and preprocessing the extracted data**

In [18]:
import requests
from bs4 import BeautifulSoup
import csv

# Scrape the course listing page, pair each course title with its lesson
# count, and write the result to a CSV file for the later cells to read.
url = "https://courses.analyticsvidhya.com/collections/courses"

# Bound the request so an unresponsive server cannot hang the notebook.
response = requests.get(url, timeout=30)

if response.status_code == 200:

    soup = BeautifulSoup(response.content, 'html.parser')

    # Both lookups return None when the element is absent, so check each one
    # before dereferencing instead of failing with an opaque AttributeError.
    box = soup.find('div', class_='collections__container')
    if box is None:
        raise RuntimeError("Could not find the 'collections__container' element; the page markup may have changed.")

    cards = box.find('div', class_='collections__product-cards collections__product-cards___0b9ab')
    if cards is None:
        raise RuntimeError("Could not find the course cards element; the page markup may have changed.")

    # Flatten the cards to plain text and walk it line by line.
    transcript = cards.get_text()

    lines = transcript.split('\n')

    course_data = []

    # Per-course state, filled in as the lines of one card are seen.
    course_title = None
    lessons = None
    category = "All Courses"

    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.endswith("Lessons"):
            lessons = line
        elif "Free" in line:
            # NOTE(review): this is a substring match, so a course whose title
            # contains "Free" is treated as a price line — confirm against the
            # live page whether an exact match would be safer.
            if course_title is not None:
                course_data.append([course_title, lessons, category, "Free"])
            # Reset so the next card cannot inherit this card's title or
            # lesson count when one of its own lines is missing.
            course_title = None
            lessons = None
        else:
            # Any other non-empty line is taken to be a course title.
            course_title = line

    csv_filename = "/content/courses_data .csv"

    # newline='' lets the csv module control line endings itself.
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        writer.writerow(["Course Title", "Number of Lessons", "Category", "Price"])
        writer.writerows(course_data)

    print(f"Data successfully written to {csv_filename}")

else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


Data successfully written to /content/courses_data .csv


**Building the search tool**

Generating embeddings for the course data

In [19]:
!pip install sentence-transformers


import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the scraped course table written by the scraping cell.
df = pd.read_csv('/content/courses_data .csv')

print("Column names:", df.columns)  # Display the column names

# Trim stray whitespace from the header names before looking a column up.
df.columns = df.columns.str.strip()

if 'Course Title' not in df.columns:
    print("Cannot generate embeddings because 'Course Title' column is missing.")
else:
    missing_titles = df['Course Title'].isna().sum()
    print("Missing values in 'Course Title':", missing_titles)

    # Keep only the rows that actually have a title to embed.
    df = df[df['Course Title'].notna()]

    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode every remaining title and persist the resulting array to disk.
    titles = df['Course Title'].tolist()
    course_embeddings = model.encode(titles)
    np.save('course_embeddings.npy', course_embeddings)

    print("Embeddings generated and saved successfully!")



Column names: Index(['Course Title', 'Number of Lessons', 'Category', 'Price'], dtype='object')
Missing values in 'Course Title': 0




Embeddings generated and saved successfully!


**Deploying the tool on Hugging Face Spaces:**

In [20]:
!pip install gradio

import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the scraped course table and build the title list used for search.
df = pd.read_csv('/content/courses_data .csv')

# Normalise the header names, then drop rows with no title: an empty title
# cell is read back as NaN, which is not a string the encoder can embed.
df.columns = df.columns.str.strip()
df = df.dropna(subset=['Course Title'])
courses = df['Course Title'].tolist()  # Extract course titles

model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding row per entry in `courses`, in the same order.
course_embeddings = model.encode(courses)

def search_courses(query, top_k=5):
    """Return the courses whose titles are most similar to ``query``.

    The query is embedded with the module-level ``model`` and scored by
    cosine similarity against ``course_embeddings``; ``courses`` supplies the
    title that belongs to each embedding row.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return. Defaults to 5.

    Returns:
        A list of ``(course_title, similarity)`` tuples ordered from most to
        least similar. The list is empty when ``query`` is blank or ``top_k``
        is not positive.
    """
    # A blank query carries no search intent, so skip the model call.
    if not query or not query.strip() or top_k <= 0:
        return []

    query_embedding = model.encode([query])

    # Only one query was encoded, so the scores of interest are in row 0.
    scores = cosine_similarity(query_embedding, course_embeddings)[0]

    # argsort is ascending: reverse for best-first, then keep the first top_k.
    top_indices = scores.argsort()[::-1][:top_k]

    return [(courses[i], scores[i]) for i in top_indices]

# Gradio interface: one text box feeds search_courses, and the returned
# matches are shown in a two-column table.
query_box = gr.Textbox(label="Enter your search query")
results_table = gr.Dataframe(
    headers=["Course Title", "Similarity Score"],
    label="Search Results",
)

iface = gr.Interface(
    fn=search_courses,
    inputs=query_box,
    outputs=results_table,
    title="Smart Course Search Tool",
    description="Enter keywords to find the most relevant courses.",
)

# Start the app.
iface.launch()






Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6d99c615bdbb66c8cc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


