## Data Scraping

In [None]:
%pip install selenium==4.10.0
%pip install pandas
%pip install webdriver-manager

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from urllib.request import Request, urlopen
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import warnings
import json
import time
import os

In [None]:
def driversetup():
    """Start a headless, incognito Chrome session and return its WebDriver.

    The chromedriver executable path comes from ``ChromeDriverManager().install()``.
    Once the browser is up, a script is executed a single time that redefines
    ``navigator.webdriver`` so its getter returns ``undefined``.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    # Command-line switches handed to Chrome, applied in this exact order.
    chrome_flags = (
        '--headless',
        "lang=en",
        "start-maximized",
        "disable-infobars",
        "--disable-extensions",
        "--incognito",
        "--disable-blink-features=AutomationControlled",
    )

    chrome_service = Service(executable_path=ChromeDriverManager().install())

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)
    # Executed once, against whatever document is loaded right after start-up.
    browser.execute_script(
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
    )
    return browser

def pagesource(url, driver):
    """Open ``url`` in ``driver`` and return the question links found on the page.

    Args:
        url: Address of the page to load.
        driver: Selenium WebDriver used to fetch and render the page.

    Returns:
        list[str]: ``href`` values that start with ``/questions/``, in document
        order, with the first match dropped.
    """
    driver.get(url)
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # href=True skips anchors without an href instead of relying on str(None).
    links = [
        anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].startswith("/questions/")
    ]
    # NOTE(review): the first matching link is always discarded (kept from the
    # original `[1:]`); presumably it is not a real search result — confirm.
    return links[1:]

In [None]:
# Tag identifiers substituted into the https://answers.sap.com/tags/{tag} listing
# URL by the scraping loop; one listing crawl is performed per entry.
# NOTE(review): the values mix long numeric strings, UUIDs and one shorter numeric
# ID; which topic each identifier stands for is not recorded in this notebook.
tags = [
    "99724818299614502762190973596969",
    "416658401661843526040169665289086",
    "876619786935845126962162607976597",
    "773921536755532122004239005965168",
    "250948378054223096392454848767354",
    "182542228769759641292999239253882",
    "859834545111167391953063734572784",
    "212358834767912649313917434384826",
    "410250962940517507034023885688755",
    "443595830163800786360189759964915",
    "188663251671469173336120566262897",
    "132949817163443344955330185779754",
    "bd524d9b-1ee4-452d-a5b4-c25520976179",
    "0c9ec02c-46fe-498e-a301-66c5a13461e9",
    "01200615320800000636"
]

In [None]:
# Highest listing page number requested for each tag (pages 1..MAX_PAGES_PER_TAG).
MAX_PAGES_PER_TAG = 129

driver = driversetup()
questions = []  # relative '/questions/...' links gathered across all tags
for tag in tags:
    print("tag:", tag)
    for i in range(1, MAX_PAGES_PER_TAG + 1):
        print("page:", i)
        url = f"https://answers.sap.com/tags/{tag}?page={i}&pageSize=15&sort=active&filter=accepted"
        print(url)
        try:
            # Fetch each listing page exactly once and reuse the result; the
            # previous version loaded every non-empty page a second time.
            source = pagesource(url, driver)
        except Exception as exc:
            # Best effort: report the failure and carry on with the next page.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print(f"Error at link {url}: {exc!r}")
            continue
        # A page that yields no links marks the end of this tag's results.
        if len(source) == 0:
            break
        questions.extend(source)

In [None]:
base = "https://answers.sap.com"
data = []  # one dict per successfully scraped question page
for i, page in enumerate(questions):
    url = base + page
    driver.get(url)
    # Explicit parser so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # find() returns the first match or None, so a page that lacks one of these
    # elements can be skipped instead of raising IndexError and aborting the
    # whole crawl (which would lose everything collected so far).
    title_el = soup.find("h1", {"class": "ds-question__title"})
    question_el = soup.find("div", {"class": "ds-question__content"})
    answer_el = soup.find("div", {"class": "ds-answer__content"})
    if title_el is None or question_el is None or answer_el is None:
        print(f"Skipping {url}: title/question/answer element not found")
        continue

    row = {}
    # `page` looks like '/questions/<id>/...', so index 2 of the split is the id.
    row['id'] = page.split('/')[2]
    row['url'] = url
    row['title'] = title_el.text
    row['question'] = question_el.text
    row['answer'] = answer_el.text

    data.append(row)
    # Lightweight progress indicator: rows collected so far, every 200th link.
    if i % 200 == 0: print(len(data))

# NOTE: `driver` is left open so this cell can be re-run; call driver.quit()
# once scraping is finished to release the browser process.

# Question Input

In [None]:
# Read the user's question as free text; a later cell encodes it with the
# sentence-transformer model for comparison against the extracted keywords.
# NOTE(review): input() blocks until text is entered, so an unattended
# "Restart & Run All" will wait at this cell.
question = input("Enter your question: ")

# Keyword Extraction

In [None]:
%pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Collecting nltk<4.0.0,>=3.6.2 (from rake-nltk)
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk, rake-nltk
Successfully installed nltk-3.8.1 rake-nltk-1.0.6
Note: you may need to restart the kernel to use updated packages.


In [None]:
from rake_nltk import Rake
import nltk
import pandas as pd

# Fetch the NLTK 'stopwords' and 'punkt' resources into the local nltk_data
# directory (the saved output shows they are not re-downloaded when up to date).
# NOTE(review): presumably these are the resources Rake relies on for stop-word
# removal and tokenisation — confirm against the rake-nltk documentation.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Romex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Romex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Text to extract keywords from
text = "The quick brown fox jumps over the lazy dog. The dog barks loudly."

# Number of phrases kept from the front of RAKE's ranked list.
# (The old inline comment claimed "top 3" while the slice actually took 1.)
TOP_N_KEYWORDS = 1

columns = ['keywords', 'word embeddings' , 'cosine similarity']

# One row per keyword; the embedding and similarity columns start as 0
# placeholders and are meant to be filled in by the later cells.
df = pd.DataFrame(columns=columns)

# Show the (still empty) table so the column layout is visible.
display(df)

# Create a Rake object
r = Rake()

# Extract keywords
r.extract_keywords_from_text(text)

# Take the first TOP_N_KEYWORDS entries of the ranked phrase list
keywords = r.get_ranked_phrases()[:TOP_N_KEYWORDS]

# Print each keyword and append it to the table with placeholder values
for keyword in keywords:
    print(keyword)
    df.loc[len(df)] = [keyword, 0, 0]


Unnamed: 0,keywords,word embeddings,cosine similarity


quick brown fox jumps


In [None]:
# Show the keyword table now that the extracted phrases have been appended.
display(df)

Unnamed: 0,keywords,word embeddings,cosine similarity
0,quick brown fox jumps,0,0


# Word Embedding

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the SentenceTransformer identified by
# 'sentence-transformers/all-MiniLM-L6-v2'; the following cell calls
# model.encode() on the user's question and on each extracted keyword.
# NOTE(review): presumably the weights are downloaded on first use — confirm.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Embed the user's question once; the cosine-similarity step compares against it.
question_emb = model.encode(question)

# Embed all keywords in one batched call instead of one encode() per row.
keyword_texts = df['keywords'].tolist()
embeddings = model.encode(keyword_texts) if keyword_texts else []

# Store the raw vectors (one array per row) rather than str(vector): the string
# form cannot be fed to np.dot / norm in the cosine-similarity step.
# Assigning the whole column as an object Series avoids per-cell dtype issues.
df['word embeddings'] = pd.Series(list(embeddings), index=df.index, dtype=object)


# Cosine Similarity

In [None]:
import numpy as np
from numpy.linalg import norm

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: 1-D numeric vector.
        b: 1-D numeric vector of the same length as ``a``.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has
        zero norm (avoids a division by zero).
    """
    denom = norm(a) * norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0


# Compare the question embedding with every keyword embedding.
scores = []
for i in range(len(df)):
    stored = df.iloc[i, 1]
    # Use the stored vector when the embeddings column holds real arrays;
    # otherwise (placeholder 0 or a stringified vector) re-encode the keyword.
    B = stored if isinstance(stored, np.ndarray) else model.encode(df.iloc[i, 0])
    cosine = cosine_similarity(question_emb, B)
    print("Cosine Similarity:", cosine)
    scores.append(cosine)

# Write all scores at once so the column takes a float dtype cleanly.
df['cosine similarity'] = scores
