# Importing Required Libraries

In [1]:
import re
import os
import multiprocessing
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse
import requests
import html5lib
from os import listdir
from os.path import join, abspath
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

# Making a crawler using BeautifulSoup

In [2]:
class Crawler:
    
    def __init__(self, seed_url):
        self.seed_url = seed_url
        self.root_url = '{}://{}'.format(urlparse(self.seed_url).scheme, urlparse(self.seed_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=5)
        self.scraped_pages = set([])
        self.crawl_queue = Queue()
        self.crawl_queue.put(self.seed_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        Anchor_Tags = soup.find_all('a', href=True)
        for link in Anchor_Tags:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
            if url not in self.scraped_pages:
                self.crawl_queue.put(url)

    def scrape_info(self, html):
        soup = BeautifulSoup(html, "html5lib")
        web_page_paragraph_contents = soup('p')
        text = ''
        for para in web_page_paragraph_contents:
            if not ('https:' in str(para.text)):
                text = text + str(para.text).strip()
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_crawler(self):
        for i in range(1,2000):
            try:
                target_url = self.crawl_queue.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Crawling URL: {}".format(target_url))
                    self.current_scraping_url = "{}".format(target_url)
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

    def info(self):
        print('Crawled pages are: ', self.scraped_pages, '\n')

    def save_to_text(self):
        file = open('link_data.txt', 'x', encoding="UTF-8")
        for i in self.scraped_pages:
            file.write(i+'\n')
        file.close()
        print('File Created')

# Giving a seed link to crawl the webpages

In [4]:
es = Crawler("https://medium.com/")
es.run_crawler()    
es.info()
es.save_to_text()

Crawling URL: https://medium.com/
Crawling URL: https://medium.com/about?autoplay=1
Crawling URL: https://medium.com/membership
Crawling URL: https://about.medium.com/creators/
Crawling URL: https://medium.com/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2F&source=--------------------------lo_home_nav-----------
Crawling URL: https://medium.com/m/signin?operation=register&redirect=https%3A%2F%2Fmedium.com%2F&source=--------------------------lo_home_nav-----------
Crawling URL: https://medium.com/@x_TomCooper_x?source=home---------0---------------------2e6ea8c8_9be9_4a0c_9128_333094b012cc-------7
Crawling URL: https://medium.com/@x_TomCooper_x/ukraine-war-19-december-2022-d7b25da87e10?source=home---------0---------------------2e6ea8c8_9be9_4a0c_9128_333094b012cc-------7
Crawling URL: https://medium.com/@tedbauer?source=home---------1---------------------2e6ea8c8_9be9_4a0c_9128_333094b012cc-------7
Crawling URL: https://tedbauer.medium.com/university-of-idaho-murders-now-it

# Getting all the links in URL to list

In [8]:
lis = []
lenstr = len("https://medium.com/")
with open('link_data.txt','r') as f:
    for line in f:
        k = line.strip()
        if k[:lenstr] ==  'https://medium.com/':
            lis.append(line.strip())
print(len(lis))

646


# Dowloading all web pages with tags p (a.k.a. Paragraph)

In [9]:
def downloader(link):
    req = requests.get(link)
    req.encoding = "utf8"
    return req.text

k=0
lenstr = len("https://medium.com/")
for i in range(len(lis)):
    if lis[i][:lenstr]=="https://medium.com/":
        name = f'temper/'+"doc"+str(k)+".txt"
        contents = downloader(lis[i])
        soup = BeautifulSoup(contents, "html5lib")
        res = ''
        for i in range(len(soup.find_all('p'))):
            text = soup.find_all('p')[i].get_text()
            res = res + text
        with open(name,'a', encoding="UTF-8") as f1:
            f1.write(res)
            k+=1
        f1.close()
f1.close()

# Removing all text files of scraped sites which are empty

In [14]:
k = 2
for i in range(2, 646):
    if os.path.getsize(f'temper\doc{i}.txt') > 0:
        os.rename(f'temper\doc{i}.txt', f'temper\{k}.txt')
        k+=1
    else:
        os.remove(f'temper\doc{i}.txt')

# Getting all files in the directory of files

In [4]:
data_dir = 'temper/'
DATA_SET_DIR = abspath(data_dir)
print('\nGetting List of text files from' + DATA_SET_DIR)
files = listdir(DATA_SET_DIR)
print('\nFile list retrieved from ' + DATA_SET_DIR)


Getting List of text files fromd:\B.Tech\Semester - VII\CSE 419 Information Retrieval\Project\Docs

File list retrieved from d:\B.Tech\Semester - VII\CSE 419 Information Retrieval\Project\Docs


# Getting Porter Stemming and stopwords

In [8]:
ps = PorterStemmer()
nltk_stop_words = set(stopwords.words('english'))

# Getting all the text after pre processing into list

In [1]:
corpus = []
for f in files:
	strm = open(DATA_SET_DIR + '/' + f, 'r', encoding="utf8")
	words = word_tokenize(strm.read())
	words = filter(lambda w: w not in nltk_stop_words, words)
	words = map(lambda w: ps.stem(str(w)), words)
	corpus.append(' '.join(words))

NameError: name 'files' is not defined

# Getting tf-idf and making vectors

In [10]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Making the Matrix of Cosine Similarity

In [11]:
cosine_matrix = cosine_similarity(tfidf_matrix)

# Printing the cosine similarty matrix

In [12]:
print(cosine_matrix)

[[1.         0.         0.00358598 ... 0.15183363 0.         0.01024462]
 [0.         1.         0.04733177 ... 0.         0.09065896 0.03864335]
 [0.00358598 0.04733177 1.         ... 0.00478615 0.07078646 0.02634958]
 ...
 [0.15183363 0.         0.00478615 ... 1.         0.         0.04862246]
 [0.         0.09065896 0.07078646 ... 0.         1.         0.04791486]
 [0.01024462 0.03864335 0.02634958 ... 0.04862246 0.04791486 1.        ]]


# Saving the cosine similarty matrix

In [13]:
cosine = []
for idx, row in enumerate(cosine_matrix):
    cosine.append(str(row))
with open('cosine_similarity.txt', 'w') as f:
    for line in cosine:
        f.write(f"{line}\n")

# Taking and Dividing the Corpus and to search

In [14]:
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# Taking the Query from the user and and using bm25 to rank the documents

In [15]:
query = input()
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)

# Getting the top 10 documents of the Query

In [16]:
pi = bm25.get_top_n(tokenized_query, corpus, n=10)
index = [corpus.index(i) for i in pi]

# The files retrived

In [17]:
index

[291, 376, 44, 1021, 559, 988, 295, 577, 165, 345]

# The Content of top 10 Documents

In [18]:
print(pi)

["- widget : live score & news- watch app- dark mode- # askcricinfofollow cricket brand new espncricinfo app go easi & provid seamless experi web app . the app light , fast widest coveragediscov espncricinfo ’ content app . widest cricket coverag across world includ ipl , psl , bpl , bbl , cpl , wbbl , the ash , icc cricket world cup , t20 world cup , the hundr , counti championship , ranji trophi , sheffield shield.get : - fast live score ball ball commentary- notif updat live cricket matches- easi read latest cricket news- cricket video includ highlight , analysi , interview , press conferences- get expert opinion daniel vettori , robin uthappa , muttiah muralitharan , gautam gambhir , tom moodi , sanjay manjrekar , ajit agarkar , aakash chopra , deep dasgupta & more.visit websit http : //www.espncricinfo.com follow us @ espncricinfo twitter like us http : //www.facebook.com/cricinfo latest updates.term use - http : //disneytermsofuse.com/privaci polici - http : //www.disneyprivacyce