In [2]:
from urllib.parse import urlparse, urljoin, unquote
import pandas as pd
import wikipedia
from bs4 import BeautifulSoup
from fastcore.parallel import *
from functools import partial
from time import sleep
wikipedia.set_lang('or')


In [3]:
import logging
import traceback

def setup_logging(log_file: str = "app.log"):
    # Create a custom logger
    logger = logging.getLogger("LinkExtractor")
    logger.setLevel(logging.INFO)  # Capture only INFO and above (INFO, WARNING, ERROR, CRITICAL)

    # Prevent logger duplication
    if not logger.hasHandlers():
        # Create file handler to log to a file
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)  # File handler also captures only INFO and above

        # Create log format with function name included
        log_format = logging.Formatter('%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
        file_handler.setFormatter(log_format)

        # Add handler to the logger
        logger.addHandler(file_handler)

    return logger

# Step 2: Set up logging to a file only, suppressing output in the notebook
logger = setup_logging(log_file="notebook_logs.log")

# Step 3: Example usage
logger.info("This is an INFO log message.")
logger.debug("This DEBUG message won't appear in INFO mode.")
logger.warning("This is a WARNING log message.")

In [4]:
!ls

Untitled-1.ipynb   get_all_odia_web_pages.ipynb  process_wikipedia_web2md.ipynb
content_links.csv  notebook_logs.log		 temp.md


In [4]:
df = pd.read_csv('content_links.csv')
df.head

<bound method NDFrame.head of                                                    links
0               https://or.wikipedia.org/wiki/0_(number)
1                      https://or.wikipedia.org/wiki/100
2                     https://or.wikipedia.org/wiki/1000
3                     https://or.wikipedia.org/wiki/1001
4                     https://or.wikipedia.org/wiki/1002
...                                                  ...
38106  https://or.wikipedia.org/wiki/‌‌କେନ୍ଦ୍ରାପଡ଼ା_ଜ...
38107      https://or.wikipedia.org/wiki/‌‌ନଗର,_ରାଜସ୍ଥାନ
38108        https://or.wikipedia.org/wiki/‌‌ନଗରକୁର୍ନୁଲ୍
38109  https://or.wikipedia.org/wiki/‌‌ନାଗରୀ,_ଆନ୍ଧ୍ରପ...
38110        https://or.wikipedia.org/wiki/﻿﻿Abelisaurus

[38111 rows x 1 columns]>

In [5]:
pg = urlparse(df.iloc[0]['links']).path.split('/wiki/')[1]
pg

'0_(number)'

In [6]:
ignore_txts = [
    'ଏହି ଲେଖାଟି ବଟ୍ ବ୍ୟବହାର କରି ତିଆରି କରାଯାଇଛି ଏବଂ ଏହା ପର୍ଯ୍ୟାୟ କ୍ରମେ ଉନ୍ନତି ହେଉଅଛି । ଏଣୁ ଏହି ଛାଞ୍ଚ ନଉଠିବା ପର୍ଯ୍ୟନ୍ତ କୌଣସି ବଦଳ କରନ୍ତୁ ନାହିଁ । ଯଦି ଆପଣ କୌଣସି ଗୁରୁତ୍ତ୍ୱପୂର୍ଣ୍ଣ ତ୍ରୁଟି ଆପଣଙ୍କ ଦୃଷ୍ଟିଗୋଚର ହେଉଅଛି ତେବେ ପୃଷ୍ଠା ଆଲୋଚନା‌ରେ ସୂଚିତ କରନ୍ତୁ ।',
    """୧ ଘଟଣା

୧.୧ ଜାନୁଆରୀ-ମାର୍ଚ୍ଚ
୧.୨ ଅପ୍ରେଲ-ଜୁନ
""", # for removing page like https://or.wikipedia.org/wiki/%E0%AD%A7%E0%AD%A6%E0%AD%A6
    ]

In [8]:
import re
from functools import reduce

# Define patterns for removing markdown and wrapped substrings
MARKDOWN_PATTERNS = [
    (re.compile(r'==+.*?==+', re.MULTILINE), ''),  # Remove substrings wrapped in === or ==    
    (re.compile(r'\s+\{.*\}\s*', re.DOTALL), ''),  # Remove LaTeX-style expressions

    (re.compile(r'\n[A-Za-z0-9,.\'\"!?;:()⋅−\-\{\}= \s]+\n', re.MULTILINE), ''),  # Remove lines with only English text
    (re.compile(r'[\n\t]+'), r'\n'),  # Remove multiple newlines

]



def remove_markdown(text, log_steps=False):
    if log_steps  : print(f"Original Text:\n{text}")
    
    def apply_and_print(text, pattern, log_steps=False):
        updated_text = pattern[0].sub(pattern[1], text)
        if log_steps  : print(f"After applying pattern {pattern[0].pattern}:\n{updated_text}\n") 
        return updated_text
    
    # Apply all the patterns defined in MARKDOWN_PATTERNS and print after each substitution
    final_text = reduce(lambda t, pattern: apply_and_print(t, pattern), MARKDOWN_PATTERNS, text)
    
    if log_steps  :  print(f"Final Text:\n{final_text}")
    return final_text


In [14]:
def all_strings_in_base(txt):
    """
    takes in html string and check if ignore_texts is present 
    by using beutifulsoup4
    """
    soup = BeautifulSoup(txt, 'html.parser')
    txt =  soup.get_text()
    
    return any(substr in txt for substr in ignore_txts)

def find_ref_pos(txt):
    #print(f"{txt=}")
    return txt[:txt.find('== ଆଧାର ==')]
    
def extract_txt(url):
    try:
        logger.info(f"for url :{url}")

        pg = urlparse(url).path.split('/wiki/')[1]
        logger.info(f"got title {pg}")

        page = wikipedia.page(url)


        #check if given page have ignore_txt
        if all_strings_in_base(page.html()):
            return ''
        

        content = page.content

        #find the index of '== ଆଧାର =='
        if '== ଆଧାର ==' in content:
            content = find_ref_pos(content)

        # for removing '\n\n\n== ଘଟଣା ==\n\n\n==='
        if '==' in content:
            content = remove_markdown(content)
        
        logger.info(f"size of content {len(content)}")
        sleep(0.25)
        return content
    
    except Exception as e:
        logger.error(f"Error getting following siblings: {str(e)}")
        logger.error(traceback.format_exc())
        return "!!!ERROR!!!"
        


As everthing after `== ଆଧାର ==` is like external links and refernece so removing those.

In [10]:
links = df['links'].to_list()
len(links) == len(df)

True

In [13]:
extract_txt(links[0])

"ବାସ୍ତବ ସଂଖ୍ୟା ସମୂହରେ ଶୁନ୍ୟ (୦)  ହେଉଛି ଏକ ଯୁଗ୍ମ ସଂଖ୍ୟା ।ଶୁନ୍ୟଦ୍ୱାରା ଯେ କୌଣସି ସଂଖ୍ୟାକୁ ଗୁଣନ କଲେ, ଗୁଣଫଳ ସର୍ବଦା ଶୁନ୍ୟ ହୁଏ ।\nଶୁନ୍ୟଦ୍ୱାରା ଭାଗଫଳ ଏକ ଅସମ୍ଭବ ପ୍ରକ୍ରିୟା ।\nଯେ କୌଣସି ସଂଖ୍ୟାରେ ଶୁନ୍ୟ (୦) ଯୋଗ କଲେ ମୂଳ ସଂଖ୍ୟାର ମୂଲ୍ୟରେ ପରିବର୍ତ୍ତନ ହୁଏ ନାହିଁ । ଅତଏବ ଶୁନ୍ୟ (୦)କୁ ଯୋଗାତ୍ମକ ଅଭେଦ କୁହାଯାଏ ।'"

In [17]:
texts = parallel(extract_txt, links, progress=True, n_workers=8)

In [None]:
data = {
    'link': links,
    'text': texts
}
data_df = pd.DataFrame.from_dict(data)
data_df.head()
data_df.to_csv("odia_wiki.csv", index=False )