In [1]:
import requests
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm
from IPython.display import clear_output

### Links collection

In [2]:
class WikiParser():
    """Collect article links from a wiki by walking its category listing pages.

    Starting at ``start_url`` (a paginated listing of categories), every
    non-empty category is opened, the links found in its ``mw-pages`` section
    are appended to ``out_file``, and subcategories are followed according to
    the chain rules described in :meth:`extract_batch_articles`.

    Parameters
    ----------
    start_url : str
        First category-listing page to read.
    domain : str
        Scheme and host prepended to the relative ``href`` values found in pages.
    out_file : str
        Text file that category markers and article links are appended to.
    timeout : float
        Seconds to wait for each HTTP request before treating it as failed.
    """

    def __init__(self,
                 start_url='https://pa.wikipedia.org/w/index.php?title=ਖ਼ਾਸ:ਸ਼੍ਰੇਣੀਆਂ&dir=prev&offset=19ਵੀਂ_ਸਦੀ_ਵਿੱਚ_ਆਇਰਲੈਂਡ&limit=500',
                 domain='https://pa.wikipedia.org',
                 out_file='wiki_links_pa.txt',
                 timeout=30):
        self.domain = domain
        self.page_links = set()      # every article link saved so far
        self.out_file = out_file
        self.start_url = start_url
        self.current_chain = []      # category URLs visited since the last reset
        self.cycle = False           # set when a URL repeats inside current_chain
        self.timeout = timeout

    def get_html_page(self, url):
        """Return the body of ``url`` as text, or ``None`` if the request fails.

        Only ``requests`` errors are swallowed, so Ctrl-C (KeyboardInterrupt)
        still stops a long crawl instead of being silently ignored.
        """
        try:
            response = requests.get(url, timeout=self.timeout)
            return response.text if response.ok else None
        except requests.RequestException:
            return None

    def get_next_link(self, soup):
        """Return the absolute URL of the pager's "next" link, or ``None``.

        ``None`` is returned when the navigation bar, the next-link anchor or
        its ``href`` is missing (i.e. on the last listing page).
        """
        nav_bar = soup.find('div', {'class': 'mw-pager-navigation-bar'})
        if nav_bar is None:
            return None
        next_anchor = nav_bar.find('a', {'class': 'mw-nextlink'})
        if next_anchor is None:
            return None
        href = next_anchor.get('href')
        return self.domain + href if href else None

    def exctract_article_links(self, soup):
        """Return the set of absolute article URLs listed in ``mw-pages``.

        When the section exists, the cycle flag and the current chain are
        reset as a side effect. List items without a link are skipped.
        """
        pages = soup.find('div', {'id': 'mw-pages'})
        if pages is None:
            return set()

        links = set()
        for item in pages.find_all('li'):
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                links.add(self.domain + href)

        self.cycle = False
        self.current_chain = []
        return links

    def _chain_has_repeat(self):
        """Tell whether any URL occurs more than once in ``current_chain``."""
        return len(set(self.current_chain)) < len(self.current_chain)

    def extract_batch_articles(self, url):
        """Save the new article links of category ``url`` and follow subcategories.

        The URL is appended to ``current_chain`` before fetching. Because
        :meth:`exctract_article_links` empties the chain whenever a category
        lists articles, subcategories are only followed while the chain is
        non-empty; a repeated URL in the chain is treated as a cycle and stops
        the traversal of the current category.

        NOTE(review): traversal is recursive, so a very deep subcategory chain
        could hit Python's recursion limit.
        """
        print(url)
        self.current_chain.append(url)
        html = self.get_html_page(url)
        if not html:
            return
        # An explicit parser keeps parsing identical across machines.
        soup = BeautifulSoup(html, 'html.parser')

        new_links = self.exctract_article_links(soup).difference(self.page_links)
        self.save_new_links_batch(new_links)
        self.page_links = self.page_links.union(new_links)
        self.log()

        subcategories = soup.find('div', {'id': 'mw-subcategories'})
        if subcategories is None:
            return

        for group in subcategories.find_all('div', {'class': 'mw-category-group'}):
            for tag in group.find_all('a'):
                if not self.current_chain:
                    continue
                if self._chain_has_repeat():
                    self.cycle = True
                    break
                href = tag.get('href')
                if href:
                    self.extract_batch_articles(self.domain + href)
            if self.cycle:
                self.cycle = False
                self.current_chain = []
                break

    def get_all_category_links(self, url, soup=None):
        """Return absolute URLs of the usable categories on a listing page.

        Parameters
        ----------
        url : str
            Listing page to download when ``soup`` is not supplied.
        soup : BeautifulSoup, optional
            Already parsed listing page; passing it avoids a second download.

        Returns
        -------
        list of str
            Category URLs, skipping entries whose text contains '(0 ਮੈਂਬਰ)'
            and links containing 'redlink'. Empty when the page cannot be
            fetched or has no ``mw-spcontent`` block.
        """
        if soup is None:
            html = self.get_html_page(url)
            if not html:
                return []
            soup = BeautifulSoup(html, 'html.parser')

        content = soup.find('div', {'class': 'mw-spcontent'})
        if content is None:
            return []

        category_links = []
        for tag in content.find_all('li'):
            anchor = tag.find('a')
            link = anchor.get('href') if anchor is not None else None
            if not link:
                continue
            if '(0 ਮੈਂਬਰ)' in tag.text or 'redlink' in link:
                continue
            category_links.append(self.domain + link)

        return category_links

    def get_all_page_links(self):
        """Walk every listing page from ``start_url`` and process its categories.

        For each category a ``{{{title}}}`` marker is written, then its
        articles are collected. The loop ends when a listing page cannot be
        fetched or has no "next" link.
        """
        url = self.start_url
        while url:
            html = self.get_html_page(url)
            if not html:
                break
            soup = BeautifulSoup(html, 'html.parser')

            # Reuse the parsed page instead of downloading it a second time.
            for link in self.get_all_category_links(url, soup=soup):
                title = link.split('/')[-1]
                self.save_current_category(title)
                self.extract_batch_articles(link)

            url = self.get_next_link(soup)

    def save_current_category(self, title):
        """Append a ``{{{title}}}`` marker line to the output file."""
        with open(self.out_file, mode='a', encoding='utf-8') as f:
            f.write('{{{' + title + '}}}\n')

    def save_new_links_batch(self, links_batch):
        """Append every link of ``links_batch`` to the output file, one per line.

        Links are written in sorted order so repeated runs give the same file.
        """
        with open(self.out_file, mode='a', encoding='utf-8') as f:
            for link in sorted(links_batch):
                f.write(link + '\n')

    def log(self):
        """Replace the cell output with the running count of collected links."""
        clear_output(wait=True)
        print('Total num of links:', len(self.page_links))

In [3]:
# Build a WikiParser with its default settings (Punjabi Wikipedia start page,
# output appended to 'wiki_links_pa.txt') and crawl every category listing page.
# This is a long-running, network-bound cell.
Parser = WikiParser()
Parser.get_all_page_links()

Total num of links: 2
https://pa.wikipedia.org/wiki/%E0%A8%B8%E0%A8%BC%E0%A9%8D%E0%A8%B0%E0%A9%87%E0%A8%A3%E0%A9%80:%E0%A8%AE%E0%A9%8C%E0%A8%A4_1190



KeyboardInterrupt



### Article text collection

In [15]:
class ArticlesCollection():
    """Download the pages listed in a links file and keep their raw text.

    Parameters
    ----------
    page_links : iterable of str, optional
        Links to start with; copied, so the caller's object is never mutated.
    filename : str
        File read by :meth:`make_list_from_file`, one link per line.
    timeout : float
        Seconds to wait for each HTTP request before treating it as failed.
    """

    def __init__(self, page_links=None, filename='./wiki_links_pa.txt', timeout=30):
        # A fresh list per instance: a shared mutable default would leak links
        # between instances, and the argument used to be ignored entirely.
        self.page_links = list(page_links) if page_links else []
        self.filename = filename
        self.dirty_corpus = []   # one "link<>title<>text" string per page
        self.timeout = timeout

    def get_html_page(self, url):
        """Return the body of ``url`` as text, or ``None`` if the request fails.

        Only ``requests`` errors are swallowed, so Ctrl-C (KeyboardInterrupt)
        still interrupts a long download loop.
        """
        try:
            response = requests.get(url, timeout=self.timeout)
            return response.text if response.ok else None
        except requests.RequestException:
            return None

    def make_list_from_file(self):
        """Append every link found in ``self.filename`` to ``self.page_links``.

        Blank lines and lines containing ``{{{`` (category markers) are
        skipped. A final line without a trailing newline is kept intact.
        Calling this twice appends the file's links twice.
        """
        with open(self.filename, mode='r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.rstrip('\n')
                if not line:
                    continue
                if '{{{' not in line:
                    self.page_links.append(line)

    def make_raw_corpus(self, num_links=None):
        """Fetch pages and append ``link<>title<>text`` records to ``dirty_corpus``.

        Parameters
        ----------
        num_links : int, optional
            Process only the first ``num_links`` links; ``None`` means all.

        Pages that cannot be fetched, or that lack the ``firstHeading`` title
        or the ``bodyContent`` block, are skipped instead of aborting the run.
        """
        links_processed = self.page_links if num_links is None else self.page_links[:num_links]
        for link in tqdm(links_processed):
            html = self.get_html_page(link)
            if not html:
                continue
            # An explicit parser keeps parsing identical across machines.
            soup = BeautifulSoup(html, 'html.parser')

            heading = soup.find('h1', {'id': 'firstHeading'})
            body = soup.find('div', {'id': 'bodyContent', 'class': 'vector-body'})
            if heading is None or body is None:
                continue

            paragraphs = ' '.join(tag.text for tag in body.find_all('p'))
            self.dirty_corpus.append('<>'.join((link, heading.text, paragraphs)))

In [16]:
# Collector using the default links file ('./wiki_links_pa.txt').
PagesCollector = ArticlesCollection()

# First load the links from the file into PagesCollector.page_links, then
# download each page and append its record to PagesCollector.dirty_corpus.
PagesCollector.make_list_from_file()
PagesCollector.make_raw_corpus()

  0%|          | 0/36852 [00:00<?, ?it/s]

### Saving raw texts with "^^^^^^" separator

In [89]:
# Write every collected record followed by the "^^^^^^" separator.
# UTF-8 is set explicitly: the records contain non-ASCII text, and the platform
# default encoding is not guaranteed to be able to encode it.
with open('dirty_corp_wiki_pa.txt', 'w', encoding='utf-8') as f:
    for line in PagesCollector.dirty_corpus:
        f.write(line)
        f.write('^^^^^^')