In [67]:
import csv
import datetime
import os
import signal
import string
import time
from contextlib import contextmanager
from multiprocessing import Pool
from multiprocessing import cpu_count

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm

In [68]:
@contextmanager
def timeout(duration):
    """Abort the enclosed block if it runs longer than ``duration`` seconds.

    Implemented with ``signal.alarm`` / ``SIGALRM``, so it is only usable
    where that signal exists and where ``signal.signal`` may be called
    (the main thread of the process).

    Args:
        duration: Whole number of seconds to allow before raising.

    Raises:
        TimeoutError: Raised from inside the ``with`` body when the alarm
            fires. ``TimeoutError`` is an ``Exception`` subclass, so callers
            that catch ``Exception`` keep working.
    """
    def timeout_handler(signum, frame):
        raise TimeoutError(f'block timedout after {duration} seconds')

    # Remember whatever handler was installed so it can be put back afterwards.
    previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(duration)
    try:
        yield
    finally:
        # Always disarm the alarm, even when the body raised; otherwise a
        # pending alarm would fire later in unrelated code.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)

In [69]:
def scroll_down(driver):
    """Keep scrolling to the bottom of the page until it stops growing.

    After each scroll the function sleeps 3 seconds, then re-reads
    ``document.body.offsetHeight``. It stops when the height is unchanged
    from the previous pass, or when re-reading the height raises any
    exception (including the 20-second ``timeout`` guard firing).

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).

    Returns:
        The same ``driver`` that was passed in.
    """
    read_height_js = "return document.body.offsetHeight"
    jump_to_bottom_js = "window.scrollTo(0, document.body.offsetHeight);"

    # Baseline height before the first scroll.
    previous = driver.execute_script(read_height_js)

    while True:
        print("scrolling...")
        driver.execute_script(jump_to_bottom_js)

        # Give lazily-loaded content time to arrive.
        print("waiting")
        time.sleep(3)

        try:
            with timeout(20):
                current = driver.execute_script(read_height_js)
        except Exception:
            print('stopped scrolling - time')
            return driver

        if current == previous:
            print('stopped scrolling - reached end')
            return driver

        previous = current

In [70]:
def get_all_author_profile_hyperlink(hyperlink):
    """Collect the profile URL of every poet listed on an index page.

    Opens ``hyperlink`` in a new Chrome session, scrolls until the page
    stops growing, then reads the ``href`` of the first ``<a>`` inside each
    element with class ``poetNameDatePlace``.

    Args:
        hyperlink: URL of the poet index page to load.

    Returns:
        list[str]: Non-empty profile URLs, in page order. Entries without an
        anchor, or whose ``href`` is missing or empty, are skipped.
    """
    print("[DEBUG] getting all author links...")
    driver = webdriver.Chrome()

    try:
        driver.get(hyperlink)

        # scroll to load entire page
        driver = scroll_down(driver)

        # get author hyperlinks
        author_profile_hyperlinks = []
        elements = driver.find_elements(By.CLASS_NAME, "poetNameDatePlace")

        for element in tqdm(elements):
            anchors = element.find_elements(By.TAG_NAME, 'a')
            if not anchors:
                continue
            author_profile = anchors[0].get_attribute('href')
            # Keep only real, non-empty URLs (the old `!= None or ...` test
            # let empty strings through).
            if author_profile:
                author_profile_hyperlinks.append(author_profile)
    finally:
        # End the browser session even if loading or scraping failed.
        driver.quit()

    return author_profile_hyperlinks

In [71]:
def get_ghazal_hyperlinks_from_author_page(author_page_hyperlink):
    """Collect ghazal URLs from a poet's ``/ghazals`` page using Selenium.

    Loads ``<author_page_hyperlink>/ghazals`` in a new Chrome session and
    gathers the ``href`` of every ``<a>`` found inside elements with class
    ``rt_bodyTitle``.

    Args:
        author_page_hyperlink: Poet profile URL, without a trailing slash.

    Returns:
        list[str]: Every non-``None`` ``href``, in page order.
    """
    print("[DEBUG] Getting ghazal hyperlinks for author: ", author_page_hyperlink.split("/")[-1])
    driver = webdriver.Chrome()

    try:
        driver.get('{}/ghazals'.format(author_page_hyperlink))

        author_ghazal_hyperlinks = []
        for element in driver.find_elements(By.CLASS_NAME, 'rt_bodyTitle'):
            for anchor in element.find_elements(By.TAG_NAME, 'a'):
                # Read the attribute once; each read is a round trip to the browser.
                href = anchor.get_attribute('href')
                if href is not None:
                    author_ghazal_hyperlinks.append(href)
    finally:
        # End the browser session even if loading or scraping failed.
        driver.quit()

    return author_ghazal_hyperlinks

In [72]:
def get_beautiful_ghazal_hyperlinks_from_author_page(author_page_hyperlink):
    """Collect ghazal URLs from a poet's ``/ghazals`` page using requests + BeautifulSoup.

    Fetches ``<author_page_hyperlink>/ghazals`` and, for every
    ``<div class="rt_bodyTitle">``, takes the ``href`` of its second ``<a>``.

    Args:
        author_page_hyperlink: Poet profile URL, without a trailing slash.

    Returns:
        list[str]: The collected hrefs, in page order. Title blocks with
        fewer than two anchors, or whose second anchor has no ``href``, are
        skipped instead of aborting the whole author.
    """
    print("[DEBUG] Getting ghazal hyperlinks for author: ",
          author_page_hyperlink.split("/")[-1])

    r = requests.get('{}/ghazals'.format(author_page_hyperlink))
    soup = BeautifulSoup(r.content, 'html5lib')

    author_ghazal_hyperlinks = []
    for element in soup.findAll('div', class_='rt_bodyTitle'):
        anchors = element.findAll('a')
        # The ghazal link is read from the second anchor.
        # NOTE(review): presumably the first anchor is a non-ghazal link —
        # confirm against the page markup.
        if len(anchors) < 2:
            continue
        href = anchors[1].get('href')
        if href:
            author_ghazal_hyperlinks.append(href)

    return author_ghazal_hyperlinks

In [73]:
def get_ghazal_from_hyperlink(hyperlink):
    """Scrape one ghazal in three scripts using Selenium.

    For each of ``'en'``, ``'hi'`` and ``'ur'`` a fresh Chrome session loads
    the page (``hyperlink`` itself for ``'en'``, ``hyperlink + '?lang=<code>'``
    otherwise) and concatenates the text of every ``<span>`` inside every
    ``<p>`` inside every element with class ``c``.

    A ``"\\n"`` is appended after a ``<p>`` and ``"\\n\\n"`` after a class-``c``
    element only while the most recently visited span had non-empty text.

    Args:
        hyperlink: URL of the ghazal page.

    Returns:
        dict[str, str]: Mapping of language code (``'en'``, ``'hi'``,
        ``'ur'``) to the assembled text.
    """
    complete_ghazals = {
        'en': '',
        'hi': '',
        'ur': ''
    }
    for lang in complete_ghazals:
        driver = webdriver.Chrome()
        try:
            if lang == 'en':
                driver.get(hyperlink)
            else:
                driver.get(hyperlink+'?lang='+lang)

            double_lines = driver.find_elements(By.CLASS_NAME, 'c')

            # Tracks whether the last span seen carried text; it is
            # deliberately NOT reset per line or per couplet, matching the
            # established separator behaviour.
            check_word_text = False
            for double_line in double_lines:
                for line in double_line.find_elements(By.TAG_NAME, 'p'):
                    for word in line.find_elements(By.TAG_NAME, 'span'):
                        # Read the text once; each read is a round trip to the browser.
                        word_text = word.text
                        check_word_text = not (word_text is None or word_text == '')
                        complete_ghazals[lang] += word_text
                    if check_word_text:
                        complete_ghazals[lang] += "\n"
                if check_word_text:
                    complete_ghazals[lang] += "\n\n"
        finally:
            # End this language's browser session even if scraping failed.
            driver.quit()
    return complete_ghazals


In [74]:
def get_beautiful_ghazal_from_hyperlink(hyperlink):
    """Fetch one ghazal in three scripts using requests + BeautifulSoup.

    The page is requested once per language: ``hyperlink`` as-is for
    ``'en'``, and ``hyperlink + '?lang=<code>'`` for ``'hi'`` and ``'ur'``.
    From each response, the text of every ``<p>`` inside every
    ``<div class="c">`` is taken, followed by ``"\\n"``; each such div is
    closed with ``"\\n\\n"``.

    Args:
        hyperlink: URL of the ghazal page.

    Returns:
        dict[str, str]: Language code (``'en'``, ``'hi'``, ``'ur'``) mapped
        to the assembled text (empty string if no matching divs were found).
    """
    ghazal_by_language = {}

    for language in ('en', 'hi', 'ur'):
        url = hyperlink if language == 'en' else hyperlink + '?lang=' + language
        response = requests.get(url)
        page = BeautifulSoup(response.content, 'html5lib')

        # Gather fragments and join once at the end of each language.
        fragments = []
        for couplet in page.findAll("div", class_="c"):
            for paragraph in couplet.findAll('p'):
                fragments.append(paragraph.get_text())
                fragments.append("\n")
            fragments.append("\n\n")

        ghazal_by_language[language] = ''.join(fragments)

    return ghazal_by_language


In [75]:
def save_ghazal_in_file(f_name, big_data_dict):
    """Write scraped ghazal rows to a CSV file.

    The file gets a header row with the columns ``author_name``,
    ``ghazal_name``, ``language`` and ``ghazal``, followed by one row per
    non-``None`` entry of ``big_data_dict``.

    Args:
        f_name: Destination path (a ``str``). Missing parent directories
            are created.
        big_data_dict: Iterable of row dicts keyed by the four column names;
            ``None`` entries are skipped.
    """
    parent_dir = os.path.dirname(f_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline='' is what the csv module requires so that line endings and
    # newlines embedded in the ghazal text are written correctly; UTF-8 is
    # set explicitly because the rows hold Hindi and Urdu text.
    with open(f_name, 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, ["author_name", "ghazal_name", "language", "ghazal"])
        w.writeheader()
        w.writerows(row for row in big_data_dict if row is not None)

    print("[DEBUG] Saved on file... "+f_name)

In [76]:
def get_all_ghazals_for_poets_whose_name_start_with_char(alphabet='a', thresh=100):
    """Scrape every ghazal of every poet whose name starts with ``alphabet``.

    Poet profile links are read from the rekhta.org poet index for the
    letter, each poet's ghazal links are fetched, and each ghazal is
    downloaded in all languages returned by
    ``get_beautiful_ghazal_from_hyperlink``. Rows are written to
    ``data/ghazals_<alphabet>_<chunk><timestamp>.csv`` every ``thresh``
    successfully scraped ghazals, and once more at the end for the remainder.

    An exception while processing a poet is logged and the loop moves on to
    the next poet; ghazals already collected for that poet are kept.

    Args:
        alphabet: Letter(s) the poet names start with; must be alphabetic.
        thresh: Number of ghazals per output file.

    Raises:
        ValueError: If ``alphabet`` is not alphabetic.
    """
    print("[DEBUG] Getting ghazals for poets whose name start with '{}'".format(alphabet))

    # `isalpha` must be *called*; the bare method object is always truthy.
    if not alphabet.isalpha():
        raise ValueError("Use an alphabet")

    def _chunk_file_name(chunk_number):
        # Same naming scheme as before: chunk number immediately followed by a timestamp.
        return ('data/ghazals_' + alphabet + '_' + str(chunk_number)
                + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.csv')

    pending_rows = []
    count = 0          # ghazals scraped successfully so far
    files_written = 0

    poet_index_url = "https://www.rekhta.org/poets?startswith=" + alphabet.upper()
    author_profile_hyperlinks = get_all_author_profile_hyperlink(poet_index_url)

    for author_profile_link in tqdm(author_profile_hyperlinks):
        author_name = author_profile_link.split('/')[-1]

        try:
            ghazal_hyperlinks = get_beautiful_ghazal_hyperlinks_from_author_page(author_profile_link)

            for ghazal_link in ghazal_hyperlinks:
                ghazal_name = ghazal_link.split('/')[-1]
                ghazals = get_beautiful_ghazal_from_hyperlink(ghazal_link)

                for lang in ghazals:
                    pending_rows.append({
                        "author_name": author_name,
                        "ghazal_name": ghazal_name,
                        "language": lang,
                        "ghazal": ghazals[lang]})

                # Flush AFTER storing the ghazal, so the ghazal that fills a
                # chunk is written rather than skipped.
                count += 1
                if count % thresh == 0:
                    save_ghazal_in_file(_chunk_file_name(count // thresh), pending_rows)
                    files_written += 1
                    pending_rows = []
        except Exception as e:
            # Format the exception; concatenating it to a str raises TypeError.
            print("[DEBUG] Exception encountered: {}".format(e))

    # Write the remainder. When nothing is pending, only write if no file
    # exists yet for this letter, so a just-written full chunk is never
    # overwritten by a header-only file with the same name.
    if pending_rows or files_written == 0:
        save_ghazal_in_file(_chunk_file_name(count // thresh), pending_rows)

In [77]:
def save_files(pool_flag=False):
    """Run the ghazal scraper for every letter ``A``-``Z``.

    Args:
        pool_flag: When true, the letters are distributed over a
            ``multiprocessing.Pool`` of ``cpu_count() * 2`` workers. When
            false (the default), the letters are processed one after another
            in the current process.
    """
    alphabets = list(string.ascii_uppercase)

    if pool_flag:
        print("creating pools...")
        # The context manager shuts the workers down once `map` has returned
        # (or raised), so no worker processes are left behind.
        with Pool(cpu_count()*2) as pool:
            print("multi pool run...")
            pool.map(get_all_ghazals_for_poets_whose_name_start_with_char,
                     alphabets)
    else:
        print("one pool run...")
        for al in alphabets:
            get_all_ghazals_for_poets_whose_name_start_with_char(al)

In [None]:
# Start the scrape for every letter; pool_flag=False selects the sequential,
# single-process loop rather than the multiprocessing pool.
save_files(False)

one pool run...
[DEBUG] Getting ghazals for poets whose name start with 'A'
[DEBUG] getting all author links...
scrolling...
waiting
scrolling...
waiting
scrolling...
waiting
scrolling...
waiting


  if sys.path[0] == '':


stopped scrolling - reached end


100%|██████████| 1176/1176 [00:16<00:00, 69.46it/s]
  0%|          | 0/1176 [00:00<?, ?it/s]

[DEBUG] Getting ghazal hyperlinks for author:  a-g-josh


  0%|          | 1/1176 [00:31<10:18:13, 31.57s/it]

[DEBUG] Getting ghazal hyperlinks for author:  a-r-sahil-aleeg


  0%|          | 2/1176 [00:37<5:25:11, 16.62s/it] 

[DEBUG] Getting ghazal hyperlinks for author:  a-d-azhar


  0%|          | 3/1176 [00:38<3:05:45,  9.50s/it]

[DEBUG] Getting ghazal hyperlinks for author:  a-hameed


  0%|          | 4/1176 [00:40<2:04:40,  6.38s/it]

[DEBUG] Getting ghazal hyperlinks for author:  a-khayyam-1


  0%|          | 5/1176 [00:41<1:27:29,  4.48s/it]

[DEBUG] Getting ghazal hyperlinks for author:  adrahi


  1%|          | 6/1176 [00:42<1:04:55,  3.33s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aabid-adeeb


  1%|          | 7/1176 [01:03<2:58:02,  9.14s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aabid-jafri


  1%|          | 8/1176 [01:20<3:46:35, 11.64s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aabid-umar


  1%|          | 9/1176 [01:40<4:36:21, 14.21s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aabida-urooj


  1%|          | 10/1176 [01:42<3:24:43, 10.53s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aabidullah-ghazi


  1%|          | 11/1176 [01:44<2:30:57,  7.77s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aadarsh-dubey


  1%|          | 12/1176 [02:04<3:46:44, 11.69s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aadil-aseer-dehlvi


  1%|          | 13/1176 [02:06<2:46:26,  8.59s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aadil-rasheed


  1%|          | 14/1176 [02:08<2:09:35,  6.69s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aadil-raza-mansoori


  1%|▏         | 15/1176 [02:23<2:58:25,  9.22s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aafaque-siddiqui


  1%|▏         | 16/1176 [02:26<2:17:54,  7.13s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aaftab-lakhnvi


  1%|▏         | 17/1176 [02:27<1:42:10,  5.29s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aaftab-rais-panipati


  2%|▏         | 18/1176 [02:28<1:18:56,  4.09s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aaga-nisaar


  2%|▏         | 19/1176 [02:30<1:08:58,  3.58s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aagha-akbarabadi


  2%|▏         | 20/1176 [03:27<6:17:51, 19.61s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aagha-babar


  2%|▏         | 21/1176 [03:28<4:29:32, 14.00s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aah-sambhali
[DEBUG] Saved on file... data/ghazals_A_120220309185330.csv


  2%|▏         | 22/1176 [03:53<5:33:37, 17.35s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aajiz-hengan-ghati


  2%|▏         | 23/1176 [04:01<4:37:17, 14.43s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aajiz-meer-puri


  2%|▏         | 24/1176 [04:02<3:18:34, 10.34s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aakif-ghani


  2%|▏         | 25/1176 [04:08<2:52:56,  9.01s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aalam-nizami


  2%|▏         | 26/1176 [04:26<3:44:52, 11.73s/it]

[DEBUG] Getting ghazal hyperlinks for author:  aale-ahmad-suroor
