In [1]:
import requests
from bs4 import BeautifulSoup
from string import ascii_uppercase #alphabet
from  nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import pandas as pd
import re
import random
from collections import Counter
from tqdm import tqdm
import time 
import concurrent.futures as cp

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Scraping readlighnovel.me to create a light novel dataset

## Scraping novel list page to get links to all available novels

In [2]:
START_URL = 'https://www.readlightnovel.me/novel-list'

In [3]:
# get a list of urls to different light novels
novel_links = []
for letter in ascii_uppercase:
  # Handle all of the letter names
  r = requests.get(START_URL+'/'+letter, stream=True)
  soup = BeautifulSoup(r.text, 'html.parser')
  li_s = [a.find('a') for a in soup.find_all('li')]
  for li in li_s:
    if li.has_attr('data-wrapper'):
      novel_links.append(li.get('href'))

# For non-letter first character the website simply uses the / so we get those too
r = requests.get(START_URL+'/', stream=True)
soup = BeautifulSoup(r.text, 'html.parser')
li_s = [a.find('a') for a in soup.find_all('li')]
for li in li_s:
  if li.has_attr('data-wrapper'):
    novel_links.append(li.get('href'))

In [4]:
# Number of links/ novels we got
len(novel_links)

5681

## For each novel we plan to extract:
* Name
* Description
* Number of chapters
* Rating
* Genre
* Views
* Random Chapter?



### Trying for one novel first

In [5]:
# Which novel are we getting:
link = novel_links[0]
print(link)

https://www.readlightnovel.me/a-barbaric-proposal


In [6]:
r = requests.get(link, stream=True)
soup = BeautifulSoup(r.text, 'html.parser')

#### Title

In [7]:
# title
print(soup.find('div', {'class':'block-title'}).text)


A Barbaric Proposal



#### Detail
I was going for just genre, but the website made it super convenient to get a bunch of metadata, so I just grab it all

In [8]:
# Getting all of the details
details = soup.findAll('div', {'class':'novel-detail-item'})
detail_content = {}
for detail in details:
  try:
    # Match header and content into a dict
    header = detail.find('div',{'class': 'novel-detail-header'}).text.strip()
    detail_content[header] = detail.find('div',{'class': 'novel-detail-body'}).text.strip().split('\n')
    
    # breaks if we go after rating so we break for loop
  except:
    #print(detail.findAll('a'))
    None

In [9]:
detail_content

{'Alternative Names': ['ABP 야만의 청혼'],
 'Artist(s)': ['N/A'],
 'Author(s)': ['齐成琨'],
 'Description': ['“I already have a child with another man.”',
  'It was a lie made in order to refuse the proposal of the leader of the most feared mercenary company in the entire country. But his response was unexpected.',
  '“It doesn’t matter who the child is. Just birth it.”',
  'She couldn’t understand.',
  'What could this man possibly want with the princess of a failing nation?'],
 'Genre': ['Drama', 'Fantasy', 'Josei', 'Romance'],
 'Language': ['Korean'],
 'Latest Chapters': ['Chapter 52',
  ' Chapter 51',
  ' Chapter 50',
  ' Chapter 49',
  ' Chapter 48'],
 'Rating': ['7.5'],
 'Status': ['Ongoing'],
 'Tags': ['N/A'],
 'Total Views': ['24791'],
 'Type': ['Web Novel'],
 'Year': ['N/A'],
 'You May Also Like': ['Totem(Ongoing)',
  'The Eldest Daughter is Beautiful and Saucy(Ongoing)',
  'Quick Transmigration: Male Lead, You’re Overpowered?(Ongoing)',
  'I’m Only a Stepmother, but My Daughter is Ju

#### Chapter count

In [10]:
# Getting the latest chapter number 
detail_top = soup.findAll('div', {'class':'novel-detail-body'})
chap_count = int(re.findall(r'([0-9]+)', detail_top[-1].find('a').get('href'))[-1])
print(chap_count)

52


#### Content from a random chapter

In [11]:
# Generate a number from 1 to chapter count and get the content of that chapter
rand_chapter = random.randint(1, chap_count)
print(rand_chapter)
rand_chapter = 51
r = requests.get(link+'/chapter-'+str(rand_chapter), stream=True)
soup = BeautifulSoup(r.text, 'html.parser')

# since content is copyrighted we extract only word counts
content = [par.text for par in soup.find('div', {'class':'desc'}).findAll('p')]
# Join to string, extract all of the unicode characters, and remove stopwords
content = [word for word in re.findall(r'\w+', ' '.join(content)) if word not in stopwords.words('english')]
# get counts of the output
content = Counter(content).most_common(100)
content

4


[('Rienne', 273),
 ('Black', 225),
 ('I', 207),
 ('like', 66),
 ('face', 45),
 ('What', 42),
 ('would', 39),
 ('She', 36),
 ('hand', 36),
 ('eyes', 33),
 ('It', 33),
 ('know', 33),
 ('felt', 33),
 ('But', 33),
 ('Mrs', 30),
 ('Flambard', 30),
 ('And', 30),
 ('lips', 30),
 ('one', 30),
 ('He', 30),
 ('Princess', 27),
 ('sleep', 27),
 ('close', 27),
 ('The', 27),
 ('right', 24),
 ('feeling', 24),
 ('words', 24),
 ('think', 24),
 ('back', 24),
 ('You', 24),
 ('That', 21),
 ('bed', 21),
 ('could', 21),
 ('saying', 21),
 ('said', 21),
 ('talking', 21),
 ('things', 21),
 ('away', 21),
 ('blanket', 21),
 ('time', 18),
 ('head', 18),
 ('something', 18),
 ('If', 18),
 ('still', 18),
 ('pulled', 18),
 ('wanted', 18),
 ('found', 18),
 ('kiss', 18),
 ('Then', 15),
 ('need', 15),
 ('So', 15),
 ('really', 15),
 ('Well', 15),
 ('much', 15),
 ('coming', 15),
 ('turned', 15),
 ('took', 15),
 ('anything', 15),
 ('heart', 15),
 ('already', 15),
 ('going', 15),
 ('say', 15),
 ('worried', 15),
 ('together'

### Repeat for all of the novels

In [12]:
def get_link(link, N_common=100):
  r = requests.get(link, stream=True)
  soup = BeautifulSoup(r.text, 'html.parser')
  novel_data = {}

  # name
  novel_data['title'] = soup.find('div', {'class':'block-title'}).text.strip()

  # details
  details = soup.findAll('div', {'class':'novel-detail-item'})
  for detail in details:
    try:
      # Match header and content into a dict
      header = detail.find('div',{'class': 'novel-detail-header'}).text.strip()
      novel_data[header] = detail.find('div',{'class': 'novel-detail-body'}).text.strip().split('\n')
      
      # breaks if we go after rating so we break for loop
    except:
      # printing to keep track of what we missed
      #print(detail.findAll('a'))
      None

  # Chapter Count
  try:
    detail_top = soup.findAll('div', {'class':'novel-detail-body'})
    # We added a small hack here for novels that end in non-numbered chapters
    # Drawback being that if we try to get the content of these chapters we will get errors
    novel_data['chap_count'] = int(re.findall(r'([0-9]+)', detail_top[-1].findAll('a')[-3].get('href'))[-1])+2

    # Random chapter counter
    novel_data['rand_chapter'] = random.randint(1, chap_count) #inclusive so we are good :)

    r = requests.get(link+'/chapter-'+str(novel_data[rand_chapter]), stream=True)
    soup = BeautifulSoup(r.text, 'html.parser')
  except:
    None
      

  try:
    # since content is copyrighted we extract only word counts
     content = [par.text for par in soup.find('div', {'class':'desc'}).findAll('p')]
    # Join to string & extract all of the unicode characters
     content = [word for word in re.findall(r'\w+', ' '.join(content)) if word not in stopwords.words('english')]
    # get counts of the output
     novel_data['content_counts'] = Counter(content).most_common(N_common)
  except:
     None
  time.sleep(30)
  return novel_data

In [None]:
novels = []

# Thread the grabbing of data
with cp.ThreadPoolExecutor() as ex:
  # We iterate over links instead of using map to avoid data loss if connection fails
  futures = []
  for link in novel_links:
    futures.append(ex.submit(get_link, link))

  for future in cp.as_completed(futures):
    novels.append(future.result())


In [None]:
df = pd.DataFrame(novels)
df.head()

In [None]:
df.to_csv('/content/drive/MyDrive/light_novel_dataset.csv')