In [40]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
from collections import OrderedDict

# Root of the site; relative review links are appended to it when fetching.
HOST = 'https://bookmix.ru'
# Address of the review listing section.
URL = f'{HOST}/comments/'
# HTTP headers passed to every requests.get call made through get_html.
HEADERS = {'User-Agent': 'Here was my user-agent address'}

In [115]:
# a func that fetches a url with the shared user-agent headers and optional query parameters
def get_html(url, params='', timeout=30):
  """Send a GET request to ``url`` and return the response.

  Args:
    url: Address to fetch.
    params: Query parameters forwarded to ``requests.get``; the default
      empty string adds nothing to the URL.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` can block forever on a stalled connection,
      which would freeze a long scraping run.

  Returns:
    The ``requests.Response`` object. The HTTP status code is not checked
    here; callers receive error pages as ordinary responses.

  Raises:
    requests.exceptions.RequestException: on connection failures or when
      the timeout expires.
  """
  return requests.get(url, headers=HEADERS, params=params, timeout=timeout)

# a func that acquires full version of reviews via "read more" links
def get_full_reviews(soup):
  """Fetch the full text of every review linked from a listing page.

  Every ``<a>`` whose href contains ``/discussion.phtml?id=`` and a
  ``#comment<digits>`` anchor is followed (duplicates are dropped, first
  occurrence wins). On the target page the text of the ``comment-content``
  element inside the ``div`` with id ``comment<digits>`` is extracted.

  Args:
    soup: Parsed listing page (anything exposing ``find_all('a')``).

  Returns:
    A list of review texts in link order. Reviews that could not be
    downloaded or located are skipped, so the list may be shorter than the
    number of links; callers pairing it with per-review labels must check
    the lengths.
  """
  # De-duplicate hrefs while keeping the order in which they appear.
  hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
  discussion_links = OrderedDict.fromkeys(
      href for href in hrefs if '/discussion.phtml?id=' in str(href)
  )

  # Pair each link with the numeric id of the comment it points to; links
  # without a '#comment<digits>' anchor are ignored rather than crashing.
  targets = []
  for link in discussion_links:
    anchor_match = re.search(r'#comment(\d+)', link)
    if anchor_match:
      targets.append((link, anchor_match.group(1)))

  full_reviews = []
  for link, comment_id in targets:
    try:
      # The request is inside the try so network failures skip one review
      # instead of aborting the whole page.
      response = get_html(HOST + link)
      review_page = bs(response.content, 'lxml', from_encoding='utf-8')
      comment = review_page.find('div', class_="item-comment level1", id='comment' + comment_id)
      text = comment.find(class_='comment-content').get_text(strip=True)
    except (AttributeError, requests.exceptions.RequestException) as error:
      # AttributeError: the expected element is missing (find returned None).
      # RequestException: connection, chunked-encoding or timeout problems.
      print(f"Skipping review {link}: {error!r}")
      continue
    full_reviews.append(text)
  return full_reviews

def _star_rating(tag):
  """Return the star rating found in ``tag`` as a one-character string."""
  if tag is not None:
    css_classes = tag.get('class') or []
    if isinstance(css_classes, str):
      css_classes = css_classes.split()
    # Prefer the explicit 'star<digit>' class so digits in other attributes
    # or nested markup cannot be mistaken for the rating.
    for css_class in css_classes:
      star_match = re.match(r'star([0-5])', css_class)
      if star_match:
        return star_match.group(1)
  # Fallback: first digit 0-5 anywhere in the serialised tag; '0' when there
  # is no digit at all (including when no rating element was found).
  digits = re.findall('[0-5]', repr(tag))
  return digits[0] if digits else '0'


def _sentiment_label(rating):
  """Map a star rating (digit string) to 'positive', 'neutral' or 'negative'."""
  stars = int(rating)
  if stars > 3:
    return 'positive'
  if stars == 3:
    return 'neutral'
  return 'negative'


def get_sentiments(items):
  """Convert the "rating stars" of each review block into a sentiment label.

  Labels are 'negative' (2 stars or lower), 'neutral' (3 stars) and
  'positive' (4 stars or higher).

  Args:
    items: Iterable of parsed review blocks; each must expose ``find``.

  Returns:
    A list with one label per item, in the same order.

  NOTE(review): a block without a detectable rating counts as 0 stars and is
  therefore labelled 'negative' - confirm this is the intended treatment of
  unrated reviews.
  """
  sentiments = []
  for item in items:
    # First div carrying the 'rating' or 'disabled' class. The former third
    # entry 'star[0-5]{1}' was matched as a literal class name, never as a
    # regular expression, so it had no effect and is dropped.
    rating_tag = item.find('div', {'class': ['rating', 'disabled']})
    sentiments.append(_sentiment_label(_star_rating(rating_tag)))
  return sentiments

In [None]:
# Iterate over listing pages FIRST_PAGE..LAST_PAGE of "https://bookmix.ru/comments/"
# and collect both the full review texts and their sentiment labels.
#
# Since the website was crashing after approximately 500 pages (20 reviews each),
# all the 80 000 reviews had to be parsed in small batches and put together
# into a complete dataset afterwards.
REVIEWS_PER_PAGE = 20
FIRST_PAGE, LAST_PAGE = 1, 500
LISTING_URL = "https://bookmix.ru/comments/index.phtml?begin={begin}&num_point={size}&num_points={size}"

total_reviews = []
total_sentiments = []
skipped_pages = []  # pages left out because of errors or inconsistent counts

for page in range(FIRST_PAGE, LAST_PAGE + 1):
  print(f"Parsing page number {page}")
  # NOTE(review): the first offset requested is 20, so offset 0 is never
  # fetched - confirm whether `begin` is zero-based on the site.
  # A separate name is used so the URL constant is not overwritten.
  page_url = LISTING_URL.format(begin=page * REVIEWS_PER_PAGE, size=REVIEWS_PER_PAGE)
  try:
    response = get_html(page_url)
  except requests.exceptions.RequestException as error:
    # One unreachable page should not discard everything scraped so far.
    print(f"Skipping page {page}: {error!r}")
    skipped_pages.append(page)
    continue
  soup = bs(response.content, 'lxml', from_encoding='utf-8')
  items = soup.find_all('div', class_="universal-blocks")

  page_reviews = get_full_reviews(soup)
  page_sentiments = get_sentiments(items)

  # Reviews and labels are paired by position later on; if the counts differ
  # the pairing for this page (and every page after it) would be wrong.
  if len(page_reviews) != len(page_sentiments):
    print(f"Skipping page {page}: {len(page_reviews)} reviews "
          f"but {len(page_sentiments)} sentiment labels")
    skipped_pages.append(page)
    continue

  total_reviews.extend(page_reviews)
  total_sentiments.extend(page_sentiments)
print("Parsing is finished.")

In [130]:
# Write the first batch of 500 pages of book reviews into a file with the
# respective name.
# zip() stops at the shorter list, so a length mismatch would silently drop
# entries and may pair reviews with the wrong labels - report it.
if len(total_reviews) != len(total_sentiments):
  print(f"Warning: {len(total_reviews)} reviews but {len(total_sentiments)} sentiment labels; "
        "rows are paired by position and the surplus entries are dropped.")
df = pd.DataFrame(list(zip(total_reviews, total_sentiments)), columns=['Review', 'Sentiment'])
df.to_csv(r'Reviews1-500.csv', sep=',', encoding='utf-8', index=False)

In [None]:
import os
import glob
import pandas as pd
# set working directory
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
os.chdir("/Users/urijzuzaev/Desktop/reviews_dataset")

# Name of the merged file; it lives in the same directory as the batches.
OUTPUT_FILENAME = "combined_dataset.csv"

# Find all the batch names and write them to a list. The merged file itself
# is excluded, otherwise re-running this cell would feed the previous result
# back in and duplicate every row. Sorting makes the row order reproducible.
extension = 'csv'
all_filenames = sorted(
    name for name in glob.glob('*.{}'.format(extension)) if name != OUTPUT_FILENAME
)
print(all_filenames)

# Combine all the batches parsed during the previous stage into one dataset.
if all_filenames:
  combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
  combined_csv.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8')
else:
  # pd.concat raises on an empty list; say what is wrong instead.
  print("No batch files found; nothing to combine.")