In [2]:
import requests
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import re
import pandas as pd
import numpy as np
from collections import defaultdict
import time
import pickle

from IPython.display import clear_output

In [3]:
# Functions for getting links

def get_drug_exp_list_url(soup):
    """
    Gets the 'Next' link for a drug_experience page
    if it exists. Tacks on extra 0s to the max display
    to show all links. Use to display and collect all
    links for a drug/experience combo.
    """
    
    next_page_exists = soup.find(alt='next')
    
    if next_page_exists is not None:
        list_url_original = soup.find(alt='next').find_previous('a')['href']
        list_url = list_url_original.split('Start')[0] + 'Start=0&Max=10000' # set the range to display all links
        return 'https://erowid.org/' + list_url
    else:
        return None

def get_drug_exp_links(soup):
    """
    Gets list of links to actual experience reports
    """
    exp_links = [link['href'] for link in soup.find_all('a') if 'exp.php' in str(link)]
    
    return exp_links

In [4]:
# Function for souping a page

def get_soup(url):
    headers = requests.utils.default_headers()
    response = requests.get(url, headers = headers)
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    except:
        print('Denied! Response code: ', str(response.status_code))

In [5]:
# Functions for scraping text and other fields from experience pages

def get_title(soup):
    """
    Get report title
    """
    try:
        title = soup.find('title').text.split('-')[2].strip("''").strip(" '")
    except:
        title = None
    
    return title

def get_body(soup, demarc='xq'*50):
    """
    Add demarcators before and after text body,
    strip on the demarcators to isolate the body
    
    demarc: arbitrary string not present in the soup (default 'xq' * 50)
    """
    try:
        soup.find(text=re.compile('Start Body')).replace_with(demarc)
        soup.find(text=re.compile('End Body')).replace_with(demarc)
        body = soup.text.split(demarc)[1]
    except:
        body = None
    
    return body

def get_bodyweight(soup):
    """
    Get author body weight
    """
    try:
        weight_text = soup.find(class_='bodyweight-amount').text
        bodyweight = int(weight_text.split(' ')[0])
    except:
        bodyweight = np.nan
    
    return bodyweight

def get_age(soup):
    """
    Get author age
    """
    
    try:
        age = soup.find(class_='footdata').find(text=re.compile('Age at time of experience')).split(':')[1].strip()
        try:
            age = int(age)
        except:
            age = np.nan
    except:
        age = np.nan
    
    return age

    
def get_gender(soup):
    """
    Get author gender
    """
    
    g = ['Male', 'Female']
    try:
        gender_long = soup.find(class_='footdata').find(text=re.compile('Gender')).split(':')[1].strip()
        if gender_long in g:
            gender = gender_long[0]
        elif 'not' in gender_long.lower(): 
            gender = None # may be 'not given', 'not specified' -- set to None
        else:
            gender = gender_long # for non-binary/other genders
    except:
        gender = None
        
    return gender

def get_year(soup):
    """
    Get year experience report was submitted
    """
    try:
        year = soup.find(class_='footdata').find(text=re.compile('Exp Year')).split(':')[1].strip().replace(',', '')
        year = int(year)
    except:
        year = np.nan

    return year
        
def get_views(soup):
    """
    Get number of page views
    """
    
    try:
        views = soup.find(class_='footdata').find(text=re.compile('Views')).split(':')[1].strip().replace(',', '')
        views = int(views)
    except:
        views = np.nan
    
    return views

def get_exp_id(soup):
    """
    Get id of experience, use to control for duplicates
    """
    
    try:
        exp_id = soup.find(class_='footdata').find(text=re.compile('ExpID')).split(':')[1].strip()
        exp_id = int(exp_id)
    except:
        exp_id = np.nan
        
    return exp_id

In [6]:
# Main scraping script

main_url_prefix = 'https://www.erowid.org/experiences/subs/'

collected = ['Amphetamines', 'Belladonna', 'Cocaine', 'Datura', 'DMT', 'Ketamine', 
             'LSD', 'MDMA' 'Mushrooms', 'Nitrous_Oxide', 'Opiods', 'PCP', 'Peyote', 'Salvia_divinorum']
#drug_list = ['Amphetamines', 'Benzodiazepines', 'Belladonna', 'Cannabis', 'Cocaine',
#             'Datura', 'DMT', 'Ketamine', 'LSD', 'MDMA', 'Mushrooms', 'Nitrous_Oxide',
#             'Opioids', 'PCP', 'Peyote', 'Salvia_divinorum']

#exp_list = ['General', 'Bad_Trips', 'Glowing_Experiences', 'Mystical_Experiences']
drug_list = ['Cannabis']
exp_list = ['Glowing_Experiences', 'Mystical_Experiences']

### DRUG LOOP ###
for drug in drug_list:
    dicts = [] # list for collecting dictionaries
    #delays = [7, 4, 6, 2, 19, 10] # list of possible delay times after each beer scrape
    delays = [.5]
    
    ### EXPERIENCE TYPE LOOP ###
    for exp in exp_list:
        print('Now scraping ' + drug + ': ' + exp)
        
        exp_url_prefix = 'https://erowid.org/experiences/'
        main_url = main_url_prefix + 'exp_' + drug + '_' + exp + '.shtml'
        drug_exp_soup = get_soup(main_url)
        
        if drug_exp_soup.text[:9] == 'not found': # skip iteration if drug_exp category not present
            next
        else:
            exp_list_url = get_drug_exp_list_url(get_soup(main_url)) 
            if exp_list_url is None:
                exp_list_url = main_url # if no 'next' link at bottom of main, just use main
                exp_url_prefix = 'http://erowid.org/' # if getting links from main_url, no need for 'experiences'
                
            exp_links = get_drug_exp_links(get_soup(exp_list_url))
            
            count50 = 0
            save_count = 0
            
            ### EXPERIENCE REPORT LOOP ###
            #exp_links = exp_links[4:]
            for ix, link in enumerate(exp_links):
                count50 += 1
                exp_report_link = exp_url_prefix + link 
                exp_report_soup = get_soup(exp_report_link) # the actual experience report soup
                
                d = defaultdict() # for collecting attributes
                
                # Get body text and attributes
                d['body'] = get_body(exp_report_soup)
                d['drug'] = drug
                d['experience'] = exp
                d['title'] = get_title(exp_report_soup)
                d['bodyweight'] = get_bodyweight(exp_report_soup)
                d['age'] = get_age(exp_report_soup)
                d['gender'] = get_gender(exp_report_soup)
                d['year'] = get_year(exp_report_soup)
                d['views'] = get_views(exp_report_soup)
                d['exp_id'] = get_exp_id(exp_report_soup)
                
                dicts.append(d)
                    
                if count50 < 50 and (ix +1) != len(exp_links):
                    # Choose random delays from list for all scrapes not multiples of 50
                    delay = np.random.choice(delays)
                    print(ix)
                    
                elif (count50 == 50) or ((ix + 1) == len(exp_links)):
                    # After every 50th scrape, or if reached the last experience link, 
                    # increment the save_count, convert the dict to a dataframe and pickle. 
                    # Then take a minute delay to avoid getting booted off
                    count50 = 0
                    save_count += 1
                
                    experiences_df = pd.DataFrame(dicts)
                    dicts = []
                
                    pickle.dump(experiences_df, open('data/' + drug + '_' + exp + 
                                          '_' + str(save_count) + '_raw_df' + '.pkl', 'wb'))
                    
                    delay = 1
                    clear_output()
                    print('Scraped ' + str(save_count) + ' experience sets from ' + drug + ': ' + exp)
                
                if delay==1:
                    print('Taking break...')
                time.sleep(delay)

Now scraping Cannabis: Glowing_Experiences
Now scraping Cannabis: Mystical_Experiences


In [169]:
ix

3

In [181]:
drug_exp_soup.text

'\n\nThe Vaults of Erowid : 403 - Blocked\n\n\n403 Forbidden: Blocked\n\n\n\n  \nYour browser / IP address has been blocked from accessing erowid.org.\n\nYour IP address (your "IP"): 98.14.93.207 \n\n\nThere are several reasons this may have happened:\n\n\nYou or someone else coming from your IP might have tried to load too many pages in too short a time.\nYour IP might have tried to inappropriately access files indicative of probing.\nYou tried to use an offline viewer, spider, or scraper that accessed more then 5,000 pages in a single day.\n\n\nIf you would like to have your IP unblocked, please email us the following information and include an explanation if you have one:\n\n\n  I got a 403 Forbidden message and can\'t access erowid.org.\n  I think I might have been blocked because [YOUR EXPLANATION HERE]\n\n  \nHere\'s my info:\n\nI.P. Address: 98.14.93.207\nUser-Agent: python-requests/2.22.0\nDate: Thursday, 15-Aug-2019 17:14:08 PDT\nNotice: blackhole.erowid.org\n\n\n\n\n\nLast Mo

In [134]:
exp_links

['/experiences/exp.php?ID=89668']

In [156]:
exp

'Mystical_Experiences'

In [145]:
df = pd.read_pickle('data/PCP_Mystical_Experiences_1_raw_df.pkl')

In [146]:
df

Unnamed: 0,body,drug,experience,title,bodyweight,age,gender,year,views,exp_id
0,"\nPCP is, perhaps, one of the most troubling d...",PCP,Mystical_Experiences,Boom,150,,M,2007,26272,68668
1,"\nIn my long history of drug use, this experie...",PCP,Mystical_Experiences,I Am God!,120,,M,1995,54654,55358
2,\nThis is the story of two young boys out for ...,PCP,Mystical_Experiences,A Lesson in Karma,130,,M,2004,21755,41977


In [137]:
d

defaultdict(None,
            {'body': None,
             'drug': 'Peyote',
             'experience': 'Bad_Trips',
             'title': None,
             'bodyweight': nan,
             'age': nan,
             'gender': None,
             'year': nan,
             'views': nan,
             'exp_id': nan})

In [172]:
exp_report_link

'https://erowid.org/experiences/exp.php?ID=110580'

In [171]:
get_soup(exp_report_link)

KeyboardInterrupt: 

In [86]:
exp_list_url

'https://www.erowid.org/experiences/subs/exp_Belladonna_Mystical_Experiences.shtml'

In [87]:
exp_links

['/experiences/exp.php?ID=48411', '/experiences/exp.php?ID=17696']

In [139]:
exp_report_link

'http://erowid.org//experiences/exp.php?ID=89668'