In [1]:
import pandas as pd
from bs4 import BeautifulSoup as soup
import requests
import re

In [2]:
# Column order of the DataFrame returned by scrape_glassdoor_to_df.
_REVIEW_COLUMNS = ['review_date', 'rating', 'cur_status', 'empl_position', 'tenure_yrs',
                   'office_loc', 'review_title', 'pros', 'cons']

# Seconds to wait on each HTTP request before giving up instead of hanging the kernel.
_REQUEST_TIMEOUT_S = 30


def _normalize_tenure(raw_tenure):
    """Shorten a tenure phrase for display, e.g. 'more than 1 year' -> '>1'."""
    without_unit = re.sub(r' year[s]*', '', raw_tenure)
    return without_unit.replace('more than ', '>').replace('less than ', '<')


def _parse_review_page(html_soup):
    """Extract the review fields from one parsed review page.

    Each field is collected with its own page-wide search, so the lists are
    positionally aligned only when every review on the page exposes every field.

    Parameters
    ----------
    html_soup : BeautifulSoup
        Parsed HTML of a single review page.

    Returns
    -------
    dict
        Maps each name in ``_REVIEW_COLUMNS`` to the list of values found on the page.
    """
    rating = [float(tag.text)
              for tag in html_soup.find_all('span', {'class': 'ratingNumber mr-xsm'})]
    # Reviewers write bullet lists as "\r\n-" separated lines; flatten them into sentences.
    pros = [tag.text.replace('\r\n-', '. ')
            for tag in html_soup.find_all('span', {'data-test': 'pros'})]
    cons = [tag.text.replace('\r\n-', '. ')
            for tag in html_soup.find_all('span', {'data-test': 'cons'})]

    # One span holds "<status> Employee, <tenure>"; the tenure part is optional.
    cur_status = []
    tenure_yrs = []
    for tag in html_soup.find_all('span', {'class': 'pt-xsm pt-md-0 css-1qxtz39 eg4psks0'}):
        parts = tag.text.split(', ')
        cur_status.append(parts[0].replace(' Employee', ''))
        tenure_yrs.append(_normalize_tenure(parts[1] if len(parts) > 1 else ''))

    # The original passed a set literal here; a dict states the class filter explicitly.
    review_title = [tag.text for tag in html_soup.find_all('a', {'class': 'reviewLink'})]

    # One span holds "<date> - <position>". Split on the first separator only so a
    # position that itself contains " - " is kept whole; a missing separator yields ''.
    review_date = []
    empl_position = []
    for tag in html_soup.find_all(
            'span', {'class': 'authorJobTitle middle common__EiReviewDetailsStyle__newGrey'}):
        date_text, _, position = tag.text.partition(' - ')
        review_date.append(pd.to_datetime(date_text))
        empl_position.append(position)

    # The location, when present, follows a non-breaking space + "in" in the author line.
    office_loc = []
    for tag in html_soup.find_all('span', {'class': 'authorInfo'}):
        _, marker, location = tag.text.partition('\xa0in')
        office_loc.append(location if marker else '')

    return {
        'review_date': review_date,
        'rating': rating,
        'cur_status': cur_status,
        'empl_position': empl_position,
        'tenure_yrs': tenure_yrs,
        'office_loc': office_loc,
        'review_title': review_title,
        'pros': pros,
        'cons': cons,
    }


def scrape_glassdoor_to_df(html_snippet, num_pages):
    """Scrape Glassdoor review pages for one employer into a DataFrame.

    Parameters
    ----------
    html_snippet : str
        Employer part of the review URL, inserted between
        ``https://www.glassdoor.com/Reviews/`` and ``.htm``.
    num_pages : int
        Number of review pages to fetch, starting at page 1. Values below 1
        fetch nothing and give an empty frame.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns review_date, rating, cur_status,
        empl_position, tenure_yrs, office_loc, review_title, pros and cons.

    Notes
    -----
    One HTTP GET is issued per page. The HTTP status code is not checked, so a
    page without the expected markup simply contributes no rows. Rows are
    assembled page by page; if the field lists of a page differ in length, that
    page is truncated to the shortest list without affecting other pages.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

    rows = []
    for page in range(1, num_pages + 1):
        # Page 1 has no suffix; later pages are addressed as "<snippet>_P<n>.htm".
        suffix = '' if page == 1 else f'_P{page}'
        r = requests.get(f'https://www.glassdoor.com/Reviews/{html_snippet}{suffix}.htm',
                         headers=headers, timeout=_REQUEST_TIMEOUT_S)
        page_columns = _parse_review_page(soup(r.text, 'lxml'))
        # Every page is parsed into fresh lists, so no state leaks from one page
        # to the next (the tenure column used to be re-appended on each page).
        rows.extend(zip(*(page_columns[name] for name in _REVIEW_COLUMNS)))

    return pd.DataFrame(rows, columns=_REVIEW_COLUMNS)

In [3]:
# Fetch the first 10 review pages for the Adobe employer slug (one HTTP request per page).
df = scrape_glassdoor_to_df(html_snippet='Adobe-Reviews-E1090', num_pages=10)

In [4]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   review_date    100 non-null    datetime64[ns]
 1   rating         100 non-null    float64       
 2   cur_status     100 non-null    object        
 3   empl_position  100 non-null    object        
 4   tenure_yrs     100 non-null    object        
 5   office_loc     100 non-null    object        
 6   review_title   100 non-null    object        
 7   pros           100 non-null    object        
 8   cons           100 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 7.2+ KB
None


In [5]:
# A bare last expression uses the notebook's rich table display instead of a
# wrapped plain-text dump.
df.head()

  review_date  rating cur_status                     empl_position tenure_yrs  \
0  2022-07-05     5.0     Former  Sales Development Representative         <1   
1  2022-07-03     5.0    Current                 Software Engineer         <1   
2  2022-06-29     5.0    Current            Senior Project Manager         <1   
3  2022-06-30     5.0    Current        Senior Business Consultant              
4  2022-06-26     5.0    Current                       UX Designer         >1   

         office_loc                                       review_title  \
0   Los Angeles, CA                         Awesome culture and people   
1      New York, NY  Adobe is a company that appreciates their empl...   
2        Dallas, TX                                        Great Place   
3       Raleigh, NC                                Great place to work   
4          Lehi, UT             Loved it, just didn't get paid enough.   

                                                pros  \
0  Managers 