In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Download the first page of the forum thread and parse it into a soup object.
url = 'https://forums.edmunds.com/discussion/7526/general/x/midsize-sedans-2-0'
page = requests.get(url)
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would otherwise surface later as a confusing "NoneType" error.
page.raise_for_status()
soup = BeautifulSoup(page.text, "html.parser")

In [3]:
# Extract info from each comment and save it to a dataframe.
#
# Starting from the already-parsed first page in `soup`, walk the thread page
# by page, collecting one record per comment until `total_comment` comments
# have been gathered or there is no further "Next" link to follow.
total_comment = 10000 # input

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end: growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append no longer exists in pandas >= 2.0.
records = []

counter = 1
while counter <= total_comment:
    parents = soup.find_all("div", class_="Comment") # scrape all comments on this page
    for item in parents:
        if counter > total_comment:
            # Do not overshoot the requested number of comments mid-page.
            break
        user = item.find("span", class_="Author").text # get userid
        # The date is read from the `title` attribute of the <time> tag.
        date = item.find("span", class_="MItem DateCreated").find("time").attrs['title'] # get date
        comment = item.find("div", class_="Message userContent").text # get comment
        records.append({"Counter": counter, "Date": date, "User": user, "Comment": comment})
        counter += 1

    if counter > total_comment:
        # Enough comments collected; skip the needless request for the next page.
        break

    # Move to the next page. On the last page there is no "Next" link, so stop
    # cleanly instead of failing on a missing element.
    next_button = soup.find("span", class_="BeforeCommentHeading")
    next_anchor = next_button.find("a", {"class": "Next"}) if next_button is not None else None
    next_page_link = next_anchor.get("href") if next_anchor is not None else None
    if not next_page_link:
        break
    page = requests.get(next_page_link)
    # Surface HTTP errors rather than treating an error page as the end of the thread.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

df = pd.DataFrame(records, columns=["Counter", "Date", "User", "Comment"])

In [4]:
# Display the raw scraped frame (Counter, Date, User, Comment) before any cleaning.
df

Unnamed: 0,Counter,Date,User,Comment
0,1,"April 11, 2007 6:52PM",\nmotownusa,\nHi Pat:You forgot the Chrysler Sebring
1,2,"April 11, 2007 7:33PM",\nexshoman,\nI'm sure some folks would appreciate having ...
2,3,"April 12, 2007 6:51AM",\ntargettuning,\nYou can try to revive this topic but without...
3,4,"April 12, 2007 8:43AM",\npat,\nModel vs. model is exactly what we're here f...
4,5,"April 13, 2007 11:49AM",\nperna,\nThe Altima is my favorite of the bunch. It i...
...,...,...,...,...
9995,9996,"July 24, 2008 9:06AM",\nigozoomzoom,\nIt's quite possible that the 2010 Fusion/Mil...
9996,9997,"July 24, 2008 9:07AM",\nmoocow1,\nOf course plans don't mean reality. I expect...
9997,9998,"July 24, 2008 9:27AM",\nakirby,"\nThese aren't ""plans"" - the cars hit the fact..."
9998,9999,"July 24, 2008 9:33AM",\nthegraduate,"\nIn my head, a nameplate's sales are a namepl..."


## Data cleaning

Delete \n and any blank spaces from the dataframe

In [5]:
# Work on a copy so the raw scraped frame `df` stays untouched.
df_cleaned = df.copy()

num_comments = len(df)

# Remove the newlines / blank spaces surrounding the scraped text.
# Vectorized `.str.strip()` replaces the per-row `[1:-1]` slicing, which
# (a) relied on chained-indexing assignment (`frame[col][i] = ...`), which
#     raises SettingWithCopyWarning and does not write through under pandas
#     copy-on-write, and
# (b) dropped the first and last character unconditionally, even when they
#     were not whitespace.
df_cleaned['User'] = df_cleaned['User'].str.strip()
df_cleaned['Comment'] = df_cleaned['Comment'].str.strip()

# Date to date format
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])

df_cleaned

Unnamed: 0,Counter,Date,User,Comment
0,1,2007-04-11 18:52:00,motownusa,Hi Pat:You forgot the Chrysler Sebring
1,2,2007-04-11 19:33:00,exshoman,I'm sure some folks would appreciate having th...
2,3,2007-04-12 06:51:00,targettuning,You can try to revive this topic but without b...
3,4,2007-04-12 08:43:00,pat,Model vs. model is exactly what we're here for...
4,5,2007-04-13 11:49:00,perna,The Altima is my favorite of the bunch. It is ...
...,...,...,...,...
9995,9996,2008-07-24 09:06:00,igozoomzoom,It's quite possible that the 2010 Fusion/Milan...
9996,9997,2008-07-24 09:07:00,moocow1,Of course plans don't mean reality. I expect a...
9997,9998,2008-07-24 09:27:00,akirby,"These aren't ""plans"" - the cars hit the factor..."
9998,9999,2008-07-24 09:33:00,thegraduate,"In my head, a nameplate's sales are a nameplat..."


Some comments are composed only of emojis, which leaves them empty once the text has been extracted. We remove these observations.

In [6]:
# Keep only the rows whose comment text is not the empty string.
has_text = df_cleaned['Comment'].ne('')
df_cleaned = df_cleaned.loc[has_text]

In [7]:
# Persist the cleaned comments to disk; the DataFrame index is not written.
df_cleaned.to_csv(path_or_buf="edmunds_extraction.csv", index=False)