In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [2]:
def scrape_page(page_number):
    url = f"https://www.trustpilot.com/review/www.fashionnova.com?page={page_number}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    reviews = []

    # Find all review containers
    review_divs = soup.find_all('div', class_='styles_reviewCardInner__EwDq2')

    for review_div in review_divs:
        # Extract reviewer name
        reviewer_name_tag = review_div.find('span', class_='typography_heading-xxs__QKBS8', attrs={"data-consumer-name-typography": "true"})
        reviewer_name = reviewer_name_tag.get_text(strip=True) if reviewer_name_tag else "Name not found"

        # Extract reviewer profile link
        reviewer_profile_link_tag = review_div.find('a', class_='link_internal__7XN06', attrs={"data-consumer-profile-link": "true"})
        reviewer_profile_link = reviewer_profile_link_tag['href'] if reviewer_profile_link_tag else "Profile link not found"

        # Extract reviewer country
        reviewer_country_tag = review_div.find('div', class_='typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_detailsIcon__Fo_ua')
        reviewer_country = reviewer_country_tag.find('span').text if reviewer_country_tag else "Country not found"

        # Extract review count
        review_count_tag = review_div.find('span', class_='typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l', attrs={'data-consumer-reviews-count-typography': 'true'})
        review_count = review_count_tag.text.strip() if review_count_tag else "Review count not found"

        # Extract review date
        review_date_tag = review_div.find('time', attrs={'data-service-review-date-time-ago': 'true'})
        review_date = review_date_tag['datetime'] if review_date_tag else "Review date not found"

        # Extract rating
        rating_tag = review_div.find('div', class_='star-rating_starRating__4rrcf')
        rating = rating_tag.find('img')['alt'] if rating_tag else "Rating not found"

        # Extract review title
        review_title_tag = review_div.find('h2', class_='typography_heading-s__f7029')
        review_title = review_title_tag.text if review_title_tag else "Review title not found"

        # Extract review text
        review_text_tag = review_div.find('p', class_='typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn')
        review_text = review_text_tag.text if review_text_tag else "Review text not found"

        # Extract date of experience
        date_experience_tag = review_div.find('p', class_='typography_body-m__xgxZ_ typography_appearance-default__AAY17', attrs={'data-service-review-date-of-experience-typography': 'true'})
        date_experience = date_experience_tag.get_text(strip=True).split(':', 1)[-1].strip() if date_experience_tag else "Date of experience not found"

        reviews.append({
            "Reviewer Name": reviewer_name,
            "Profile Link": reviewer_profile_link,
            "Country": reviewer_country,
            "Review Count": review_count,
            "Review Date": review_date,
            "Rating": rating,
            "Review Title": review_title,
            "Review Text": review_text,
            "Date of Experience": date_experience
        })

    return reviews




In [6]:
def scrape_pages(start_page, end_page):
    all_reviews = []
    for page_number in range(start_page, end_page + 1):
        reviews = scrape_page(page_number)
        all_reviews.extend(reviews)

        # Delay after every 10 pages
        if page_number % 10 == 0:
            print(f"Scraped {page_number} pages. Waiting for 2 seconds...")
            time.sleep(2)

        # Delay after every 100 pages
        if page_number % 100 == 0:
            print(f"Scraped {page_number} pages. Waiting for 5 minutes...")
            time.sleep(300)

    return all_reviews

In [7]:
start_page = 1
end_page = 1000
frame1_reviews = scrape_pages(start_page, end_page)
df1 = pd.DataFrame(frame1_reviews)

Scraped 10 pages. Waiting for 2 seconds...
Scraped 20 pages. Waiting for 2 seconds...
Scraped 30 pages. Waiting for 2 seconds...
Scraped 40 pages. Waiting for 2 seconds...
Scraped 50 pages. Waiting for 2 seconds...
Scraped 60 pages. Waiting for 2 seconds...
Scraped 70 pages. Waiting for 2 seconds...
Scraped 80 pages. Waiting for 2 seconds...
Scraped 90 pages. Waiting for 2 seconds...
Scraped 100 pages. Waiting for 2 seconds...
Scraped 100 pages. Waiting for 5 minutes...
Scraped 110 pages. Waiting for 2 seconds...
Scraped 120 pages. Waiting for 2 seconds...
Scraped 130 pages. Waiting for 2 seconds...
Scraped 140 pages. Waiting for 2 seconds...
Scraped 150 pages. Waiting for 2 seconds...
Scraped 160 pages. Waiting for 2 seconds...
Scraped 170 pages. Waiting for 2 seconds...
Scraped 180 pages. Waiting for 2 seconds...
Scraped 190 pages. Waiting for 2 seconds...
Scraped 200 pages. Waiting for 2 seconds...
Scraped 200 pages. Waiting for 5 minutes...
Scraped 210 pages. Waiting for 2 seconds.

In [14]:
df1.shape

(20000, 9)

In [9]:
df1.to_csv('frame1.csv', index=False)

In [11]:
start_page = 1001
end_page = 2000
frame2_reviews = scrape_pages(start_page, end_page)
df2 = pd.DataFrame(frame2_reviews)

Scraped 1010 pages. Waiting for 2 seconds...
Scraped 1020 pages. Waiting for 2 seconds...
Scraped 1030 pages. Waiting for 2 seconds...
Scraped 1040 pages. Waiting for 2 seconds...
Scraped 1050 pages. Waiting for 2 seconds...
Scraped 1060 pages. Waiting for 2 seconds...
Scraped 1070 pages. Waiting for 2 seconds...
Scraped 1080 pages. Waiting for 2 seconds...
Scraped 1090 pages. Waiting for 2 seconds...
Scraped 1100 pages. Waiting for 2 seconds...
Scraped 1100 pages. Waiting for 5 minutes...
Scraped 1110 pages. Waiting for 2 seconds...
Scraped 1120 pages. Waiting for 2 seconds...
Scraped 1130 pages. Waiting for 2 seconds...
Scraped 1140 pages. Waiting for 2 seconds...
Scraped 1150 pages. Waiting for 2 seconds...
Scraped 1160 pages. Waiting for 2 seconds...
Scraped 1170 pages. Waiting for 2 seconds...
Scraped 1180 pages. Waiting for 2 seconds...
Scraped 1190 pages. Waiting for 2 seconds...
Scraped 1200 pages. Waiting for 2 seconds...
Scraped 1200 pages. Waiting for 5 minutes...
Scraped 12

In [12]:
df2.shape

(20000, 9)

In [15]:
start_page = 2001
end_page = 3000
frame3_reviews = scrape_pages(start_page, end_page)
df3 = pd.DataFrame(frame3_reviews)

Scraped 2010 pages. Waiting for 2 seconds...
Scraped 2020 pages. Waiting for 2 seconds...
Scraped 2030 pages. Waiting for 2 seconds...
Scraped 2040 pages. Waiting for 2 seconds...
Scraped 2050 pages. Waiting for 2 seconds...
Scraped 2060 pages. Waiting for 2 seconds...
Scraped 2070 pages. Waiting for 2 seconds...
Scraped 2080 pages. Waiting for 2 seconds...
Scraped 2090 pages. Waiting for 2 seconds...
Scraped 2100 pages. Waiting for 2 seconds...
Scraped 2100 pages. Waiting for 5 minutes...
Scraped 2110 pages. Waiting for 2 seconds...
Scraped 2120 pages. Waiting for 2 seconds...
Scraped 2130 pages. Waiting for 2 seconds...
Scraped 2140 pages. Waiting for 2 seconds...
Scraped 2150 pages. Waiting for 2 seconds...
Scraped 2160 pages. Waiting for 2 seconds...
Scraped 2170 pages. Waiting for 2 seconds...
Scraped 2180 pages. Waiting for 2 seconds...
Scraped 2190 pages. Waiting for 2 seconds...
Scraped 2200 pages. Waiting for 2 seconds...
Scraped 2200 pages. Waiting for 5 minutes...
Scraped 22

In [16]:
df3.shape

(20000, 9)

In [18]:
cdf1= pd.concat([df1, df2, df3], ignore_index=True)

In [20]:
cdf1.shape

(60000, 9)

In [21]:
cdf1.to_csv('reviews_1.csv', index=False)

In [22]:
start_page = 3001
end_page = 4000
frame4_reviews = scrape_pages(start_page, end_page)
df4 = pd.DataFrame(frame4_reviews)

Scraped 3010 pages. Waiting for 2 seconds...
Scraped 3020 pages. Waiting for 2 seconds...
Scraped 3030 pages. Waiting for 2 seconds...
Scraped 3040 pages. Waiting for 2 seconds...
Scraped 3050 pages. Waiting for 2 seconds...
Scraped 3060 pages. Waiting for 2 seconds...
Scraped 3070 pages. Waiting for 2 seconds...
Scraped 3080 pages. Waiting for 2 seconds...
Scraped 3090 pages. Waiting for 2 seconds...
Scraped 3100 pages. Waiting for 2 seconds...
Scraped 3100 pages. Waiting for 5 minutes...
Scraped 3110 pages. Waiting for 2 seconds...
Scraped 3120 pages. Waiting for 2 seconds...
Scraped 3130 pages. Waiting for 2 seconds...
Scraped 3140 pages. Waiting for 2 seconds...
Scraped 3150 pages. Waiting for 2 seconds...
Scraped 3160 pages. Waiting for 2 seconds...
Scraped 3170 pages. Waiting for 2 seconds...
Scraped 3180 pages. Waiting for 2 seconds...
Scraped 3190 pages. Waiting for 2 seconds...
Scraped 3200 pages. Waiting for 2 seconds...
Scraped 3200 pages. Waiting for 5 minutes...
Scraped 32

In [24]:
df4.shape

(19980, 9)

In [25]:
cdf2= pd.concat([df1, df2, df3,df4], ignore_index=True)

In [26]:
cdf2.shape

(79980, 9)

In [27]:
cdf2.to_csv('reviews_2.csv', index=False)

In [28]:
cdf2.shape

(79980, 9)

In [29]:
start_page = 4001
end_page = 5000
frame5_reviews = scrape_pages(start_page, end_page)
df5 = pd.DataFrame(frame5_reviews)

Scraped 4010 pages. Waiting for 2 seconds...
Scraped 4020 pages. Waiting for 2 seconds...
Scraped 4030 pages. Waiting for 2 seconds...
Scraped 4040 pages. Waiting for 2 seconds...
Scraped 4050 pages. Waiting for 2 seconds...
Scraped 4060 pages. Waiting for 2 seconds...
Scraped 4070 pages. Waiting for 2 seconds...
Scraped 4080 pages. Waiting for 2 seconds...
Scraped 4090 pages. Waiting for 2 seconds...
Scraped 4100 pages. Waiting for 2 seconds...
Scraped 4100 pages. Waiting for 5 minutes...
Scraped 4110 pages. Waiting for 2 seconds...
Scraped 4120 pages. Waiting for 2 seconds...
Scraped 4130 pages. Waiting for 2 seconds...
Scraped 4140 pages. Waiting for 2 seconds...
Scraped 4150 pages. Waiting for 2 seconds...
Scraped 4160 pages. Waiting for 2 seconds...
Scraped 4170 pages. Waiting for 2 seconds...
Scraped 4180 pages. Waiting for 2 seconds...
Scraped 4190 pages. Waiting for 2 seconds...
Scraped 4200 pages. Waiting for 2 seconds...
Scraped 4200 pages. Waiting for 5 minutes...
Scraped 42

In [30]:
df5.shape

(20000, 9)

In [31]:
start_page = 5001
end_page = 6000
frame6_reviews = scrape_pages(start_page, end_page)
df6 = pd.DataFrame(frame6_reviews)

Scraped 5010 pages. Waiting for 2 seconds...
Scraped 5020 pages. Waiting for 2 seconds...
Scraped 5030 pages. Waiting for 2 seconds...
Scraped 5040 pages. Waiting for 2 seconds...
Scraped 5050 pages. Waiting for 2 seconds...
Scraped 5060 pages. Waiting for 2 seconds...
Scraped 5070 pages. Waiting for 2 seconds...
Scraped 5080 pages. Waiting for 2 seconds...
Scraped 5090 pages. Waiting for 2 seconds...
Scraped 5100 pages. Waiting for 2 seconds...
Scraped 5100 pages. Waiting for 5 minutes...
Scraped 5110 pages. Waiting for 2 seconds...
Scraped 5120 pages. Waiting for 2 seconds...
Scraped 5130 pages. Waiting for 2 seconds...
Scraped 5140 pages. Waiting for 2 seconds...
Scraped 5150 pages. Waiting for 2 seconds...
Scraped 5160 pages. Waiting for 2 seconds...
Scraped 5170 pages. Waiting for 2 seconds...
Scraped 5180 pages. Waiting for 2 seconds...
Scraped 5190 pages. Waiting for 2 seconds...
Scraped 5200 pages. Waiting for 2 seconds...
Scraped 5200 pages. Waiting for 5 minutes...
Scraped 52

In [33]:
start_page = 6001
end_page = 6100
frame7_reviews = scrape_pages(start_page, end_page)
df7 = pd.DataFrame(frame7_reviews)

Scraped 6010 pages. Waiting for 2 seconds...
Scraped 6020 pages. Waiting for 2 seconds...
Scraped 6030 pages. Waiting for 2 seconds...
Scraped 6040 pages. Waiting for 2 seconds...
Scraped 6050 pages. Waiting for 2 seconds...
Scraped 6060 pages. Waiting for 2 seconds...
Scraped 6070 pages. Waiting for 2 seconds...
Scraped 6080 pages. Waiting for 2 seconds...
Scraped 6090 pages. Waiting for 2 seconds...
Scraped 6100 pages. Waiting for 2 seconds...
Scraped 6100 pages. Waiting for 5 minutes...


In [34]:
start_page = 6101
end_page = 6600
frame8_reviews = scrape_pages(start_page, end_page)
df8 = pd.DataFrame(frame8_reviews)

Scraped 6110 pages. Waiting for 2 seconds...
Scraped 6120 pages. Waiting for 2 seconds...
Scraped 6130 pages. Waiting for 2 seconds...
Scraped 6140 pages. Waiting for 2 seconds...
Scraped 6150 pages. Waiting for 2 seconds...
Scraped 6160 pages. Waiting for 2 seconds...
Scraped 6170 pages. Waiting for 2 seconds...
Scraped 6180 pages. Waiting for 2 seconds...
Scraped 6190 pages. Waiting for 2 seconds...
Scraped 6200 pages. Waiting for 2 seconds...
Scraped 6200 pages. Waiting for 5 minutes...
Scraped 6210 pages. Waiting for 2 seconds...
Scraped 6220 pages. Waiting for 2 seconds...
Scraped 6230 pages. Waiting for 2 seconds...
Scraped 6240 pages. Waiting for 2 seconds...
Scraped 6250 pages. Waiting for 2 seconds...
Scraped 6260 pages. Waiting for 2 seconds...
Scraped 6270 pages. Waiting for 2 seconds...
Scraped 6280 pages. Waiting for 2 seconds...
Scraped 6290 pages. Waiting for 2 seconds...
Scraped 6300 pages. Waiting for 2 seconds...
Scraped 6300 pages. Waiting for 5 minutes...
Scraped 63

In [35]:
df=pd.concat([df1, df2, df3, df4, df5, df6,df7,df8], ignore_index=True)
df.to_csv('fashionnova_reviews.csv', index=False)

In [36]:
df.shape

(131980, 9)

In [37]:
df.sample(10)

Unnamed: 0,Reviewer Name,Profile Link,Country,Review Count,Review Date,Rating,Review Title,Review Text,Date of Experience
23772,Nina Sanchez,/users/63e3c7fcf9dca00015f9f00a,US,1 review,2023-02-08T18:04:14.000Z,Rated 5 out of 5 stars,The website has great deals,Review text not found,"February 08, 2023"
99847,Teyana Hudson,/users/614a151187c9cb001243f2cd,US,1 review,2021-09-21T17:23:31.000Z,Rated 3 out of 5 stars,You guys need a better cancellation…,You guys need a better cancellation service.,"September 21, 2021"
22028,customer,/users/60bf7b67f78f7400197ca739,US,3 reviews,2023-02-28T20:06:58.000Z,Rated 5 out of 5 stars,Easy site to navigate,Review text not found,"February 28, 2023"
20917,Savoy Jenkins,/users/62fbc6437fcced0012d098ab,US,2 reviews,2023-03-14T09:03:30.000Z,Rated 5 out of 5 stars,I like fashion nova,I like fashion nova. Easy to look thru to find...,"March 14, 2023"
118420,Julian Bono,/users/60fed8eed22b22001257a37a,US,1 review,2021-07-26T15:46:54.000Z,Rated 5 out of 5 stars,Great,Great order!!!,"July 26, 2021"
67629,Priscilla Davis,/users/618ad332175ab400159163d6,US,1 review,2021-11-09T19:59:47.000Z,Rated 1 out of 5 stars,Hard time checking out,Review text not found,"November 09, 2021"
118341,Jizzel tingling,/users/60ff0287f54aad0012de388e,JM,1 review,2021-07-26T18:44:23.000Z,Rated 5 out of 5 stars,I love fn,I love me some nova,"July 26, 2021"
112750,Lashonda Patterson,/users/610c02b4f5beef0012e653e1,US,1 review,2021-08-05T15:24:36.000Z,Rated 5 out of 5 stars,Clothes,Great clothing,"August 05, 2021"
93358,Matiya Tate,/users/58d16b650000ff000a840412,US,4 reviews,2021-10-02T13:49:38.000Z,Rated 5 out of 5 stars,Where me and my bf get all our clothes…,Where me and my bf get all our clothes for goi...,"October 02, 2021"
67067,Jessica Dabrasky,/users/6198fc92ace1180012d8f7a7,US,2 reviews,2021-11-20T13:48:03.000Z,Rated 5 out of 5 stars,Easy and greet,Review text not found,"November 20, 2021"


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131980 entries, 0 to 131979
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Reviewer Name       131980 non-null  object
 1   Profile Link        131980 non-null  object
 2   Country             131980 non-null  object
 3   Review Count        131980 non-null  object
 4   Review Date         131980 non-null  object
 5   Rating              131980 non-null  object
 6   Review Title        131980 non-null  object
 7   Review Text         131980 non-null  object
 8   Date of Experience  131980 non-null  object
dtypes: object(9)
memory usage: 9.1+ MB


In [39]:
df.isna().sum()

Reviewer Name         0
Profile Link          0
Country               0
Review Count          0
Review Date           0
Rating                0
Review Title          0
Review Text           0
Date of Experience    0
dtype: int64

In [44]:
df[df.duplicated()]

Unnamed: 0,Reviewer Name,Profile Link,Country,Review Count,Review Date,Rating,Review Title,Review Text,Date of Experience
11140,Lashelle Foust,/users/64b81c2e4b86b400126ce1a8,US,1 review,2023-07-19T19:23:59.000Z,Rated 5 out of 5 stars,Easy to Order,It was easy! I found exactly what I was lookin...,"July 19, 2023"
12340,Cherry,/users/5f24ff83ed77f7732dcc5b58,US,1 review,2023-07-02T01:08:23.000Z,Rated 5 out of 5 stars,Love it found everything I need,Review text not found,"January 07, 2023"
34660,Antoinette Moore,/users/6385848b6b646e0012a74a31,US,1 review,2022-11-29T06:03:24.000Z,Rated 3 out of 5 stars,The site is very slow on a mobile…,The site is very slow on a mobile device. But ...,"November 28, 2022"
34800,Julie,/users/59b00ad30000ff000ac4be34,US,2 reviews,2022-11-29T04:52:07.000Z,Rated 5 out of 5 stars,Love clothes,Love clothes. Exactly what I was looking for,"November 28, 2022"
34960,Demorn Daniels,/users/6385526b6b646e0012a738a6,US,1 review,2022-11-29T02:29:33.000Z,Rated 5 out of 5 stars,Great prices and discounts,Review text not found,"November 28, 2022"
...,...,...,...,...,...,...,...,...,...
91004,Cat,/users/615daee01c8cd100128e414d,US,1 review,2021-10-06T14:12:49.000Z,Rated 1 out of 5 stars,I had to reorder this order because I…,I had to reorder this order because I could no...,"October 06, 2021"
91005,TONYA HATCH,/users/5bedb33ee35b961c918cb243,US,2 reviews,2021-10-06T14:11:30.000Z,Rated 5 out of 5 stars,First time shopping,Review text not found,"October 06, 2021"
91006,Kaitlyn Robinson,/users/615dae0cb5371d001231b084,US,1 review,2021-10-06T14:09:17.000Z,Rated 5 out of 5 stars,Review,I love it it fits great and I love the color o...,"October 06, 2021"
91460,April Marion,/users/57a152db0000ff000a3745e7,US,2 reviews,2021-10-05T18:01:33.000Z,Rated 5 out of 5 stars,This my second time ordering from this…,This my second time ordering from this site I ...,"October 05, 2021"
