# 1. For this customer sentiment analysis project, I will build my own dataset by scraping product reviews from the Flipkart website.

# 2. This dataset will consist of the following columns

### 1. product id
### 2. product name
### 3. brand name
### 4. product category
### 5. price
### 6. qnt sold
### 7. product url
### 8. customer name
### 9. purchase date
### 10. customers city
### 11. coordinates of customers city
### 12. address of customer
### 13. rating
### 14. comment head
### 15. comment text

# 3. Importing libraries

In [1]:
import math
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib.request
import os
import numpy as np
import random

# 4. Create required Directory if not exist

In [2]:
# Create every output folder used by the notebook. exist_ok=True makes the call
# a no-op when the folder is already there, so no separate (race-prone)
# "does it exist?" check is needed before creating it.
for folder in (
    r"D:\flipkart reviews\images",
    r"D:\flipkart reviews\reviews",
    r"D:\flipkart reviews\product_urls",
    r"D:\flipkart reviews\all csv combine",
):
    os.makedirs(folder, exist_ok=True)

# 4. The create_csv() function creates an empty CSV file for each product scraped. The CSV is created in "D:\flipkart reviews\reviews"

In [3]:
def create_csv(file_name):
    r"""Write a header-only CSV for one product and return its path.

    The file is written to ``D:\flipkart reviews\reviews\<file_name>.csv``;
    an existing file with the same name is overwritten.
    """
    header = (
        'prod_id',
        'product_name',
        'brand_name',
        'category',
        'price',
        'sold',
        'prod_url',
        'customer_name',
        'purchase_date',
        'customers_city',
        'rating',
        'comment_head',
        'comment',
    )
    location = r'D:\flipkart reviews\reviews\{file}.csv'.format(file=file_name)
    # A frame with columns but no rows serialises to just the header line.
    empty_frame = pd.DataFrame(columns=list(header))
    empty_frame.to_csv(location, mode="w+", index=False)
    return location

# 5. The scrape_urls() function scrapes all product URLs from the given range of search pages and saves them into a CSV file named after the searched product. Initially the scraped status is 'not scraped' for each URL.

In [4]:
def scrape_urls(base_url , search):
    r"""Collect the product links for one search term and store them in a CSV.

    Walks search-result pages 1..``last_page`` (``last_page`` is read from
    module scope), gathers the product links found on each page and writes the
    de-duplicated list to ``D:\flipkart reviews\product_urls\<search>.csv``
    with every row marked ``'not scraped'``. If that CSV already exists, no
    request is made and the file is left untouched.

    Args:
        base_url: Search URL prefix that the search term is appended to.
        search: Search term; also used as the name of the CSV file.
    """
    url = base_url + search
    url_location = r'D:\flipkart reviews\product_urls\{file}.csv'.format(file = search)

    if os.path.exists(url_location):
        print(f"All urls are scraped for product: {search}")
        return

    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
    list_of_url = []
    for prod_page in range(1, last_page+1):
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url=f"{url}&page={prod_page}", headers=headers, timeout=30)
        try:
            # Parse the HTML of the page
            soup = BeautifulSoup(response.content, 'html.parser')
        finally:
            # Close every response, not only the last one of the loop.
            response.close()
        products = soup.find_all('div', {'class': "_1AtVbE col-12-12"})
        # NOTE(review): the 2..24 slice presumably skips non-product boxes at
        # the top and bottom of the result list - confirm against the page.
        for product in products[2:24]:
            try:
                href = product.div.div.a['href']
            except (AttributeError, TypeError, KeyError):
                # Box without the expected nested link: skip it instead of
                # losing every URL collected so far.
                continue
            list_of_url.append("https://www.flipkart.com" + href)
        print(f"{search} urls are scraped for page no {prod_page}")

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # CSV content is reproducible (a set gives an arbitrary order).
    all_urls = list(dict.fromkeys(list_of_url))
    df_urls = pd.DataFrame({
        "prod_links": all_urls,
        "scraped_status": ["not scraped"] * len(all_urls),
    })
    df_urls.to_csv(url_location , mode='w+', index=False)
    print(f"All urls are scraped for product: {search}")

# 6. remove_char() function will remove unwanted characters that are not supported by windows file name system.

In [5]:
def remove_char(string):
    r"""Replace characters Windows forbids in file names and tidy the spacing.

    Each of ``\ / : * ? " < > |`` is replaced by a space, every run of spaces
    is then collapsed to a single space, and leading/trailing whitespace is
    stripped.

    Args:
        string: Text to clean (for example a product name).

    Returns:
        The cleaned text.
    """
    unwanted_char = {'\\', '/', ':', '*', '?', '"', '<', '>', '|'}
    new_string = ''.join(' ' if char in unwanted_char else char for char in string)
    # One replace pass only halves a run of spaces, so repeat until no doubled
    # space is left (e.g. 'a / b' -> 'a   b' -> 'a b').
    while '  ' in new_string:
        new_string = new_string.replace('  ', ' ')
    return new_string.strip()

# 7. "get_reviews()" function is the main function that scrape product reviews.

### 7.1. It takes two arguments: the product category you want to scrape, and the location of the product URL CSV file (created by the scrape_urls() function).
### 7.2. It will scrape all the images and products information of searched category for eg. refrigerator.
### 7.3.  All this information will be saved in csv file in location D:\flipkart reviews\reviews.
### 7.4. After one product is scraped successfully, its scraped status is changed from 'not scraped' to 'scraped' in the URL CSV file, so I can keep adding new product data without repeating already scraped products.

In [6]:
def _fallback_file_name(url):
    """Build a per-product CSV name from the URL when the product title is unavailable."""
    url_path = url.split('?', 1)[0].replace('https://www.flipkart.com/', '')
    # Truncated to keep folder + file name inside Windows path-length limits.
    # NOTE(review): presumably the URL path is unique per product - confirm.
    return remove_char(url_path)[:150].strip() or 'unknown product'


def _download_product_image(soup, prod_file_name):
    """Save the product image found on the page; a failure is reported, not raised."""
    image_location = r'D:\flipkart reviews\images\{image}.jpg'.format(image = prod_file_name)
    try:
        image_src = soup.find("img", {'class': "_396cs4"})['src']
        urllib.request.urlretrieve(image_src, image_location)
    except Exception as e:
        # A missing picture must not throw away the product's name and price.
        print(f"image not saved for {prod_file_name}: {e!r}")


def get_reviews(search,url_csv_loc):
    r"""Scrape the reviews of every product URL still marked 'not scraped'.

    For each pending URL the product's review page is read for name, brand,
    price and review count. Then up to 500 review pages are fetched (the page
    count assumes 10 reviews per page) and their rows appended to the
    product's CSV, which is created through ``create_csv()``. Once a product
    is done, its row in the URL CSV is set to 'scraped' and the file is saved,
    so an interrupted run can be resumed.

    Scraping is best effort and failures are printed rather than raised:

    * If the product details cannot be read they are stored as NaN, only the
      first review page is tried and the CSV is named after the URL.
    * If a review page cannot be parsed, NaN rows are written for the review
      boxes found on that page; nothing is written when the page could not be
      fetched at all.

    Args:
        search: Category label. It is written to the 'category' column and its
            first five characters prefix the generated product id.
        url_csv_loc: Path of the URL CSV with the columns 'prod_links' and
            'scraped_status'.
    """
    # Here the user agent is for Edge browser on windows 10.
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
    # Seconds; without a timeout one stalled connection blocks the run forever.
    timeout = 30
    # Same order as the header row written by create_csv(); rows are appended
    # without a header, so the order must match.
    columns = [
        "prod_id", "product_name", "brand_name", "category", "price", "sold",
        "prod_url", "customer_name", "purchase_date", "customers_city",
        "rating", "comment_head", "comment",
    ]

    url_df = pd.read_csv(url_csv_loc)
    total = len(url_df)
    url_list = url_df.loc[url_df['scraped_status'] == 'not scraped', 'prod_links'].tolist()

    for url in url_list:
        # Going to product Reviews section
        reviews_url = url.replace('/p/', '/product-reviews/') + '&sortOrder=MOST_RECENT'
        try:
            response = requests.get(url=reviews_url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Scraping name, brand name, price and qnt sold before entering into customers comments
            prodid = search[:5].strip().replace(" ", "") + str(random.randint(1000, 9999))
            prod_name = soup.find('div', {'class': "_2s4DIt _1CDdy2"}).text.replace("Reviews", "").strip()
            prod_file_name = remove_char(prod_name)
            brnd_name = prod_name.split()[0]
            # [1:] drops the leading currency symbol of the price text.
            prod_price = int(soup.find('div', {'class': "_30jeq3"}).text.replace(',', '')[1:])
            total_reviews = int(soup.find('span', {'class': "_2_R_DZ"}).text.split()[3].replace(',', ''))
            total_pages_of_reviews = min(math.ceil(total_reviews / 10), 500)
        except Exception as e:
            print(f"could not read product details for {url}: {e!r}")
            prodid = np.nan
            prod_name = np.nan
            brnd_name = np.nan
            prod_price = np.nan
            total_pages_of_reviews = 1
            # A distinct name per product: a shared 'nan.csv' would be
            # overwritten by every product whose details failed to parse.
            prod_file_name = _fallback_file_name(url)
        else:
            _download_product_image(soup, prod_file_name)

        # calling create_csv function
        location = create_csv(prod_file_name)

        # Itering through each page of review
        for page in range(1, total_pages_of_reviews + 1):
            # sleeping in-between each 300 pages to go easy on flipkart
            if page % 300 == 0:
                print("waiting for 1 minute after every 300 requests, to avoid error from flipkart")
                time.sleep(60)

            # Reset per page so a failed request can never reuse the previous
            # page's review boxes (or hit an unbound name on the first page).
            reviews = []
            try:
                response = requests.get(url=f"{reviews_url}&page={page}", headers=headers, timeout=timeout)
                soup = BeautifulSoup(response.content, 'html.parser')

                # Scraping main boxes of reviews
                reviews = soup.find_all('div', {'class': "_27M-vq"})
                count = len(reviews)

                # Scraping required data
                data = {
                    "prod_id": [prodid] * count,
                    "product_name": [prod_name] * count,
                    "brand_name": [brnd_name] * count,
                    "category": [search] * count,
                    "price": [prod_price] * count,
                    "sold": [1] * count,
                    "prod_url": [url] * count,
                    "customer_name": [review.div.div.find('p', {'class': '_2sc7ZR _2V5EHH'}).text for review in reviews],
                    "purchase_date": [review.div.div.find_all('p', {'class': '_2sc7ZR'})[1].text for review in reviews],
                    "customers_city": [review.div.div.find('p', {'class': '_2mcZGG'}).text.replace("Certified Buyer, ", "") for review in reviews],
                    "rating": [review.div.div.div.div.text for review in reviews],
                    "comment_head": [review.div.div.div.p.text for review in reviews],
                    "comment": [review.div.div.find('div', {'class': ''}).div.text for review in reviews],
                }
            except Exception as e:
                print(f"could not parse review page {page} of {url}: {e!r}")
                data = {column: [np.nan] * len(reviews) for column in columns}

            # After scraping through the all pages i found that there are some empty pages with no reviews thats why,
            if not reviews:
                continue

            pd.DataFrame(data).to_csv(location , mode='a', index=False, header=False)

        # .loc writes into the frame itself; chained indexing
        # (df[col][mask] = ...) may assign to a temporary copy instead.
        url_df.loc[url_df['prod_links'] == url, 'scraped_status'] = 'scraped'
        url_df.to_csv(url_csv_loc, index=False)

    print(f"product category: {search} | products scraped: {total} | status: successful")

# 8. Defining Base Flipkart URL

In [7]:
# Flipkart search URL prefix; scrape_urls() appends the search term to it.
base_url = "https://www.flipkart.com/search?q="

# 9. How many pages of product search do you want to scrape? Enter here.

In [8]:
# Number of search-result pages walked per search term; scrape_urls() reads
# this module-level name directly.
last_page = 20

# 10. Calling function scrape_urls() and passing our search categories.

In [36]:
# Search terms to collect product links for; each term is also the name of its URL CSV.
product_search = ["refrigerator", "washing machine", "air conditioner", "water purifier", "television", "laptop", "smartphone"]

In [37]:
# Collect the product links of every category; scrape_urls() skips a category
# whose URL CSV already exists.
for search in product_search:
    scrape_urls(base_url , search)

All urls are scraped for product: refrigerator
All urls are scraped for product: washing machine
All urls are scraped for product: air conditioner
All urls are scraped for product: water purifier
All urls are scraped for product: television
All urls are scraped for product: laptop
All urls are scraped for product: smartphone


# 11. We got all the products urls with respect to their category and they are stored in csv files at location "D:\flipkart reviews\product_urls"

In [38]:
# List the per-category URL files in the product_urls folder.
path = r"D:\flipkart reviews\product_urls"
files = os.listdir(path)
files

['air conditioner.csv',
 'laptop.csv',
 'refrigerator.csv',
 'smartphone.csv',
 'television.csv',
 'washing machine.csv',
 'water purifier.csv']

# 12. Passing URLs into the get_reviews() function. This function only scrapes those URLs where "scraped_status" = "not scraped". After a product is scraped, its status is changed to "scraped".

### 12.1. Scraping product data from urls in "air conditioner.csv"

In [10]:
url_csv_loc = r"D:\flipkart reviews\product_urls\air conditioner.csv"
# The category name, not the file name: get_reviews() writes this value into
# the 'category' column and uses its first characters as the prod_id prefix.
search = "air conditioner"

In [11]:
# Scrape reviews for every URL in the CSV that is still marked 'not scraped'.
get_reviews(search , url_csv_loc)

product category: air conditioner.csv | products scraped: 440 | status: successful


### 12.2. Scraping product data from urls in "laptop.csv"

In [12]:
# Inputs for get_reviews(): the laptop URL list and the category label stored with each review.
url_csv_loc = r"D:\flipkart reviews\product_urls\laptop.csv"
search = "laptop"

In [13]:
# Scrape reviews for every URL in the CSV that is still marked 'not scraped'.
get_reviews(search , url_csv_loc)

product category: laptop | products scraped: 440 | status: successful


### 12.3. Scraping product data from urls in "refrigerator.csv"

In [14]:
# Inputs for get_reviews(): the refrigerator URL list and the category label stored with each review.
url_csv_loc = r"D:\flipkart reviews\product_urls\refrigerator.csv"
search = "refrigerator"

In [15]:
# Scrape reviews for every URL in the CSV that is still marked 'not scraped'.
get_reviews(search , url_csv_loc)

product category: refrigerator | products scraped: 440 | status: successful


### 12.4 Scraping product data from urls in "television.csv"

In [9]:
# Inputs for get_reviews(): the television URL list and the category label stored with each review.
url_csv_loc = r"D:\flipkart reviews\product_urls\television.csv"
search = "television"

In [10]:
# Scrape reviews for every URL in the CSV that is still marked 'not scraped'.
get_reviews(search , url_csv_loc)

product category: television | products scraped: 382 | status: successful


### 12.5. Scraping product data from urls in ''smartphone.csv''

In [24]:
# Inputs for get_reviews(): the smartphone URL list and the category label stored with each review.
url_csv_loc = r"D:\flipkart reviews\product_urls\smartphone.csv"
search = "smartphone"

In [25]:
# Scrape reviews for every URL in the CSV that is still marked 'not scraped'.
get_reviews(search , url_csv_loc)

product category: smartphone | products scraped: 418 | status: successful


### 12.6 Scraping product data from urls in 'water purifier.csv'

In [19]:
# Inputs for get_reviews(): the water purifier URL list and the category label stored with each review.
url_csv_loc = r"D:\flipkart reviews\product_urls\water purifier.csv"
search = "water purifier"

In [20]:
# Scrape reviews for every URL in the CSV that is still marked 'not scraped'.
get_reviews(search , url_csv_loc)

product category: water purifier | products scraped: 440 | status: successful


### 12.7 Scraping product data from urls in 'washing machine.csv'

In [9]:
# Inputs for get_reviews(): the washing machine URL list and the category label stored with each review.
url_csv_loc = r"D:\flipkart reviews\product_urls\washing machine.csv"
search = "washing machine"

In [None]:
# Scrape reviews for every URL in the CSV that is still marked 'not scraped'.
get_reviews(search , url_csv_loc)

# 13. After repeating code for each category like "Refrigerator", "Air conditioner", "laptop"... Lets See What files we got!

In [10]:
# Folder that create_csv() writes the per-product review CSVs into.
path = r"D:\flipkart reviews\reviews"

In [11]:
# List the reviews folder and peek at the first five entries.
files = os.listdir(path)
files[0:5]

['acer Aspire 3 Core i5 11th Gen - (8 GB 512 GB SSD Windows 11 Home) A315-58 Thin and Light Laptop.csv',
 'acer Aspire 3 Dual Core 3020e - (4 GB 256 GB SSD Windows 11 Home) A314-22 Laptop.csv',
 'acer Aspire 3 Pentium Silver - (4 GB 256 GB SSD Windows 11 Home) A314-35 Notebook.csv',
 'acer Aspire 3 Pentium Silver - (8 GB 256 GB SSD Windows 11 Home) A314-35 Notebook.csv',
 'acer Aspire 5 Core i3 12th Gen - (8 GB 512 GB SSD Windows 11 Home) A515-57 Thin and Light Laptop.csv']

In [12]:
# Number of entries found in the reviews folder.
len(files)

1737

# 14. Lets concat all csv into one dataframe

In [None]:
# Read every per-product CSV first and concatenate once at the end: calling
# pd.concat inside the loop copies all rows gathered so far on every iteration.
frames = []
for file in files:
    current_df = pd.read_csv(os.path.join(path, file))
    frames.append(current_df)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

In [25]:
# Preview the first 20 rows of the combined frame.
df.head(20)

Unnamed: 0,prod_id,product_name,brand_name,category,price,sold,prod_url,customer_name,purchase_date,customers_city,rating,comment_head,comment
0,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,4 days ago,Karimpur,5,Great product,Very good product 🙂🎈🎈🎈🎈🎈
1,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,4 days ago,Ranchi,4,Really Nice,nice
2,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Dhiraj Jaiswal,5 days ago,Sidhi,3,Does the job,Good
3,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Vasamsetti Durgayya,11 days ago,Hyderabad,5,Best in the market!,Good product at this price.i am very happy.per...
4,lapto6288,acer Aspire 3 Core i5 11th Gen - (8 GB/512 GB ...,acer,laptop,44999,1,https://www.flipkart.com/acer-aspire-3-core-i5...,Flipkart Customer,14 days ago,Bengaluru,1,Did not meet expectations,Worst laptop don't buy this laptop


In [26]:
# (rows, columns) of the combined frame.
df.shape

(939543, 13)

# 15. we got 0.9 million reviews in total for different categories

In [27]:
# Column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 939543 entries, 0 to 9
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   prod_id         918543 non-null  object
 1   product_name    918543 non-null  object
 2   brand_name      918543 non-null  object
 3   category        937632 non-null  object
 4   price           918543 non-null  object
 5   sold            937632 non-null  object
 6   prod_url        937632 non-null  object
 7   customer_name   936866 non-null  object
 8   purchase_date   937632 non-null  object
 9   customers_city  937632 non-null  object
 10  rating          937632 non-null  object
 11  comment_head    937632 non-null  object
 12  comment         937379 non-null  object
dtypes: object(13)
memory usage: 100.4+ MB


# 16. Lets see how many reviews we got for each category

In [28]:
# Number of review rows per category.
df['category'].value_counts()

smartphone         514976
washing machine    106639
water purifier     104974
television          80576
refrigerator        63699
air conditioner     42466
laptop              24302
Name: category, dtype: int64

# 17. saving data into csv

In [42]:
# Write the combined reviews to a single CSV, leaving out the index column.
df.to_csv(r"D:\flipkart reviews\all csv combine\raw_reviews.csv" , mode="w+", index=False)