In [None]:
# Import libraries
import os
from os.path import basename
import requests
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np
import re
from datetime import datetime
import tzlocal
from IPython.display import Image, HTML
import time
from time import sleep
from random import uniform
import json

In [None]:
# API scraper

# Parameters for API scraper - update as required

# Sharechat credentials. Environment variables, when set, take precedence so the
# notebook can be shared without editing secrets in place; the literals below
# remain as fallbacks so existing runs keep working.
USER_ID = int(os.environ.get("SHARECHAT_USER_ID", 341972726)) # Sharechat user id

PASSCODE = os.environ.get("SHARECHAT_PASSCODE", "dcdb36a4dc99d6a39547") # inspect page > network > bucketFeed or requestType81 > headers > request payload > passcode

# Tag specific params from sharechat.com/tag > inspect ... > request payload
# Each entry maps a tag path to the JSON request body and the endpoint to POST it to.
tag_dict = {
    "trending/Hindi": {
        "tag_body": {
            "bn":"broker3","userId": USER_ID,"passCode": PASSCODE,
                        "client":"web","message":{
                            "r":"web", "f": 0, "p":"f"}},
        "api_url" : "https://restapi1.sharechat.com/requestType81"},
    "topic/whatsapp-hindi-238": {
        "tag_body": {
            "bn":"broker3","userId": USER_ID,"passCode": PASSCODE,
                        "client":"web","message":{
                            "b":238,"allowOffline":True}},
        "api_url": "https://restapi1.sharechat.com/bucketFeed"},
    "topic/news-hindi-125": {
        "tag_body": {
            "bn":"broker3","userId": USER_ID,"passCode": PASSCODE,
                        "client":"web","message":{
                            # Was 238 (copied from the whatsapp entry), which made this tag
                            # fetch the whatsapp bucket. 125 follows the slug suffix, as 238
                            # does above -- NOTE(review): confirm against the request payload.
                            "b":125,"allowOffline":True}},
        "api_url": "https://restapi1.sharechat.com/bucketFeed"}}

# Download destination: the directory the notebook is running from
d = os.getcwd()

# Local timezone, used when formatting post timestamps
local_timezone = tzlocal.get_localzone()

# Tags to scrape (each one is looked up in tag_dict)
t = [
    "topic/news-hindi-125",
    "topic/whatsapp-hindi-238",
    "trending/Hindi",
]

# Number of pages to scrape
n = 10

# Helper functions for API scraper 

# Scrapes data from specified tags
def get_data(tags, pages):
    """Scrape image/video posts for the given tags from the Sharechat API.

    Each tag is requested once per page round, with a random 5-10 second pause
    after every request.

    Args:
        tags: iterable of tag paths; each must be a key of ``tag_dict``.
        pages: number of request rounds to make over all tags.

    Returns:
        pandas.DataFrame with columns ``link``, ``timestamp``, ``lang``,
        ``media_type``, ``tag`` and ``thumbnail``. ``timestamp`` is formatted as
        ``"%d-%m-%Y, %H:%M:%S"`` in ``local_timezone`` and duplicate rows are removed.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within 30 seconds.
    """
    columns = ["link", "timestamp", "lang", "media_type", "tag", "thumbnail"]
    # Request headers are identical for every call, so build them once.
    headers = {"content-type": "application/json",
               "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
    # Collect per-request frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    frames = []
    print("Scraping data from Sharechat ...")
    for _ in range(pages):
        # Scrape data from each tag
        for tag in tags:
            # Get tag specific API access params
            url = tag_dict[tag]["api_url"]
            body = tag_dict[tag]["tag_body"]

            response = requests.post(url, json=body, headers=headers, timeout=30)
            # Fail loudly on an HTTP error instead of on a confusing JSON/KeyError later.
            response.raise_for_status()
            response_dict = response.json()

            link, timestamp, lang, media_type = get_payload_data(response_dict)
            tag_data = pd.DataFrame({"link": link, "timestamp": timestamp,
                                     "lang": lang, "media_type": media_type})
            # Add tag column
            tag_data["tag"] = tag
            # Add thumbnail column
            tag_data["thumbnail"] = tag_data["link"]
            # Skip empty results so concat does not have to reconcile empty frames
            if not tag_data.empty:
                frames.append(tag_data)
            time.sleep(uniform(5, 10)) # random delay after each API request
    if not frames:
        return pd.DataFrame(columns = columns)
    df = pd.concat(frames)
    df["timestamp"] = df["timestamp"].apply(lambda x: datetime.fromtimestamp(int(x), local_timezone).strftime("%d-%m-%Y, %H:%M:%S"))
    return df.drop_duplicates()

# Gets image/video data from scraped payload 
def get_payload_data(payload_dict):
    """Pull image and video entries out of a scraped API payload.

    Iterates over ``payload_dict["payload"]["d"]`` and keeps only items whose
    ``"t"`` value is ``"image"`` or ``"video"``; every other content format is
    skipped.

    Returns:
        Four parallel lists ``(link, timestamp, lang, media_type)`` built from
        the item's ``"g"`` (images) or ``"v"`` (videos), ``"o"``, ``"m"`` and
        ``"t"`` values respectively.
    """
    link, timestamp, lang, media_type = [], [], [], []
    for post in payload_dict["payload"]["d"]:
        kind = post["t"]
        if kind not in ("image", "video"):
            continue  # skip other content formats
        timestamp.append(post["o"])
        lang.append(post["m"])
        media_type.append(kind)
        # Images carry their URL under "g", videos under "v"
        link_field = "g" if kind == "image" else "v"
        link.append(post[link_field])
    return link, timestamp, lang, media_type

#Converts links to thumbnails in html
def convert_links_to_thumbnails(df):
    """Render the image rows of ``df`` as an HTML table with inline thumbnails.

    Only rows whose ``media_type`` is ``"image"`` are kept; the ``thumbnail``
    column is rendered as an ``<img>`` tag 200 pixels wide.

    Args:
        df: DataFrame with at least ``media_type`` and ``thumbnail`` columns.

    Returns:
        IPython.display.HTML wrapping the rendered table.
    """
    from html import escape  # stdlib; imported locally so the block stays self-contained

    def path_to_image_html(path):
        # Escape the scraped URL so quotes/ampersands cannot break out of the
        # attribute, and separate the two attributes with a space.
        return '<img src="' + escape(path, quote=True) + '" width="200" >'
    image_df = df[df["media_type"] == "image"]
    # None means "do not truncate"; the old -1 sentinel is rejected by current pandas.
    # Kept as a global option (not a context manager) to preserve the original side effect.
    pd.set_option('display.max_colwidth', None)
    data_html = HTML(image_df.to_html(escape=False, formatters=dict(thumbnail=path_to_image_html)))
    return data_html

# Saves data in csv and html formats
def save_data(df, html):
    """Write the scraped data to disk as an HTML preview and a CSV file.

    Both files are written to the current working directory:
    ``sharechat_data_preview.html`` receives ``html.data`` and
    ``sharechat_data.csv`` receives ``df`` without its ``thumbnail`` column.

    Args:
        df: DataFrame containing a ``thumbnail`` column. It is not modified.
        html: object exposing the rendered HTML text as ``.data``.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform default encoding.
    with open("sharechat_data_preview.html", "w", encoding="utf-8") as f:
        f.write(html.data)
    # Drop on a copy: mutating the caller's frame made a second call raise KeyError.
    df.drop(columns="thumbnail").to_csv("sharechat_data.csv")
    
# Define API scraper 
def sharechat_scraper(tags, destination, pages):
    """Run the API scraper end to end: scrape, build the preview, save, report.

    Args:
        tags: tag paths to scrape, passed to ``get_data``.
        destination: location printed in the "Data saved to" message.
            NOTE(review): it is only reported here; ``save_data`` writes to
            relative paths, i.e. the current working directory.
        pages: number of pages to scrape per tag, passed to ``get_data``.
    """
    start_time = time.time()
    # Scrape data from each tag (use the `pages` argument, not the global `n`)
    sharechat_df = get_data(tags, pages)
    # Generate html file with image thumbnails
    sharechat_data_html = convert_links_to_thumbnails(sharechat_df)
    # Save data
    save_data(sharechat_df, sharechat_data_html)
    print("{} posts scraped".format(len(sharechat_df)))
    print("Data saved to", destination)
    print("Time taken: %s seconds" % (time.time() - start_time))

In [None]:
# Run API scraper on tags `t`, reporting `d` as the save location, with `n` as the page count
sharechat_scraper(t, d, n)

In [None]:
# Beautiful Soup scraper

# Define scraper arguments
# Tags to scrape
t = [
    "topic/news-hindi-125",
    "topic/whatsapp-hindi-238",
    "trending/Hindi",
]
# Download destination: the directory the notebook is running from
d = os.getcwd()

# Define helper functions for Beautiful Soup scraper

# Scrapes data from specified tags
def get_data(tags):
    """Scrape image links and timestamps from the Sharechat page of each tag.

    Args:
        tags: iterable of tag paths appended to ``https://sharechat.com/``.

    Returns:
        pandas.DataFrame with columns ``img_link``, ``timestamp``, ``tag`` and
        ``thumbnail`` (``thumbnail`` is a copy of ``img_link``). Empty, with
        the same columns, when nothing was scraped.
    """
    columns = ["img_link", "timestamp", "tag", "thumbnail"]
    # Collect per-tag frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    frames = []
    # Scrape data from each tag
    for tag in tags:
        print("Scraping recent images from https://sharechat.com/"+tag+" ...")
        soup = get_parsed_page(tag)
        img_links, timestamps = get_images_with_timestamps(soup)
        # Save tag data as dataframe
        tag_data = pd.DataFrame({"img_link": img_links, "timestamp": timestamps})
        # Add tag column
        tag_data["tag"] = tag
        # Add thumbnail column
        tag_data["thumbnail"] = tag_data["img_link"]
        # Skip empty results so concat does not have to reconcile empty frames
        if not tag_data.empty:
            frames.append(tag_data)
    if not frames:
        return pd.DataFrame(columns = columns)
    return pd.concat(frames)

# Returns parsed web page
def get_parsed_page(tag):
    """Download ``https://sharechat.com/<tag>`` and parse it with Beautiful Soup.

    Args:
        tag: tag path appended to the site root.

    Returns:
        BeautifulSoup document built from the response body with the ``lxml`` parser.

    Raises:
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    # Without a timeout a stalled connection would hang the notebook indefinitely.
    # The status code is deliberately not checked, so an error page still parses
    # (and simply yields no images) as before.
    r = requests.get("https://sharechat.com/"+tag, timeout=30)
    return BeautifulSoup(r.content, "lxml")

# Returns images with timestamps
def get_images_with_timestamps(soup):
    """Collect ``.jpg`` image links from a parsed page, with a timestamp per link.

    The timestamp is taken from the first 13-digit run in the link, read as
    Unix milliseconds and formatted as ``"%d-%m-%Y, %H:%M:%S"`` in
    ``local_timezone`` (the same format the API scraper uses).

    Args:
        soup: parsed page exposing ``find_all``.

    Returns:
        ``(img_links, timestamps)``: two lists of equal length; a timestamp is
        ``None`` when its link contains no 13-digit run.
    """
    # Initialize empty lists to hold data
    img_links = []
    timestamps = []
    # Find images; the dot is escaped so only a literal ".jpg" matches
    images = soup.find_all("img", {"src": re.compile(r"\.jpg")})
    for image in images:
        src = image["src"]
        # Add image link
        img_links.append(src)
        # Find timestamp: use only the first match. Joining every match produced
        # a nonsense number whenever a link held more than one 13-digit run.
        unix_ts = re.search(r"\d{13}", src)
        if unix_ts: # If link contains timestamp
            # Reformat and save timestamp (milliseconds -> seconds)
            local_time = datetime.fromtimestamp(int(unix_ts.group())/1000, local_timezone).strftime("%d-%m-%Y, %H:%M:%S")
            timestamps.append(local_time)
        else:
            timestamps.append(None)
    return img_links, timestamps

# Adds image thumbnails for quick viewing
def convert_links_to_thumbnails(df):
    """Render ``df`` as an HTML table whose ``thumbnail`` column shows inline images.

    Args:
        df: DataFrame with a ``thumbnail`` column holding image URLs.

    Returns:
        IPython.display.HTML wrapping the rendered table.
    """
    from html import escape  # stdlib; imported locally so the block stays self-contained

    def path_to_image_html(path):
        # Escape the scraped URL so quotes/ampersands cannot break out of the
        # attribute, and separate the two attributes with a space.
        return '<img src="' + escape(path, quote=True) + '" width="200" >'
    # Disable column-width truncation for this render only, so long links are
    # not cut off regardless of which pandas display options are currently set.
    with pd.option_context('display.max_colwidth', None):
        table_html = df.to_html(escape=False, formatters=dict(thumbnail=path_to_image_html))
    return HTML(table_html)

# Saves scraped data in csv and html formats
def save_data(df, html):
    """Write the scraped data to disk as an HTML preview and a CSV file.

    Both files are written to the current working directory:
    ``sharechat_data_html.html`` receives ``html.data`` and
    ``sharechat_data.csv`` receives ``df`` without its ``thumbnail`` column.

    Args:
        df: DataFrame containing a ``thumbnail`` column. It is not modified.
        html: object exposing the rendered HTML text as ``.data``.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform default encoding.
    with open("sharechat_data_html.html", "w", encoding="utf-8") as f:
        f.write(html.data)
    # Drop on a copy: mutating the caller's frame made a second call raise KeyError.
    df.drop(columns="thumbnail").to_csv("sharechat_data.csv")
    
# Define Beautiful Soup scraper 
def sharechat_soup_scraper(tags, destination):
    """Run the Beautiful Soup scraper end to end: scrape, build the preview, save, report.

    Args:
        tags: tag paths handed to ``get_data``.
        destination: location printed in the "Data saved to" message.
    """
    # Scrape, render the thumbnail preview, then persist both outputs
    scraped = get_data(tags)
    preview = convert_links_to_thumbnails(scraped)
    save_data(scraped, preview)
    # Report what was done
    print("{} images scraped".format(len(scraped)))
    print("Data saved to", destination)
    

In [None]:
# Run Beautiful Soup scraper on tags `t`, reporting `d` as the save location
sharechat_soup_scraper(t, d)