In [5]:
import json
import re

import requests
from bs4 import BeautifulSoup

# Function to extract hashtags from the <meta property="og:description"> tag
# Matches one hashtag body; \w is Unicode-aware, so non-Latin tags are kept.
_HASHTAG_RE = re.compile(r"#(\w+)")


def _hashtags_in_text(text):
    """Return the hashtag names (without '#') found in *text*.

    Only whitespace-separated tokens that start with '#' are considered, so a
    '#' in the middle of a word (e.g. a URL fragment) is ignored. Run-together
    tags such as "#a#b" yield both names, and trailing punctuation or a lone
    '#' never produces a malformed or empty entry.
    """
    tags = []
    for token in text.split():
        if token.startswith("#"):
            tags.extend(_HASHTAG_RE.findall(token))
    return tags


def extract_hashtags(url, timeout=10):
    """Fetch *url* and return hashtags from its og:description meta tag.

    Progress and errors are reported with print(). The function is
    deliberately best-effort: it never raises, and any failure results in an
    empty list.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the call forever.

    Returns:
        A list of hashtag strings without the leading '#'. The list is empty
        when the meta tag is missing, has no hashtags, or an error occurred.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        print(f"Fetching: {url}")
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise exception for HTTP errors
        print("Response received. Checking HTML content...")

        soup = BeautifulSoup(response.text, 'html.parser')
        meta_tag = soup.find("meta", {"property": "og:description"})

        if meta_tag is None or "content" not in meta_tag.attrs:
            print("No meta tag with property='og:description' found.")
            return []

        content = meta_tag["content"]
        print(f"Found meta tag content: {content}")
        hashtags = _hashtags_in_text(content)
        print(f"Extracted hashtags: {hashtags}")
        return hashtags
    except Exception as e:
        # Catch-all kept on purpose: callers rely on getting [] rather than an
        # exception, whatever goes wrong while fetching or parsing.
        print(f"Error processing {url}: {e}")
        return []

# Path to the JSON file
file_path = "output_by_month/2024-09.json"

# Load the JSON file. The encoding is given explicitly because JSON is
# exchanged as UTF-8, while open() would otherwise use the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Look up the first link in "VideoList". A missing or empty list, or a first
# entry without a "Link" key, leaves first_link as None instead of raising.
video_list = dataset.get("VideoList") if isinstance(dataset, dict) else None
first_link = video_list[0].get("Link") if video_list else None

if first_link:
    print(f"Testing with the first link: {first_link}")

    # Extract hashtags for the first link
    hashtags = extract_hashtags(first_link)
    print(f"Hashtags for the first link: {hashtags}")
else:
    print("No valid links found in the dataset.")


Testing with the first link: https://www.tiktokv.com/share/video/7420516801865321735/
Fetching: https://www.tiktokv.com/share/video/7420516801865321735/
Response received. Checking HTML content...
No meta tag with property='og:description' found.
Hashtags for the first link: []
