In [1]:
import ast
import os
import dotenv
dotenv.load_dotenv()
from apify_client import ApifyClient
import pandas as pd

# Lift pandas' display caps so displayed frames are never truncated
# (None means "no limit" for both rows and columns).
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [2]:
# --- Project output location -------------------------------------------------
# Every result file of this notebook lives under results/<PROJECT>/.
PROJECT = "us-elections"
folder_path = os.path.join("results", PROJECT)
os.makedirs(folder_path, exist_ok=True)  # no-op when the folder already exists

# --- Keyword search settings --------------------------------------------------
# RESULTS_PER_PAGE is sent to the actor as "resultsPerPage" and SEARCH_TERMS
# as "searchQueries" in the keyword-search cell.
RESULTS_PER_PAGE = 10
SEARCH_TERMS = [
    "Kamala", "VP", "KamalaHarris",
    "MAGA", "Trump", "realDonaldTrump",
    "Robert Kennedy", "RFK", "RobertKennedyJr", "RFKJr",
    "KennedyShanahan24", "Kennedy24",
    "Cornel West", "Dr. West", "CornelWest",
    "Jill Stein", "DrJillStein",
    "ChaseForLiberty",
]

# --- Profile search settings --------------------------------------------------
# The two dates are sent as "newestPostDate" / "oldestPostDate" and PROFILES as
# "profiles" in the profile-search cell.
NEWEST_POST_DATE = "2024-10-31"
OLDEST_POST_DATE = "2024-10-01"
PROFILES = [
    "kamalaharris",
    "realdonaldtrump",
    "robertfkennedyjrofficial",
    "brothercornelwest",
    "drjillstein",
    "chaseforliberty",
]

# Query Video Metadata based on Keyword Search and Profile

## Keyword Search

In [None]:
# Keyword search: run the Apify actor once for all SEARCH_TERMS and load the
# items of the run's default dataset into a DataFrame.

# The token is read from the environment (dotenv.load_dotenv() in the import
# cell loads a local .env file). Fail early with a clear message instead of
# handing None to the client and getting an opaque error from the API later.
apify_token = os.getenv("APIFY_API")
if not apify_token:
    raise RuntimeError("APIFY_API is not set; define it in the environment or in the .env file.")
client_keywordsearch = ApifyClient(apify_token)

# Actor input: search the video section for every term; all download options
# are switched off.
run_input_keywordsearch = {
    "excludePinnedPosts": False,
    "resultsPerPage": RESULTS_PER_PAGE,
    "searchQueries": SEARCH_TERMS,
    "searchSection": "/video",
    "shouldDownloadCovers": False,
    "shouldDownloadSlideshowImages": False,
    "shouldDownloadSubtitles": False,
    "shouldDownloadVideos": False
}

# Run the actor and block until the run has finished.
run_keywordsearch = client_keywordsearch.actor("OtzYfK1ndEGdwWFKQ").call(run_input=run_input_keywordsearch)
if run_keywordsearch is None:
    # call() is typed as returning an optional run object; without this check
    # the subscript below would raise a confusing TypeError.
    raise RuntimeError("The keyword-search actor run returned no run object.")

# Materialize all dataset items of the run as rows of a DataFrame.
keywordsearch_dataset = client_keywordsearch.dataset(run_keywordsearch["defaultDatasetId"])
new_video_metadata_keywordsearch = pd.DataFrame(list(keywordsearch_dataset.iterate_items()))

# Show the result size and a preview
print(new_video_metadata_keywordsearch.shape)
new_video_metadata_keywordsearch.head()

## Profile Search

In [None]:
# Profile search: run the Apify actor for all PROFILES, restricted to the
# configured date window, and load the run's default dataset into a DataFrame.

# The token is read from the environment (dotenv.load_dotenv() in the import
# cell loads a local .env file). Fail early with a clear message instead of
# handing None to the client and getting an opaque error from the API later.
apify_token = os.getenv("APIFY_API")
if not apify_token:
    raise RuntimeError("APIFY_API is not set; define it in the environment or in the .env file.")
client_profilesearch = ApifyClient(apify_token)

# Actor input: scrape the "videos" and "reposts" sections of every profile
# between OLDEST_POST_DATE and NEWEST_POST_DATE; all download options are off.
run_input_profilesearch = {
    "excludePinnedPosts": False,
    "newestPostDate": NEWEST_POST_DATE,
    "oldestPostDate": OLDEST_POST_DATE,
    "profileScrapeSections": [
        "videos",
        "reposts"
    ],
    "profiles": PROFILES,
    "shouldDownloadCovers": False,
    "shouldDownloadSlideshowImages": False,
    "shouldDownloadSubtitles": False,
    "shouldDownloadVideos": False
}

# Run the actor and block until the run has finished.
run_profilesearch = client_profilesearch.actor("OtzYfK1ndEGdwWFKQ").call(run_input=run_input_profilesearch)
if run_profilesearch is None:
    # call() is typed as returning an optional run object; without this check
    # the subscript below would raise a confusing TypeError.
    raise RuntimeError("The profile-search actor run returned no run object.")

# Materialize all dataset items; the "input" column is renamed to "profile"
# (presumably it holds the profile name the item was scraped for - confirm).
profilesearch_dataset = client_profilesearch.dataset(run_profilesearch["defaultDatasetId"])
new_video_metadata_profilesearch = pd.DataFrame(
    list(profilesearch_dataset.iterate_items())
).rename(columns={"input": "profile"})

# Show the result size and a preview
print(new_video_metadata_profilesearch.shape)
new_video_metadata_profilesearch.head()

In [None]:
# Merge this run's scrape results with the video metadata saved by earlier runs.
#
# The keyword-search and profile-search frames are stacked and stamped with the
# extraction time, appended to the existing CSV (when there is one),
# de-duplicated, and written back to disk.
new_video_metadata = pd.concat(
    [new_video_metadata_keywordsearch, new_video_metadata_profilesearch],
    ignore_index=True,
)
# Timezone-aware UTC timestamp; Timestamp.now(tz="UTC") replaces the deprecated
# Timestamp.utcnow().
new_video_metadata["extractionTime"] = pd.Timestamp.now(tz="UTC")

# Define the file path
video_metadata_path = f"results/{PROJECT}/video_metadata.csv"

if os.path.exists(video_metadata_path):
    # Load what earlier runs saved; ids are cast to text after the CSV round
    # trip (presumably so they compare equal to the freshly scraped ids).
    old_video_metadata = pd.read_csv(video_metadata_path)
    old_video_metadata["id"] = old_video_metadata["id"].astype("str")

    # Old rows first, new rows last, so keep="last" below prefers the newest
    # copy. ignore_index avoids duplicate index labels in the combined frame.
    updated_video_metadata = pd.concat(
        [old_video_metadata, new_video_metadata], ignore_index=True
    )
else:
    # First run for this project: there is nothing to merge with.
    updated_video_metadata = new_video_metadata

# Remove duplicates, keeping the latest entry.
# Columns holding dicts/lists are unhashable and cannot take part in the
# comparison; extractionTime differs between runs by construction.
dict_columns = [
    col
    for col in updated_video_metadata.columns
    if updated_video_metadata[col].apply(lambda x: isinstance(x, (dict, list))).any()
]
dedupe_columns = [
    col
    for col in updated_video_metadata.columns
    if col != "extractionTime" and col not in dict_columns
]
# Not done in place: new_video_metadata stays untouched when it is the same
# object, and the index is rebuilt so later cells can rely on a gap-free
# 0..n-1 index.
updated_video_metadata = updated_video_metadata.drop_duplicates(
    subset=dedupe_columns,
    keep="last",
).reset_index(drop=True)

# Save the combined data
updated_video_metadata.to_csv(video_metadata_path, index=False)

# Show the result size and a preview
print(updated_video_metadata.shape)
updated_video_metadata.head()

# Extract/Update Profile Information

In [None]:
# Build / refresh the profile table from the authorMeta field of the videos.
#
# Each video row carries its author's metadata in "authorMeta". Those records
# are flattened into columns, paired with the row's extractionTime, merged with
# the profile CSV of earlier runs (when present), de-duplicated and saved.


def _parse_author_meta(value):
    """Return an authorMeta cell as a dict.

    Freshly scraped rows already hold a dict. Rows loaded from the CSV hold the
    text that was written for that dict, which is parsed back with
    ast.literal_eval. Anything else (missing value, unparsable text, non-dict
    literal) yields an empty dict, so the row ends up without an id and is
    removed by the invalid-profile filter below.
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return {}
        return parsed if isinstance(parsed, dict) else {}
    return {}


# Rebuild a 0..n-1 index first: json_normalize numbers its rows 0..n-1, and the
# join below aligns on index labels, so the source rows must use the same
# labels or extraction times get attached to the wrong profile (or lost).
author_rows = updated_video_metadata[["authorMeta", "extractionTime"]].reset_index(drop=True)
author_records = [_parse_author_meta(value) for value in author_rows["authorMeta"]]

# Convert the authorMeta dictionaries to separate columns
new_profile_metadata = (
    pd.json_normalize(author_records)
    .join(author_rows["extractionTime"])
    .rename(columns={"name": "profile"})
)
if "id" not in new_profile_metadata.columns:
    # No usable author record at all; mark every row as invalid instead of
    # failing with a KeyError.
    new_profile_metadata["id"] = "nan"
new_profile_metadata["id"] = new_profile_metadata["id"].astype("str")

# Check if the file exists
profile_metadata_path = f"results/{PROJECT}/profile_metadata.csv"
if os.path.exists(profile_metadata_path):
    # Load the existing file
    prev_profile_metadata = pd.read_csv(profile_metadata_path)
    prev_profile_metadata["id"] = prev_profile_metadata["id"].astype("str")

    # Previous rows first, new rows last, so keep="last" prefers the newest copy
    updated_profile_metadata = pd.concat(
        [prev_profile_metadata, new_profile_metadata], ignore_index=True
    )
else:
    # First run for this project: there is nothing to merge with.
    updated_profile_metadata = new_profile_metadata

# Remove duplicates, keeping the latest entry.
# Columns holding dicts/lists are unhashable and are left out of the
# comparison, as is extractionTime, which differs between runs.
dict_columns = [
    col
    for col in updated_profile_metadata.columns
    if updated_profile_metadata[col].apply(lambda x: isinstance(x, (dict, list))).any()
]
dedupe_columns = [
    col
    for col in updated_profile_metadata.columns
    if col != "extractionTime" and col not in dict_columns
]
updated_profile_metadata = updated_profile_metadata.drop_duplicates(
    subset=dedupe_columns,
    keep="last",
)

# Drop invalid profiles: a missing id becomes the text 'nan' after the string
# cast above; true missing values are excluded as well in case the cast keeps
# them as missing.
has_valid_id = updated_profile_metadata["id"].notna() & (updated_profile_metadata["id"] != "nan")
updated_profile_metadata = updated_profile_metadata[has_valid_id].reset_index(drop=True)

# Save profile metadata locally
updated_profile_metadata.to_csv(profile_metadata_path, index=False)

# Show the result size and a preview
print(updated_profile_metadata.shape)
updated_profile_metadata.head()