Base imports.

In [26]:
import os
import json
import pandas as pd
from pprint import pprint
from IPython.display import Markdown

from helpers import _set_env

# Run the project's `_set_env` helper once per required environment variable,
# in the same order as before (LLM providers, search, tracing, LinkedIn cookies).
for _env_var in (
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "TAVILY_API_KEY",
    "LANGCHAIN_API_KEY",
    "LINKEDIN_COOKIE_LI_AT",
    "LINKEDIN_COOKIE_JSESSIONID",
):
    _set_env(_env_var)

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by dependencies —
# consider narrowing the filter to the specific noisy categories.
warnings.filterwarnings('ignore')

# Name of this agent; used as the sub-folder for both data directories.
agent_name = "podcast_outline"

# Data fetched by this notebook goes under collected_data; hand-prepared
# inputs (e.g. the podcast introduction) are read from base_data.
collected_data_dir = f"../collected_data/{agent_name}"
base_data_dir = f"../base_data/{agent_name}"

# Create both folders up front; an already existing folder is not an error.
for _data_dir in (collected_data_dir, base_data_dir):
    os.makedirs(_data_dir, exist_ok=True)

Import CrewAI, tools, and LLM / AI agent-related dependencies.

In [2]:
from crewai import Agent, Task, Crew, LLM
from crewai.tools import BaseTool
from crewai_tools import FileReadTool, ScrapeWebsiteTool, YoutubeVideoSearchTool
from langchain_community.tools.tavily_search import TavilySearchResults
from pydantic import BaseModel, Field
from typing import Optional

Initialize the LLM to be used. 

*NOTE: Later you can customize or use different LLMs for each agent or task.*

In [3]:
# Single LLM handle passed to every agent defined further down.
llm = LLM(
    temperature=1.0,
    model="anthropic/claude-3-5-sonnet-20241022",
)

Connect to the LinkedIn API (through the `linkedin_api` client), using the stored credentials and session cookies.

In [4]:
from requests.cookies import cookiejar_from_dict
from linkedin_api import Linkedin

# Username and password live in a JSON file next to the project, not in the notebook.
with open("../linkedin_credentials.json", "r") as credentials_file:
    credentials = json.load(credentials_file)

# Session cookies are taken from the LINKEDIN_COOKIE_* environment variables.
_cookie_values = {
    "liap": "true",
    "li_at": os.environ["LINKEDIN_COOKIE_LI_AT"],
    "JSESSIONID": os.environ["LINKEDIN_COOKIE_JSESSIONID"],
}
cookies = cookiejar_from_dict(_cookie_values)

# Client used later to fetch the guest's profile and posts.
linkedin = Linkedin(
    credentials["username"],
    credentials["password"],
    cookies=cookies,
)

Instantiate tools.

In [7]:
# Stock CrewAI tools, instantiated once so they can be shared between agents.
# Passed to local_researcher and outline_writer below.
file_read_tool = FileReadTool()
# NOTE(review): the next two tools are not attached to any agent in the cells shown here.
scrape_website_tool = ScrapeWebsiteTool()
youtube_video_search_tool = YoutubeVideoSearchTool()

Create a custom tool for Tavily search. Tavily is tailored to LLM use, so it is expected to work better than the Serper API and other general-purpose search tools.

In [8]:
class WebSearchTool(BaseTool):
    """CrewAI tool that runs a Tavily web search and returns the hits as plain text.

    Every hit is rendered as a ``URL: ...`` line followed by a ``Content: ...``
    line, and hits are separated by ``---`` lines.
    """

    name: str = "Web Search Tool"
    description: str = "Search the web for current information on a given topic or person"

    def _run(self, query: str, max_results: Optional[int] = 10) -> str:
        """Search the web for ``query`` and format the results for the agent.

        Args:
            query: Free-text search query.
            max_results: Maximum number of hits to request; ``None`` falls
                back to the default of 10.

        Returns:
            The formatted hits, or a short explanatory message when the search
            failed or produced no results.
        """
        # The signature allows None, but the Tavily tool needs a concrete integer.
        if max_results is None:
            max_results = 10

        search_client = TavilySearchResults(max_results=max_results)
        search_results = search_client.invoke(query)

        # On an API/network failure the Tavily tool can hand back an error
        # string instead of a list of hits; indexing it like a dict would crash.
        if not isinstance(search_results, list):
            return f"Web search did not return structured results: {search_results}"

        # Tolerate hits that lack one of the two keys instead of raising KeyError.
        formatted_results = [
            f"URL: {doc.get('url', 'N/A')}\nContent: {doc.get('content', '')}\n"
            for doc in search_results
            if isinstance(doc, dict)
        ]

        # An empty string gives the agent nothing to act on; say so explicitly.
        if not formatted_results:
            return f"No web search results found for query: {query}"

        return "\n---\n".join(formatted_results)

In [9]:
# Instance of the custom Tavily search tool; passed to online_researcher below.
web_search_tool = WebSearchTool()

Create agents.

In [27]:
# Create the researcher agent
# Settings shared by all three agents: same LLM, no delegation, verbose logging.
_agent_defaults = dict(llm=llm, allow_delegation=False, verbose=True)

# Researcher that works from local files through the file-read tool.
local_researcher = Agent(
    role="Senior Podcast Guest Researcher (Local)",
    goal="Thoroughly research the guest to understand their expertise, background, and recent work",
    backstory="""You are an expert podcast researcher who excels at finding and synthesizing 
    information about podcast guests. You analyze their background, work, and LinkedIn profiles and posts
    to understand their unique perspectives and contributions to the specific topic of the podcast.""",
    tools=[file_read_tool],
    **_agent_defaults,
)

# Researcher that works from the web through the custom search tool.
online_researcher = Agent(
    role="Senior Podcast Guest Researcher (Online)",
    goal="Thoroughly research the guest to understand their expertise, background, and recent work",
    backstory="""You are an expert podcast researcher who excels at finding and synthesizing 
    online information about podcast guests. You analyze their background, work, and online presence
    to understand their unique perspectives and contributions to the specific topic of the podcast.""",
    tools=[web_search_tool],
    **_agent_defaults,
)

# Writer that turns the research into the podcast outline.
outline_writer = Agent(
    role="Senior Podcast Outline Writer",
    goal="Create engaging, personalized podcast outlines that highlight the guest's expertise",
    backstory="""You are an experienced podcast producer who knows how to structure 
    compelling conversations. You excel at creating outlines that flow naturally and 
    bring out the best in each guest.""",
    tools=[file_read_tool],
    **_agent_defaults,
)

Create the tasks.

In [28]:
# Research task for local_researcher: read the podcast introduction and the
# guest's saved LinkedIn profile/posts files, then summarise the findings.
# The {placeholders} are filled from the `inputs` dict passed to Crew.kickoff().
local_research_task = Task(
    description="""
    You are researching the podcast guest {guest_name} for the podcast {podcast_name}.
    The podcast description is as follows: {podcast_description}.
    
    First, read the following document to understand the podcast's purpose and structure: {podcast_introduction_filepath}.
    
    Then read more about the guest {guest_name} using the following documents (using the file_read_tool):
    {guest_linkedin_profile_filepath}
    {guest_linkedin_posts_filepath}
    
    Focus on:
    1. Their professional background and expertise in relation to the podcast's topic
    2. Recent projects, achievements, startups, products, and services related to the podcast's topic
    3. Their thought leadership and key ideas related to the podcast's topic
    4. Any unique perspectives or experiences they bring related to the podcast's topic
    
    VERY IMPORTANT: When you're compiling your findings, 
    MAKE SURE TO GIVE MORE IMPORTANCE TO RECENT FINDINGS. For example, if the guest has a recent project, 
    make sure to give more importance to that project. If you focus on the guest's project from 10 years ago that
    is discontinued, it's not useful.
    
    Compile your findings into a comprehensive research summary.""",
    expected_output="""A detailed research summary containing:
    1. Professional background and current role
    2. Key achievements and notable projects
    3. Areas of expertise and thought leadership
    4. Recent activities and public presence
    5. Unique perspectives or specialized knowledge
    
    The summary should combine information from provided documents,
    ensuring all facts are verified and relevant to podcast discussion topics.""",
    agent=local_researcher,
)

# Research task for online_researcher: search the web for the guest, using
# {guest_details} to tailor the query, then summarise the findings.
# The {placeholders} are filled from the `inputs` dict passed to Crew.kickoff().
online_research_task = Task(
    description="""
    You are researching the podcast guest {guest_name} for the podcast {podcast_name}.
    The podcast description is as follows: {podcast_description}.
    
    Using the web_search_tool, find relevant information online about the guest {guest_name}. 
    To tailor the search query, use the following information: {guest_details}.
    
    Focus on:
    1. Their professional background and expertise in relation to the podcast's topic
    2. Recent projects, achievements, startups, products, and services related to the podcast's topic
    3. Their thought leadership and key ideas related to the podcast's topic
    4. Any unique perspectives or experiences they bring related to the podcast's topic
    
    VERY IMPORTANT: When you're compiling your findings, 
    MAKE SURE TO GIVE MORE IMPORTANCE TO RECENT FINDINGS. For example, if the guest has a recent project, 
    make sure to give more importance to that project. If you focus on the guest's project from 10 years ago that
    is discontinued, it's not useful.
    
    Compile your findings into a comprehensive research summary.""",
    expected_output="""A detailed research summary containing:
    1. Professional background and current role
    2. Key achievements and notable projects
    3. Areas of expertise and thought leadership
    4. Recent activities and public presence
    5. Unique perspectives or specialized knowledge
    
    The summary should combine information from provided information and web research,
    ensuring all facts are verified and relevant to podcast discussion topics.""",
    agent=online_researcher,
)

# Writing task for outline_writer: turn the preceding research into a one-page
# podcast outline. This same Task object is listed in both crews below.
# The {placeholders} are filled from the `inputs` dict passed to Crew.kickoff().
outline_task = Task(
    description="""
    You are writing the podcast outline for guest {guest_name} for the podcast {podcast_name}.
    The podcast description is as follows: {podcast_description}.
    
    First, read the following document to understand the podcast's purpose and structure: {podcast_introduction_filepath}.
    
    Then using the research provided by the Senior Podcast Guest Researcher, create a one-pager podcast outline that:
    1. Starts with a note encouraging the guest to edit the document
    2. Includes a 2-3 sentence introduction about the guest
    3. Suggests 4-5 relevant segments tailored to the guest's expertise
    4. Provides brief descriptions and specific questions for each segment
    
    The outline should be conversational, engaging, and specifically tailored to the guest's 
    background and expertise. Ensure questions are clear and accessible.""",
    expected_output="""A structured podcast outline document containing:
    1. An opening note inviting guest feedback
    2. A concise guest introduction
    3. 4-5 clearly defined podcast segments
    4. Specific questions and talking points for each segment
    
    The outline should be personalized to the guest's expertise and maintain
    an engaging, conversational flow suitable for a podcast format.""",
    agent=outline_writer,
)

Create the crews: one that researches from local LinkedIn files and one that researches online.

In [None]:
# Crew 1: research from the locally saved LinkedIn files, then write the outline.
local_podcast_crew = Crew(
    tasks=[local_research_task, outline_task],
    agents=[local_researcher, outline_writer],
    memory=True,
    verbose=True,
)

# Crew 2: research through web search, then write the outline.
online_podcast_crew = Crew(
    tasks=[online_research_task, outline_task],
    agents=[online_researcher, outline_writer],
    memory=True,
    verbose=True,
)

Execute the crews: fetch the guest's LinkedIn data, save it to disk, then run each crew in turn.

In [20]:
# Placeholder: replace with the guest's LinkedIn profile id before running.
profile_id = "<PROFILE_ID>"
# Fetch the profile and the guest's posts (post_count=100) with the client created earlier.
profile = linkedin.get_profile(profile_id)
posts = linkedin.get_profile_posts(profile_id, post_count=100)

In [None]:
# Fields kept from each post (the semantic content); everything else in the
# normalized post records is noisy metadata.
essential_post_fields = [
    'commentary.text.text',  # The actual post text written by the author
    'resharedUpdate.commentary.text.text',  # Text from reshared posts
    'actor.name.text',  # Author's name
    'resharedUpdate.actor.name.text',  # Name of original poster if reshared
    'actor.subDescription.text',  # Post timing (e.g., "4h • Edited")
]

# Per-guest output folder for the exported profile and posts.
guest_data_dir = f"{collected_data_dir}/{profile_id}"
os.makedirs(guest_data_dir, exist_ok=True)

# Flatten the nested profile and store it transposed (one field per row).
df_profile = pd.json_normalize(profile).T
df_profile.to_csv(f"{guest_data_dir}/linkedin_profile.csv")

try:
    # reindex() instead of [...] selection: a field that is absent from every
    # post (e.g. `resharedUpdate.*` when nothing was reshared) becomes an empty
    # row instead of raising KeyError and discarding all posts.
    df_posts = pd.json_normalize(posts).reindex(columns=essential_post_fields).T
except Exception as e:
    # Best effort: keep the notebook running, but report what went wrong.
    print(f"Error processing posts (maybe there are no posts): {e}")
    df_posts = pd.DataFrame(index=essential_post_fields)

# Always (re)write the posts file: the crew inputs reference this path, so it
# must exist and must not be a stale export from an earlier run.
df_posts.to_csv(f"{guest_data_dir}/linkedin_posts.csv")

In [None]:
# Folder holding this guest's exported LinkedIn files.
guest_data_dir = f"{collected_data_dir}/{profile_id}"

# Values substituted into the {placeholders} of the task descriptions.
# Replace the <...> placeholders before running.
inputs = dict(
    podcast_name="<PODCAST_NAME>",
    podcast_description="<PODCAST_DESCRIPTION>",
    guest_name="<GUEST_NAME>",
    guest_details="<GUEST_DETAILS>",
    podcast_introduction_filepath=f"{base_data_dir}/podcast_introduction.txt",
    guest_linkedin_profile_filepath=f"{guest_data_dir}/linkedin_profile.csv",
    guest_linkedin_posts_filepath=f"{guest_data_dir}/linkedin_posts.csv",
)

# Run the local-files crew.
result = local_podcast_crew.kickoff(inputs=inputs)

In [None]:
# Render the raw text of the latest `result` as Markdown
# (the local crew's run when the cells are executed in order).
Markdown(result.raw)

In [None]:
# Run the online-research crew with the same inputs.
# Note: this rebinds `result`, replacing the local crew's result.
result = online_podcast_crew.kickoff(inputs=inputs)

In [None]:
# Render the raw text of the latest `result` as Markdown
# (the online crew's run when the cells are executed in order).
Markdown(result.raw)