<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/youtube/video2graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai youtube-transcript-api neo4j retry



In [2]:
import pandas as pd
import openai
from retry import retry
from neo4j import GraphDatabase

pd.set_option("display.max_colwidth", 150)

openai.api_key = "OPENAI_KEY"

uri = "bolt://18.207.186.117:7687"
username = "neo4j"
password = "magazine-scream-roadside"
driver = GraphDatabase.driver(uri, auth=(username, password))

In [3]:
def run_query(query, params={}):
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [4]:
from youtube_transcript_api import YouTubeTranscriptApi

# CC BY 4 license
video_id = "nrI483C5Tro"
transcript = YouTubeTranscriptApi.get_transcript(video_id)

In [5]:
# Split into sections and include start and end timestamps
sections = []
current_section = ""
start_time = None
previous_end = 0
pause_threshold = 5

for line in transcript:
    if current_section and (line["start"] - previous_end > pause_threshold):
        # If there is a pause greater than 5s, we deem the end of section
        end_time = line["start"]
        sections.append(
            {
                "text": current_section.strip(),
                "start_time": start_time,
                "end_time": end_time,
            }
        )
        current_section = ""
        start_time = None
    else:
        # If this is the start of a new section, record the start time
        if not start_time:
            start_time = line["start"]

        # Add the line to the current paragraph
        clean_text = line["text"].replace("\n", " ").replace("\xa0", " ")
        current_section += " ".join(clean_text.split()) + " "
        # Tag the end of the dialogue
        previous_end = line["start"] + line["duration"]

# If there's a paragraph left at the end, add it to the list of paragraphs
if current_section:
    end_time = transcript[-1]["start"] + transcript[-1]["duration"]
    sections.append(
        {
            "text": current_section.strip().replace("\n", " ").replace("\xa0", " "),
            "start_time": start_time,
            "end_time": end_time,
        }
    )
# Remove empty paragraphs
sections = [p for p in sections if p["text"]]


In [6]:
# Number of paragraphs
print(f"Number of paragraphs: {len(sections)}")
print(f"Max characters per paragraph: {max([len(el['text']) for el in sections])}")

Number of paragraphs: 77
Max characters per paragraph: 1267


In [7]:
sections[0]

{'text': 'water the liquid that oceans are made of and it fills endless depths only few will venture out into the endless open ocean of this vast underwater world most of the ocean inhabitants live in the city as it were like human societies very close together with friendly Neighbors and nasty cotenants while dangerous robbers lurk around at the edge of town',
 'start_time': 5.46,
 'end_time': 49.08}

In [8]:
def parse_entities_and_relationships(input_str):
    # Parse the input string
    entities = []
    relationships = []
    entity_mode = True
    # Skip the first line
    for line in input_str.split("\n")[1:]:
        if line == "relationships":
            entity_mode = False
        elif line:
            if entity_mode:
                # Make sure the rel is in correct format
                # GPT-4 sometimes returns n/a when no entities are found
                if len(line.split(", ")) != 3:
                    continue
                entities.append(line.split(", "))
            else:
                # Make sure the rel is in correct format
                # GPT-4 sometimes returns n/a when no rels are found
                if len(line.split(", ")) != 3:
                    continue
                relationships.append(line.split(", "))
    return entities, relationships

In [9]:
system = "You are an archeology and biology expert helping us extract relevant information."

    # Set up the prompt for GPT-3 to complete
prompt = """#This a transcript from a sea documentary. The task is to extract as many relevant entities to biology, chemistry, or archeology.
#The entities should include all animals, biological entities, locations.
#However, the entities should not include distances or time durations.
#Also, return the type of an entity using the Wikipedia class system and the sentiment of the mentioned entity,
#where the sentiment value ranges from -1 to 1, and -1 being very negative, 1 being very positive
#Additionally, extract all relevant relationships between identified entities.
#The relationships should follow the Wikipedia schema type.
#The output of a relationship should be in a form of a triple Head, Relationship, Tail, for example
#Peter, WORKS_AT, Hospital/n
# An example "St. Peter is located in Paris" should have an output with the following format
entity
St. Peter, person, 0.0
Paris, location, 0.0

relationships
St.Peter, LOCATED_IN, Paris\n"""

@retry(tries=3, delay=5)
def process_gpt4(text):
    paragraph = text

    completion = openai.ChatCompletion.create(
        model="gpt-4",
        # Try to be as deterministic as possible
        temperature=0,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": prompt + paragraph},
        ],
    )

    nlp_results = completion.choices[0].message.content
    
    if not "relationships" in nlp_results:
        raise Exception(
            "GPT-4 is not being nice and isn't returning results in correct format"
        )
    
    return parse_entities_and_relationships(nlp_results)


In [10]:
import_query = """
MERGE (v:Video {id:$videoId})
CREATE (v)-[:HAS_SECTION]->(p:Section)
SET p.startTime = toFloat($start),
    p.endTime = toFloat($end),
    p.text = $text
FOREACH (e in $entities |
  MERGE (entity:Entity {name: e[0]})
  ON CREATE SET entity.type = e[1] 
  MERGE (p)-[:MENTIONS{sentiment:toFloat(e[2])}]->(entity))
WITH p
UNWIND $relationships AS relation
MERGE (source:Entity {name: relation[0]})
MERGE (target:Entity {name: relation[2]})
MERGE (source)-[:RELATIONSHIP]->(r:Relationship {type: relation[1]})-[:RELATIONSHIP]->(target)
MERGE (p)-[mr:MENTIONS_RELATIONSHIP]->(r)
"""

with driver.session() as session:
    for i, section in enumerate(sections):
        print(f"Processing {i} paragraph")
        text = section["text"]
        start = section["start_time"]
        end = section["end_time"]
        try:
            entities, relationships = process_gpt4(text)
            params = {
                "videoId": video_id,
                "start": start,
                "end": end,
                "text": text,
                "entities": entities,
                "relationships": relationships,
            }
            session.run(import_query, params)
        except:
            pass

Processing 0 paragraph
Processing 1 paragraph
Processing 2 paragraph
Processing 3 paragraph
Processing 4 paragraph
Processing 5 paragraph
Processing 6 paragraph
Processing 7 paragraph
Processing 8 paragraph
Processing 9 paragraph
Processing 10 paragraph
Processing 11 paragraph
Processing 12 paragraph
Processing 13 paragraph
Processing 14 paragraph
Processing 15 paragraph
Processing 16 paragraph
Processing 17 paragraph
Processing 18 paragraph
Processing 19 paragraph
Processing 20 paragraph
Processing 21 paragraph
Processing 22 paragraph
Processing 23 paragraph
Processing 24 paragraph
Processing 25 paragraph
Processing 26 paragraph
Processing 27 paragraph
Processing 28 paragraph
Processing 29 paragraph
Processing 30 paragraph
Processing 31 paragraph
Processing 32 paragraph
Processing 33 paragraph
Processing 34 paragraph
Processing 35 paragraph
Processing 36 paragraph
Processing 37 paragraph
Processing 38 paragraph
Processing 39 paragraph
Processing 40 paragraph
Processing 41 paragraph
Pr

# Entity disambiguation

In [11]:
disambiguation_prompt = """
#Act as a entity disambiugation tool and tell me which values reference the same entity. 
#For example if I give you
#
#Birds
#Bird
#Ant
#
#You return to me
#
#Birds, 1
#Bird, 1
#Ant, 2
#
#As the Bird and Birds values have the same integer assigned to them, it means that they reference the same entity.
#Now process the following values\n
"""

def disambiguate(entities):
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        # Try to be as deterministic as possible
        temperature=0,
        messages=[
            {"role": "user", "content": disambiguation_prompt + "\n".join(all_animals)},
        ],
    )

    disambiguation_results = completion.choices[0].message.content
    return [row.split(", ") for row in disambiguation_results.split("\n")]

In [12]:
all_animals = run_query("""
MATCH (e:Entity {type: 'animal'})
RETURN e.name AS animal
""")['animal'].to_list()


disambiguation_params = disambiguate(all_animals)
run_query(
    """
UNWIND $data AS row
MATCH (e:Entity {name:row[0]})
SET e.disambiguation = row[1]
""",
    {"data": disambiguation_params},
)


In [13]:
run_query("""
MATCH (e:Entity {type:"animal"})
RETURN e.disambiguation AS i, collect(e.name) AS entities
ORDER BY size(entities) DESC
LIMIT 5
""")

Unnamed: 0,i,entities
0,22,"[moray eel, Moray, Moray Eel, moray, morays]"
1,16,"[lionfish, lionfishes, Lionfish]"
2,9,"[Brittle star, brittle stars, brittle star]"
3,6,"[hermit crab, Hermit crab]"
4,5,"[Monkfish, monkfish]"


In [14]:
run_query("""
MATCH (e:Entity {type:"animal"})
WITH e.disambiguation AS i, collect(e) AS entities
CALL apoc.refactor.mergeNodes(entities, {mergeRels:True})
YIELD node
RETURN distinct 'done'
""")

Unnamed: 0,'done'
0,done


# Analysis

In [15]:
run_query("""
MATCH (e:Entity {type:"person"})
RETURN e.name, e.type,
       count{(e)<-[:MENTIONS]-()} AS mentions
ORDER BY mentions DESC
LIMIT 5
""")

Unnamed: 0,e.name,e.type,mentions
0,chief,person,1
1,strange visitors,person,1
2,divers,person,1


In [16]:
run_query("""
MATCH (e:Entity {type:"animal"})
RETURN e.name, e.type,
       count{(e)<-[:MENTIONS]-()} AS mentions
ORDER BY mentions DESC
LIMIT 5
""")

Unnamed: 0,e.name,e.type,mentions
0,morays,animal,7
1,Lionfish,animal,5
2,brittle star,animal,3
3,monkfish,animal,3
4,Cardinal fish,animal,3


In [17]:
run_query("""
MATCH (e:Entity {name:"Lionfish"})-[:RELATIONSHIP]->(r)-[:RELATIONSHIP]->(target)
RETURN e.name AS source, r.type AS relationship, target.name AS target,
       count{(r)<-[:MENTIONS_RELATIONSHIP]-()} AS mentions
UNION ALL
MATCH (e:Entity {name:"Lionfish"})<-[:RELATIONSHIP]->(r)<-[:RELATIONSHIP]-(source)
RETURN source.name AS source, r.type AS relationship, e.name AS target,
       count{(r)<-[:MENTIONS_RELATIONSHIP]-()} AS mentions
""")

Unnamed: 0,source,relationship,target,mentions
0,Lionfish,HUNTS_IN,Reef,1
1,Lionfish,CARRY,poison,1
2,Lionfish,USES,Camouflage,1
3,Lionfish,USE,dive torches,1
4,Lionfish,HAS,Fins,1
5,Lionfish,POSE_DANGER_TO,Cardinal fish,1
6,Lionfish,PREDATOR_OF,Small prey,1
7,Eggs,EATEN_BY,Lionfish,1
8,morays,INTERACTS_WITH,Lionfish,1
9,Cardinal,EATEN_BY,Lionfish,1


In [21]:
run_query("""
MATCH (e:Entity)-[:RELATIONSHIP]->(r)-[:RELATIONSHIP]->(t:Entity)
WHERE e.name = "Lionfish" AND r.type = "POSE_DANGER_TO" AND t.name = "Cardinal fish"
MATCH (r)<-[:MENTIONS_RELATIONSHIP]-(s:Section)
RETURN s.text AS text
""")

Unnamed: 0,text
0,the snail is fast and the prey disappears in the blink of an eye the night is still young and still full of danger and Terror these lionfishes pos...


In [19]:
run_query("""
MATCH (e:Entity {type:"animal"})<-[m:MENTIONS]-(section:Section)
WITH e,section, m.sentiment AS sentiment
ORDER BY sentiment ASC
LIMIT 5
RETURN e.name AS entity, sentiment, section.text AS text
""")

Unnamed: 0,entity,sentiment,text
0,Cardinal fish,-0.5,the snail is fast and the prey disappears in the blink of an eye the night is still young and still full of danger and Terror these lionfishes pos...
1,monkfish,-0.5,they're clearly telling this hermit crab to take a hike but they should have stayed with their offspring because there's new danger approaching a ...
2,sea urchin,-0.5,Sandy ocean floor and this means having to constantly protect this open Egg site on occasion they blow and fan the eggs but mostly they keep an ey...
3,sea urchin,-0.5,again and again Until It Breaks the sea urchin's guts spill out into the open a few more hits for good measure the danger has been averted everyth...
4,monkfish,-0.5,the hermit crab's best defense is its house that it carries around for protection the patience of the monkfishes is wearing thin


In [20]:
run_query("""
MATCH (e:Entity {name:"Lionfish"})<-[:MENTIONS]-(s:Section)<-[:HAS_SECTION]-(v:Video)
RETURN s.startTime AS timestamp, s.endTime AS endTime, "https://youtube.com/watch?v=" + v.id + "&t=" + toString(toInteger(s.startTime)) AS URL
""")


Unnamed: 0,timestamp,endTime,URL
0,1537.86,1560.48,https://youtube.com/watch?v=nrI483C5Tro&t=1537
1,1664.64,1703.52,https://youtube.com/watch?v=nrI483C5Tro&t=1664
2,1514.58,1529.1,https://youtube.com/watch?v=nrI483C5Tro&t=1514
3,648.18,667.92,https://youtube.com/watch?v=nrI483C5Tro&t=648
4,1455.24,1498.38,https://youtube.com/watch?v=nrI483C5Tro&t=1455
