<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/youtube/video2graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# Pin the major versions this notebook was written against: the cells below use
# openai.ChatCompletion (removed in openai 1.0) and the pre-1.0
# YouTubeTranscriptApi.get_transcript interface. %pip installs into the
# environment of the running kernel.
%pip install "openai>=0.27,<1" "youtube-transcript-api<1" "neo4j>=5,<6"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neo4j
  Downloading neo4j-5.6.0.tar.gz (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.2/171.2 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-5.6.0-py3-none-any.whl size=237773 sha256=528eec35f45810df45c6dc5147c40076d13617e57fdbe32efeb3e36ecab6a52b
  Stored in directory: /root/.cache/pip/wheels/c0/9b/d9/fdb6b67a6f6d7aef4acaefe55f339739caf09bb63e43bfb10e
Successfully built neo4j
Installing collected packages: neo4j
Successfully installed neo4j-5.6.0


In [51]:
import os
from getpass import getpass

from neo4j import GraphDatabase

# Connection settings are read from the environment so that credentials are not
# stored in the notebook. The password is prompted for when NEO4J_PASSWORD is
# not set; the URI and user name fall back to the values used originally.
uri = os.environ.get("NEO4J_URI", "bolt://18.207.186.117:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(username, password))

In [8]:
from youtube_transcript_api import YouTubeTranscriptApi
# Video whose transcript is processed below (CC BY 4 license, per the original note).
video_id = "0ZEBNpiuMu4"

# Fetch the transcript for the video. The following cells iterate over it and
# read each entry's 'text', 'start' and 'duration' keys.
transcript = YouTubeTranscriptApi.get_transcript(video_id)

In [45]:
def split_transcript_into_paragraphs(transcript, marker='[Music]'):
    """Group transcript lines into paragraphs separated by marker lines.

    Every line whose text contains ``marker`` ends the paragraph collected so
    far (the marker line itself is discarded). Paragraphs whose text is empty
    after stripping are dropped.

    Args:
        transcript: iterable of dicts with 'text', 'start' and 'duration' keys.
        marker: substring that identifies a separator line.

    Returns:
        A list of dicts with keys 'text', 'start_time' and 'end_time'.
        'end_time' is the start of the separator line that closed the
        paragraph, or start + duration of the final transcript line for a
        trailing paragraph.
    """
    paragraphs = []
    pending_texts = []
    start_time = None
    last_line = None

    def close_paragraph(begin, end):
        # Joining with single spaces and stripping yields the same text as
        # appending "line + ' '" repeatedly and stripping the result.
        text = ' '.join(pending_texts).strip()
        if text:
            paragraphs.append({
                'text': text,
                'start_time': begin,
                'end_time': end
            })
        pending_texts.clear()

    for line in transcript:
        last_line = line
        if marker in line['text']:
            # A separator only closes a paragraph if something was collected.
            if pending_texts:
                close_paragraph(start_time, line['start'])
                start_time = None
        else:
            # Compare against None explicitly: a start of 0.0 is a valid
            # timestamp and must not be treated as "not set yet".
            if start_time is None:
                start_time = line['start']
            pending_texts.append(line['text'])

    # Flush the paragraph that runs to the end of the transcript, if any.
    if pending_texts:
        close_paragraph(start_time, last_line['start'] + last_line['duration'])

    return paragraphs


# Split into paragraphs and include start and end timestamps
paragraphs = split_transcript_into_paragraphs(transcript)

In [47]:
# Sanity-check the split: number of paragraphs and length of the longest one.
# default=0 keeps this cell from raising ValueError when no paragraphs were found.
print(f"Number of paragraphs: {len(paragraphs)}")
print(f"Max characters per paragraph: {max((len(el['text']) for el in paragraphs), default=0)}")

Number of paragraphs: 35
Max characters per paragraph: 1398


In [46]:
# Display the first paragraph dict ('text', 'start_time', 'end_time') as a spot check.
paragraphs[0]

{'text': 'it was in one of the galleries of st. Peters mount at about 500 paces from the main entry and at 90 feet below the surface the deployment exposed part of the skull of a large animal embedded in this stone they suspended their work to tell of their discovery to dr. Hoffman who had for some years been collecting fossils from the quarries dr. Hoffman observing a specimen to be the most important that had yet been discovered took every precaution to preserve it in one piece after having succeeded in removing a large block of stone surrounding it and reducing the mass to a proper condition it was transported to his home in triumph',
 'start_time': 9.65,
 'end_time': 64.069}

In [52]:
def parse_entities_and_relationships(input_str):
    """Parse the model's plain-text answer into entities and relationship triples.

    Expected layout: a header line (skipped), one entity per line, a line
    reading ``relationships`` and then one comma-separated
    ``Head, RELATIONSHIP, Tail`` triple per line. Blank lines are ignored.

    Args:
        input_str: raw answer text. ``None`` or an empty string yields two
            empty lists.

    Returns:
        A tuple ``(entities, relationships)`` where ``entities`` is a list of
        strings and ``relationships`` is a list of lists of stripped fields.
        Lines in the relationships section with fewer than three non-empty
        fields are dropped, because consumers index fields 0, 1 and 2.
    """
    entities = []
    relationships = []
    if not input_str:
        return entities, relationships

    in_entity_section = True
    # Strip the whole answer first so a leading blank line is not mistaken for
    # the header; splitlines() keeps "\r" from "\r\n" endings out of the values.
    for raw_line in input_str.strip().splitlines()[1:]:
        line = raw_line.strip()
        if not line:
            continue
        # Accept "relationships", "Relationships" and "relationships:" alike.
        if line.lower().rstrip(':') == 'relationships':
            in_entity_section = False
        elif in_entity_section:
            entities.append(line)
        else:
            # Split on bare commas and strip, so "A,R,B" and "A, R, B" both work.
            parts = [part.strip() for part in line.split(',')]
            if len(parts) >= 3 and all(parts[:3]):
                relationships.append(parts)
    return entities, relationships

In [64]:
import os
from getpass import getpass

import openai

# Take the API key from the OPENAI_API_KEY environment variable and prompt for
# it otherwise, so that no key has to be pasted into (and saved with) the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

def gpt4archeologist(text):
    """Ask the gpt-4 chat model for entities and relationships found in ``text``.

    The instructions and an example of the expected output format are sent as
    the user message, with ``text`` appended directly after them. The model's
    answer is handed to ``parse_entities_and_relationships``.

    Args:
        text: the transcript paragraph to analyse.

    Returns:
        Whatever ``parse_entities_and_relationships`` returns for the answer,
        i.e. an ``(entities, relationships)`` pair.
    """
    system_message = "You are an archeology and biology expert helping us extract relevant information."

    # Extraction instructions followed by an example of the required output layout.
    # NOTE(review): the wording (including "/n") is kept exactly as written, since
    # any change to the prompt can change the model's answers.
    instructions = """#This a transcript from a sea documentary. The task is to extract as many relevant entities to biology, chemistry, or archeology.
#The entities should include all animals, biological entities, locations.
#However, the entities should not include distances or time durations.
#Additionally, extract all relevant relationships between identified entities.
#The relationships should follow the Wikipedia schema type.
#The output of a relationship should be in a form of a triple Head, Relationship, Tail, for example
#Peter, WORKS_AT, Hospital/n
# An output should be have the following format
entity
St. Peters Mount
galleries

relationships
large animal, EMBEDDED_IN, stone\n"""

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": instructions + text},
    ]

    # temperature=0 is passed to the API exactly as in the original call.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        temperature=0,
        messages=chat_messages,
    )

    answer = response.choices[0].message.content
    return parse_entities_and_relationships(answer)

In [65]:
# Graph model written by the query:
#   (:Video {id})-[:HAS_SECTION]->(:Section {startTime, endTime, text})
#   (:Section)-[:MENTIONS]->(:Entity {name})
#   (:Entity)-[:RELATIONSHIP]->(:Relationship {type})-[:RELATIONSHIP]->(:Entity)
#   (:Section)-[:MENTIONS_RELATIONSHIP]->(:Relationship)
# Each relationship is an intermediate node so that a section can point at it.
import_query = """
MERGE (v:Video {id:$videoId})
CREATE (v)-[:HAS_SECTION]->(p:Section)
SET p.startTime = toFloat($start),
    p.endTime = toFloat($end),
    p.text = $text
FOREACH (e in $entities | MERGE (entity:Entity {name: e}) MERGE (p)-[:MENTIONS]->(entity))
WITH p
UNWIND $relationships AS relation
MERGE (source:Entity {name: relation[0]})
MERGE (target:Entity {name: relation[2]})
MERGE (source)-[:RELATIONSHIP]->(r:Relationship {type: relation[1]})-[:RELATIONSHIP]->(target)
MERGE (p)-[mr:MENTIONS_RELATIONSHIP]->(r)
"""

# Extract entities/relationships for every paragraph and write them to Neo4j.
with driver.session() as session:
    for i, paragraph in enumerate(paragraphs):
        print(f"Processing {i} paragraph")
        text = paragraph['text']
        start = paragraph['start_time']
        end = paragraph['end_time']
        entities, relationships = gpt4archeologist(text)
        # The query reads relation[0], relation[1] and relation[2]; a shorter
        # list would yield a null property and make MERGE fail, aborting the
        # whole import, so malformed triples are dropped here.
        relationships = [relation for relation in relationships if len(relation) >= 3]
        params = {'videoId': video_id, 'start': start, 'end': end, 'text': text, 'entities': entities, 'relationships': relationships}
        # consume() waits for the query to finish so that a failure is raised
        # for the paragraph that caused it rather than on a later call.
        session.run(import_query, params).consume()


Processing 0 paragraph
Processing 1 paragraph
Processing 2 paragraph
Processing 3 paragraph
Processing 4 paragraph
Processing 5 paragraph
Processing 6 paragraph
Processing 7 paragraph
Processing 8 paragraph
Processing 9 paragraph
Processing 10 paragraph
Processing 11 paragraph
Processing 12 paragraph
Processing 13 paragraph
Processing 14 paragraph
Processing 15 paragraph
Processing 16 paragraph
Processing 17 paragraph
Processing 18 paragraph
Processing 19 paragraph
Processing 20 paragraph
Processing 21 paragraph
Processing 22 paragraph
Processing 23 paragraph
Processing 24 paragraph
Processing 25 paragraph
Processing 26 paragraph
Processing 27 paragraph
Processing 28 paragraph
Processing 29 paragraph
Processing 30 paragraph
Processing 31 paragraph
Processing 32 paragraph
Processing 33 paragraph
Processing 34 paragraph
