<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/youtube/video2graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai youtube-transcript-api neo4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from neo4j import GraphDatabase
import os
from getpass import getpass

# Connection settings come from the environment so that no database password is
# stored in the notebook. The URI and user name fall back to the values this
# notebook was originally run against; the password is prompted for when the
# NEO4J_PASSWORD variable is not set.
uri = os.environ.get("NEO4J_URI", "bolt://18.207.186.117:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(username, password))

In [3]:
from youtube_transcript_api import YouTubeTranscriptApi
# Source video (noted as CC BY 4 licensed)
video_id = "0ZEBNpiuMu4"

# Fetch the transcript for the video; the following cell reads the 'text',
# 'start' and 'duration' keys of each transcript entry.
transcript = YouTubeTranscriptApi.get_transcript(video_id)

In [4]:
# Group transcript lines into paragraphs, using '[Music]' captions as the
# separators, and record the start and end timestamp of every paragraph.
paragraphs = []
current_paragraph = ''
start_time = None

for line in transcript:
    if '[Music]' in line['text']:
        # A music cue closes the paragraph collected so far (if any); the cue's
        # own start time is used as the paragraph's end time.
        if current_paragraph:
            end_time = line['start']
            paragraphs.append({
                'text': current_paragraph.strip(),
                'start_time': start_time,
                'end_time': end_time
            })
            current_paragraph = ''
            start_time = None
    else:
        # First line of a new paragraph: remember when it starts.
        # Compare against None explicitly: a start of 0.0 is falsy, so a plain
        # truthiness check would let the next line overwrite the real start.
        if start_time is None:
            start_time = line['start']

        # Add the line to the current paragraph
        current_paragraph += line['text'] + ' '

# If there's a paragraph left at the end, close it at the end of the last line
if current_paragraph:
    end_time = transcript[-1]['start'] + transcript[-1]['duration']
    paragraphs.append({
        'text': current_paragraph.strip(),
        'start_time': start_time,
        'end_time': end_time
    })

# Remove paragraphs whose text is empty after stripping whitespace
paragraphs = [p for p in paragraphs if p['text']]

In [5]:
# Quick size check: how many paragraphs there are and how long the longest one is
paragraph_lengths = [len(el['text']) for el in paragraphs]
print(f"Number of paragraphs: {len(paragraphs)}")
print(f"Max characters per paragraph: {max(paragraph_lengths)}")

Number of paragraphs: 35
Max characters per paragraph: 1398


In [6]:
# Inspect the first paragraph: its text plus its start and end timestamps
paragraphs[0]

{'text': 'it was in one of the galleries of st. Peters mount at about 500 paces from the main entry and at 90 feet below the surface the deployment exposed part of the skull of a large animal embedded in this stone they suspended their work to tell of their discovery to dr. Hoffman who had for some years been collecting fossils from the quarries dr. Hoffman observing a specimen to be the most important that had yet been discovered took every precaution to preserve it in one piece after having succeeded in removing a large block of stone surrounding it and reducing the mass to a proper condition it was transported to his home in triumph',
 'start_time': 9.65,
 'end_time': 64.069}

In [7]:
def parse_entities_and_relationships(input_str):
    """Parse the model's plain-text answer into entity and relationship rows.

    The expected layout is the one shown in the prompt example::

        entity
        <name>, <type>, <sentiment>
        ...

        relationships
        <head>, <RELATIONSHIP>, <tail>

    The first line is always skipped (it is expected to be the ``entity``
    header). Lines are stripped before being inspected, so ``\\r\\n`` line
    endings or stray spaces around the ``relationships`` header do not cause
    relationship rows to be collected as entities. The header is matched
    case-insensitively and may carry a trailing colon.

    Rows are split on ``', '`` and kept only when they have exactly three
    fields; anything else (for example ``n/a``) is dropped. This applies to
    entities as well as relationships, because the import query reads
    positions 0, 1 and 2 of every row.

    Args:
        input_str: Raw text returned by the model.

    Returns:
        A tuple ``(entities, relationships)``; each is a list of 3-item lists
        of strings.
    """
    entities = []
    relationships = []
    in_entities = True
    # Skip the first line (header)
    for raw_line in input_str.split('\n')[1:]:
        line = raw_line.strip()
        if not line:
            continue
        if line.rstrip(':').strip().lower() == 'relationships':
            in_entities = False
            continue
        fields = [field.strip() for field in line.split(', ')]
        # GPT-4 sometimes returns n/a (or otherwise malformed rows)
        if len(fields) != 3:
            continue
        if in_entities:
            entities.append(fields)
        else:
            relationships.append(fields)
    return entities, relationships

In [8]:
import openai
import os

# Read the API key from the OPENAI_API_KEY environment variable so that a real
# key never has to be pasted into the notebook; the original placeholder value
# is kept as the fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI_KEY")

def gpt4archeologist(text):
  """Ask GPT-4 to extract entities and relationships from one paragraph.

  The paragraph is appended directly to a fixed instruction prompt and sent
  as the user message, preceded by a system message. The content of the first
  returned choice is handed to `parse_entities_and_relationships`, whose
  result (entities, relationships) is returned unchanged.
  """
  system_message = "You are an archeology and biology expert helping us extract relevant information."

  # Fixed instructions plus a worked example of the expected output format
  instructions = """#This a transcript from a sea documentary. The task is to extract as many relevant entities to biology, chemistry, or archeology.
#The entities should include all animals, biological entities, locations.
#However, the entities should not include distances or time durations.
#Also, return the type of an entity using the Wikipedia class system and the sentiment of the mentioned entity,
#where the sentiment value ranges from -1 to 1, and -1 being very negative, 1 being very positive
#Additionally, extract all relevant relationships between identified entities.
#The relationships should follow the Wikipedia schema type.
#The output of a relationship should be in a form of a triple Head, Relationship, Tail, for example
#Peter, WORKS_AT, Hospital/n
# An example "St. Peter is located in Paris" should have an output with the following format
entity
St. Peter, person, 0.0
Paris, location, 0.0

relationships
St.Peter, LOCATED_IN, Paris\n"""

  messages = [
      {"role": "system", "content": system_message},
      {"role": "user", "content": instructions + text},
  ]

  # temperature=0: try to be as deterministic as possible
  response = openai.ChatCompletion.create(
      model="gpt-4",
      temperature=0,
      messages=messages,
  )

  return parse_entities_and_relationships(response.choices[0].message.content)

In [9]:
# Cypher statement that stores one paragraph of the video as a Section node,
# links it to the entities it mentions (with a sentiment on the relationship),
# and stores every extracted triple as
# (source)-[:RELATIONSHIP]->(:Relationship {type})-[:RELATIONSHIP]->(target),
# linked back to the section via MENTIONS_RELATIONSHIP.
#
# The section is MERGEd on (video, startTime) rather than CREATEd, so running
# this cell again (for example after an interrupted run) updates the existing
# sections instead of adding duplicates of them.
import_query = """
MERGE (v:Video {id:$videoId})
MERGE (v)-[:HAS_SECTION]->(p:Section {startTime: toFloat($start)})
SET p.endTime = toFloat($end),
    p.text = $text
FOREACH (e in $entities |
  MERGE (entity:Entity {name: e[0]})
  ON CREATE SET entity.type = e[1] 
  MERGE (p)-[:MENTIONS{sentiment:toFloat(e[2])}]->(entity))
WITH p
UNWIND $relationships AS relation
MERGE (source:Entity {name: relation[0]})
MERGE (target:Entity {name: relation[2]})
MERGE (source)-[:RELATIONSHIP]->(r:Relationship {type: relation[1]})-[:RELATIONSHIP]->(target)
MERGE (p)-[mr:MENTIONS_RELATIONSHIP]->(r)
"""

# Run the extraction for every paragraph and write the result to Neo4j.
with driver.session() as session:
  for i, paragraph in enumerate(paragraphs):
    print(f"Processing {i} paragraph")
    text = paragraph['text']
    start = paragraph['start_time']
    end = paragraph['end_time']
    entities, relationships = gpt4archeologist(text)
    params = {'videoId': video_id, 'start': start, 'end': end, 'text':text, 'entities': entities, 'relationships': relationships}
    # Consume the result right away so that a failing import is reported for
    # the paragraph that caused it, not on a later statement.
    session.run(import_query, params).consume()


Processing 0 paragraph
Processing 1 paragraph
Processing 2 paragraph
Processing 3 paragraph
Processing 4 paragraph
Processing 5 paragraph
Processing 6 paragraph
Processing 7 paragraph
Processing 8 paragraph
Processing 9 paragraph
Processing 10 paragraph
Processing 11 paragraph
Processing 12 paragraph
Processing 13 paragraph
Processing 14 paragraph
Processing 15 paragraph
Processing 16 paragraph
Processing 17 paragraph
Processing 18 paragraph
Processing 19 paragraph
Processing 20 paragraph
Processing 21 paragraph
Processing 22 paragraph
Processing 23 paragraph
Processing 24 paragraph


KeyboardInterrupt: ignored

# Analysis

In [None]:
import pandas as pd

# Show up to 150 characters per column so section texts stay readable
pd.set_option("display.max_colwidth", 150)

def run_query(query, params=None):
    """Run a Cypher query and return its records as a pandas DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional dict of query parameters (referenced as ``$name`` in
            the statement). Defaults to None, i.e. no parameters, which is the
            behaviour of the original single-argument form.

    Returns:
        A DataFrame with one row per returned record and one column per
        returned key.
    """
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [None]:
# The five entities with the most incoming MENTIONS relationships, with their type
run_query("""
MATCH (e:Entity)
RETURN e.name, e.type,
       count{(e)<-[:MENTIONS]-()} AS mentions
ORDER BY mentions DESC
LIMIT 5
""")

In [None]:
# The five entities of type "person" with the most incoming MENTIONS relationships
run_query("""
MATCH (e:Entity {type:"person"})
RETURN e.name, e.type,
       count{(e)<-[:MENTIONS]-()} AS mentions
ORDER BY mentions DESC
LIMIT 5
""")

In [None]:
# All extracted triples involving "Mary Anning": first the ones where she is the
# source, then the ones where she is the target. Triples are stored as
# (source)-[:RELATIONSHIP]->(r)-[:RELATIONSHIP]->(target), so the second branch
# walks both hops against the arrow direction. The first hop of that branch
# previously had arrowheads on both ends (`<-[...]->`), which does not express
# the intended incoming direction.
run_query("""
MATCH (e:Entity {name:"Mary Anning"})-[:RELATIONSHIP]->(r)-[:RELATIONSHIP]->(target)
RETURN e.name AS source, r.type AS relationship, target.name AS target
UNION ALL
MATCH (e:Entity {name:"Mary Anning"})<-[:RELATIONSHIP]-(r)<-[:RELATIONSHIP]-(source)
RETURN source.name AS source, r.type AS relationship, e.name AS target 
""")

In [None]:
# The five entity mentions with the lowest sentiment value, together with the
# text of the section that mentions the entity
run_query("""
MATCH (e:Entity)<-[m:MENTIONS]-(section:Section)
WITH e,section, m.sentiment AS sentiment
ORDER BY sentiment ASC
LIMIT 5
RETURN e.name AS entity, sentiment, section.text AS text
""")

In [None]:
# The five mentions of "person" entities with the lowest sentiment value,
# together with the text of the section that mentions the entity
run_query("""
MATCH (e:Entity {type:"person"})<-[m:MENTIONS]-(section:Section)
WITH e,section, m.sentiment AS sentiment
ORDER BY sentiment ASC
LIMIT 5
RETURN e.name AS entity, sentiment, section.text AS text
""")

In [None]:
# For every section that mentions the entity, return the section's start and end
# time and a YouTube URL whose `t` parameter is the start time converted to an integer
run_query("""
MATCH (e:Entity {name:"Ryo Tsuki Mutiny"})<-[:MENTIONS]-(s:Section)<-[:HAS_SECTION]-(v:Video)
RETURN s.startTime AS timestamp, s.endTime AS endTime, "https://youtube.com/watch?v=" + v.id + "&t=" + toString(toInteger(s.startTime)) AS URL
""")
