# Liquid News Backend V.1.0

Liquid News aims to help people better understand and interact with the news by providing a machine-learning-based analysis and semantic navigational aids. This will allow users to parse news via a semantic-relational model that leverages the latent connection between news segments. The hope is that this will uncover the relationships between topics covered across multiple news sources and promote a greater understanding of the news and media around us.

## Connect to Drive & Imports

This module connects to your Google Drive, which this Colab notebook uses for storage in V1.0 of Liquid News. It creates a ```LiquidNews``` directory in the root of your Drive (```MyDrive```) if one does not already exist.

In [1]:
# Mount Google Drive inside the Colab runtime and make sure the LiquidNews
# storage folder exists.
import os
from google.colab import drive

drive.mount('/content/drive')

# Folder in the mounted Drive that holds the LiquidNews data.
path = '/content/drive/MyDrive/LiquidNews'
if not os.path.isdir(path):
    os.mkdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install git+https://github.com/openai/whisper.git
! pip install pytube
! pip install openai
! pip install pysrt
! pip install --upgrade youtube_dl
! pip install vtt_to_srt3
! pip install pysrt
! pip install openai
! pip install moviepy
! pip install imageio==2.4.1
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

In [8]:
from pytube import YouTube
import pysrt
import youtube_dl
from datetime import datetime
import vtt_to_srt.vtt_to_srt
import pandas as pd
from os.path import exists
from random import randint
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import os
import fnmatch
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
from sklearn.cluster import KMeans
import numpy as np
import shutil
from pathlib import Path
import re
import whisper
import pickle
import uuid

## Initialize Whisper Transcription Model

The ```whisper_model``` variable is the OpenAI whisper model used to transcribe an audio file to text

In [15]:
# Load the "large" Whisper checkpoint once; this instance is the one passed to
# audio_to_caption by the pipeline.
whisper_model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.87G/2.87G [01:12<00:00, 42.8MiB/s]


## Video Download and Information Retrieval Module

The following three methods are used to download a video from Youtube, download the audio file for a respective Youtube video, and get the metadata associated with the video (i.e., name, video_id)

In [3]:
def get_video_information(video_link):
    """Return the ``(title, id)`` metadata of a video without downloading it.

    Args:
        video_link: URL of the video to inspect.

    Returns:
        A ``(title, id)`` tuple. Either element is ``None`` when the extractor
        does not report that field, and both are ``None`` when the extractor
        returns no metadata at all.
    """
    ydl_opts_info = {"skip_download": True}
    with youtube_dl.YoutubeDL(ydl_opts_info) as ydl:
        # A single metadata request serves both fields; requesting it once per
        # field doubled the network round trips.
        info = ydl.extract_info(video_link, download=False) or {}
    return info.get('title', None), info.get('id', None)

def download_video(video_link, output_path, skip_download=False):
    """Download a video (and its English subtitles) into ``output_path``.

    Args:
        video_link: URL of the video to fetch.
        output_path: Directory that receives the files; each file is named
            after the video id and the extension chosen by youtube_dl.
        skip_download: Forwarded to youtube_dl's ``skip_download`` option.
    """
    output_template = f"{output_path}/%(id)s.%(ext)s"
    options = dict(
        writesubtitles=True,
        writeautomaticsub=True,
        skip_download=skip_download,
        subtitleslangs=["en"],
        outtmpl=output_template,
    )
    with youtube_dl.YoutubeDL(options) as downloader:
        downloader.download([video_link])

def download_video_audio(video_link, output_path, skip_download=False):
    """Download the audio track of a video and convert it to MP3.

    Args:
        video_link: URL of the video whose audio is wanted.
        output_path: Directory that receives the file, named after the video
            id.
        skip_download: Forwarded to youtube_dl's ``skip_download`` option.
            Previously this argument was accepted but silently ignored.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        # Re-encode whatever audio stream was fetched to a 192 kbps MP3.
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        "skip_download": skip_download,
        "outtmpl": f"{output_path}/%(id)s.%(ext)s",
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_link])

## Transcription Module

The following methods leverage the ```whisper_model``` and the audio file associated with a video to generate audio transcription. The transcription is then converted into an SRT format used for video captioning.

In [4]:
def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    """Render a non-negative duration in seconds as ``[HH:]MM:SS<sep>mmm``.

    Args:
        seconds: Duration in seconds; rounded to the nearest millisecond.
        always_include_hours: Emit the ``HH:`` field even when it is zero.
        fractionalSeperator: Text placed between seconds and milliseconds.

    Returns:
        The formatted timestamp string.
    """
    assert seconds >= 0, "non-negative timestamp expected"

    # Work in whole milliseconds and peel off each unit in turn.
    total_ms = round(seconds * 1000.0)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)

    if always_include_hours or hours > 0:
        prefix = f"{hours:02d}:"
    else:
        prefix = ""
    return f"{prefix}{minutes:02d}:{whole_seconds:02d}{fractionalSeperator}{millis:03d}"

def write_srt(transcript, file):
    """Write transcript segments to ``file`` as numbered SRT cues.

    Args:
        transcript: Iterable of segment dicts with ``'start'``, ``'end'``
            and ``'text'`` entries (e.g. the ``"segments"`` list of a
            transcription result).
        file: Writable text stream that receives the cues.

    Each cue is numbered from 1, uses ``HH:MM:SS,mmm`` timestamps and is
    followed by a blank line. Any ``-->`` inside the caption text is replaced
    with ``->`` so it cannot be mistaken for the SRT time separator.
    """
    def srt_stamp(value):
        return format_timestamp(value, always_include_hours=True, fractionalSeperator=',')

    for cue_number, segment in enumerate(transcript, start=1):
        begins = srt_stamp(segment['start'])
        ends = srt_stamp(segment['end'])
        caption = segment['text'].strip().replace('-->', '->')
        cue = f"{cue_number}\n{begins} --> {ends}\n{caption}\n"
        print(cue, file=file, flush=True)

def audio_to_caption(audio_path, output_path, whisper_model):
    """Transcribe an audio file with the given Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.
        output_path: Unused; kept so existing callers keep working.
        whisper_model: Model object exposing ``transcribe(audio_path)``.

    Returns:
        Whatever ``whisper_model.transcribe`` returns for ``audio_path``.
    """
    # The audio used to be transcribed twice: a first pass (temperature=0.9)
    # whose result was thrown away, then this call whose result was returned.
    # Only the returned call is kept, halving the transcription time without
    # changing the output.
    return whisper_model.transcribe(audio_path)

## Transcription Tokenizer

The following ```caption_tokenizer``` method splits the transcription of a given video into tokens, each spanning a fixed number of sentences. The user sets the ```num_sentences_per_token``` parameter to choose the token size used for embedding.

In [80]:
def caption_tokenizer(segments, num_sentences_per_token, id, liquid_videos_data):
    """Group transcript segments into multi-sentence tokens.

    Segment texts are concatenated until ``num_sentences_per_token`` segments
    containing sentence-ending punctuation (``.``, ``?`` or ``!``) have been
    consumed. Each finished token is stored in
    ``liquid_videos_data[id]['caption_token_data']`` under the key
    ``(token_start, token_end)`` as ``{"text": token}``. The last segment
    always closes the token that is open at that point.

    Args:
        segments: Sequence of dicts with ``'start'``, ``'end'`` and ``'text'``.
        num_sentences_per_token: Number of sentence-ending segments per token.
        id: Key of the video inside ``liquid_videos_data``.
        liquid_videos_data: Dict that is updated in place.

    Returns:
        None. An empty ``segments`` sequence leaves the dict untouched
        (previously it raised ``IndexError``).
    """
    if not segments:
        return

    token_store = liquid_videos_data[id]['caption_token_data']
    sentence_endings = (".", "?", "!")

    token = ""
    sentences_left = num_sentences_per_token
    start = segments[0]['start']

    # The final segment is handled after the loop because closing a token
    # needs the start time of the *next* segment.
    for index, segment in enumerate(segments[:-1]):
        text = segment['text']
        token += text
        if not any(mark in text for mark in sentence_endings):
            continue
        if sentences_left == 1:
            token_store[(start, segment['end'])] = {"text": token}
            sentences_left = num_sentences_per_token
            start = segments[index + 1]['start']
            token = ""
        elif sentences_left > 1:
            sentences_left -= 1

    last_segment = segments[-1]
    token += last_segment['text']
    token_store[(start, last_segment['end'])] = {"text": token}

## Embedding Module

The following methods are used to get the embeddings for each token using the GPT-3 ```text-similarity-davinci-001``` model. 

In [94]:
import os

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

# SECURITY: never hard-code the API key in the notebook. Set the
# OPENAI_API_KEY environment variable before running this cell. The key that
# was previously committed here should be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_embedding(text: str, engine="text-similarity-davinci-001"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings API.

    The call is retried (up to 6 attempts, random exponential wait between
    1 and 60 seconds) by the ``retry`` decorator.

    Args:
        text: Text to embed.
        engine: Name of the embedding engine to query.
    """
    response = openai.Embedding.create(input=[text], engine=engine)
    # Exactly one input was sent, so the first data item is its embedding.
    first_item = response["data"][0]
    return first_item["embedding"]
def get_token_embedding(liquid_videos_data, id, num_sentences_per_token, directory_path):
    """Embed every caption token of a video and save the vectors to disk.

    Each token dict in ``liquid_videos_data[id]['caption_token_data']`` gets
    an ``'embedding'`` entry. The vectors are also written with
    ``np.savetxt`` to
    ``{directory_path}/embedding_tokensize_{num_sentences_per_token}/{id}_embedding.text``.

    Args:
        liquid_videos_data: Dict that is updated in place.
        id: Key of the video inside ``liquid_videos_data``.
        num_sentences_per_token: Token size; only used in the output folder
            name.
        directory_path: Base directory for the saved embeddings.

    Returns:
        The list of embeddings, in the iteration order of the token dict.
        (Previously the list was never filled, so an empty file was saved and
        ``[]`` was always returned.)
    """
    token_data = liquid_videos_data[id]['caption_token_data']
    embeddings = []
    for token in token_data.values():
        embedding = get_embedding(token['text'])
        token['embedding'] = embedding
        embeddings.append(embedding)

    output_file = f"{directory_path}/embedding_tokensize_{num_sentences_per_token}/{id}_embedding.text"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    np.savetxt(output_file, embeddings)
    return embeddings

## Clustering Module

The following methods are used to cluster the tokens and then determine a topic for each cluster.

In [17]:
def get_clusterings(liquid_videos_data, id, n_clusters):
    """Cluster a video's token embeddings with k-means and label each token.

    Tokens are processed in sorted key order. The label assigned by the
    fitted model is stored on each token dict as ``'cluster'``.

    Args:
        liquid_videos_data: Dict that is updated in place.
        id: Key of the video inside ``liquid_videos_data``.
        n_clusters: Number of clusters passed to ``KMeans``.
    """
    token_data = liquid_videos_data[id]['caption_token_data']
    ordered_keys = sorted(token_data.keys())
    vectors = [token_data[key]['embedding'] for key in ordered_keys]

    # Fixed random_state keeps the clustering reproducible between runs.
    model = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
    model.fit(vectors)

    for key, label in zip(ordered_keys, model.labels_):
        token_data[key]['cluster'] = label

In [11]:
def get_cluster_topic(liquid_videos_data, id):
    """Build a per-cluster summary text and ask the model for a topic label.

    The texts of all tokens sharing a cluster are concatenated into that
    cluster's ``'summary'``. A completion is then requested for each summary
    and stored as ``'topic'``, with surrounding whitespace removed and inner
    spaces replaced by underscores.

    Args:
        liquid_videos_data: Dict holding the caption tokens; each token must
            already carry a ``'cluster'`` entry.
        id: Key of the video inside ``liquid_videos_data``.

    Returns:
        Dict mapping cluster label to ``{'summary': str, 'topic': str}``.
    """
    cluster_data = {}
    for token in liquid_videos_data[id]['caption_token_data'].values():
        cluster = token['cluster']
        if cluster in cluster_data:
            cluster_data[cluster]['summary'] += token['text']
        else:
            cluster_data[cluster] = {'summary': token['text']}

    for entry in cluster_data.values():
        prompt = f"key idea of this paragraph in three words. \n {entry['summary']}"
        completion = openai.Completion.create(
            engine="text-davinci-002",
            prompt=prompt,
            temperature=0.7,
            max_tokens=32,
            top_p=0.9,
        )
        # Strip first so leading/trailing whitespace in the completion does
        # not end up inside the topic label.
        entry['topic'] = completion["choices"][0]["text"].strip().replace(" ", "_")
    return cluster_data

## Video Segmentation Module

The following module uses the cluster information from the cluster module to generate video clips for each cluster using the algorithm defined in the ```reduce_cluster_clip_windows``` method.

In [12]:
def reduce_cluster_clip_windows(windows, max_gap=30):
    """Merge neighbouring ``(start, end)`` windows into longer clips.

    Walking the windows in the given order, a window is merged into the
    current clip when the gap between its start and the current clip's end is
    at most ``max_gap``; a larger gap closes the current clip and opens a new
    one.

    Args:
        windows: Sequence of ``(start, end)`` pairs, expected in ascending
            order with ``start <= end``.
        max_gap: Largest gap (same unit as the window bounds) that is still
            bridged. Defaults to 30, the value that used to be hard-coded.

    Returns:
        List of ``(start, end)`` tuples. An empty input yields ``[]``
        (previously it raised ``IndexError``).
    """
    if not windows:
        return []

    clips = []
    start, end = windows[0][0], windows[0][1]
    for window in windows[1:]:
        if window[0] - end > max_gap:
            clips.append((start, end))
            start = window[0]
        end = window[1]
    # Whatever clip is still open when the input runs out is emitted here.
    clips.append((start, end))
    return clips

def get_cluster_clips_windows(liquid_videos_data, id):
    """Attach merged clip windows to every cluster of a video.

    Token keys are visited in sorted order and appended to the
    ``'clip_windows'`` list of the cluster each token belongs to. Every
    cluster's list is then replaced by the result of
    ``reduce_cluster_clip_windows``.

    Args:
        liquid_videos_data: Dict that is updated in place; the video entry
            must already contain ``'cluster_data'``.
        id: Key of the video inside ``liquid_videos_data``.
    """
    video_entry = liquid_videos_data[id]
    token_data = video_entry['caption_token_data']

    for window in sorted(token_data):
        owning_cluster = token_data[window]['cluster']
        cluster_entry = video_entry['cluster_data'][owning_cluster]
        cluster_entry.setdefault('clip_windows', []).append(window)

    for cluster_entry in video_entry['cluster_data'].values():
        cluster_entry['clip_windows'] = reduce_cluster_clip_windows(cluster_entry['clip_windows'])

In [13]:
def create_cluster_clips(liquid_videos_data, id, video_path, output_path):
    """Cut one video file per clip window of every cluster.

    Clips are written to ``{output_path}/{id}_{title}``, where ``title`` is
    the video title reduced to its alphabetic characters. Each clip file is
    named ``{cluster}_{topic}_{uuid}.mp4``, with the topic likewise reduced
    to alphabetic characters.

    Args:
        liquid_videos_data: Dict holding the video's metadata and cluster
            data (including ``'clip_windows'`` and ``'topic'``).
        id: Key of the video inside ``liquid_videos_data``.
        video_path: Path of the source video file.
        output_path: Base directory for the clip folders.
    """
    video_entry = liquid_videos_data[id]

    # Folder that collects every clip of this video.
    title = ''.join(filter(str.isalpha, video_entry['metadata']['title']))
    clips_ouput_path = f"{output_path}/{id}_{title}"
    if not os.path.exists(clips_ouput_path):
        os.makedirs(clips_ouput_path)

    # One file per (cluster, window) pair.
    for clustor, cluster_entry in video_entry['cluster_data'].items():
        print(cluster_entry['clip_windows'])
        for clip_window in cluster_entry['clip_windows']:
            topic_name = ''.join(filter(str.isalpha, cluster_entry['topic']))
            output_clip_path = f"{clips_ouput_path}/{clustor}_{topic_name}_{uuid.uuid1()}.mp4"
            if exists(output_clip_path):
                continue
            ffmpeg_extract_subclip(video_path, clip_window[0], clip_window[1], targetname=output_clip_path)

## Run Backend Pipeline

The following method takes in a list of YouTube video URLs, runs the entire Liquid News Backend V1.0 pipeline on them, and stores the clusterings it finds in your Drive.

In [None]:
def run_liquid_news_pipeline(video_list, run_name, num_sentences_per_token=2, n_clusters=2):
    """Run the full Liquid News backend on a list of video URLs.

    For every video the pipeline downloads the video and its audio,
    transcribes the audio, tokenizes the transcript, embeds and clusters the
    tokens, labels each cluster with a topic, and cuts one clip per cluster
    window. All artefacts are stored under
    ``/content/drive/MyDrive/LiquidNews/{run_name}``.

    Args:
        video_list: Iterable of video URLs.
        run_name: Name of the run folder inside the LiquidNews directory.
        num_sentences_per_token: Sentences per caption token (default 2, the
            value that used to be hard-coded).
        n_clusters: Number of k-means clusters per video (default 2, the
            value that used to be hard-coded).

    Returns:
        None. The accumulated data dict is pickled to
        ``liquid_videos_data.pkl`` in the run folder after each video.
    """
    run_dir = f'/content/drive/MyDrive/LiquidNews/{run_name}'

    # Create the run folders once, up front. Doing this with os.mkdir inside
    # the per-video loop raised FileExistsError on the second video and on
    # any re-run with the same run_name.
    for subfolder in ('videos', 'audios', 'transcriptions', 'embeddings', 'clustor_clips'):
        os.makedirs(f'{run_dir}/{subfolder}', exist_ok=True)

    liquid_videos_data = {}
    for video_url in video_list:
        # Download the video and audio, and retrieve the metadata
        download_video(video_url, f'{run_dir}/videos')
        download_video_audio(video_url, f'{run_dir}/audios')
        title, id = get_video_information(video_url)

        # Initialize children dictionaries & store metadata
        liquid_videos_data[id] = {
            "metadata": {"title": title},
            "caption_token_data": {},
        }

        # Transcribe audio and store it as an SRT file in the drive
        transcription = audio_to_caption(f'{run_dir}/audios/{id}.mp3', "", whisper_model)
        with open(f'{run_dir}/transcriptions/{id}.srt', "w", encoding="utf-8") as srt:
            write_srt(transcription["segments"], file=srt)

        # Tokenize the transcription (token size = num_sentences_per_token).
        # MODIFIES THE EXISTING liquid_videos_data VARIABLE
        caption_tokenizer(transcription["segments"], num_sentences_per_token, id, liquid_videos_data)

        # Get an embedding for each token and save the embeddings to drive.
        # MODIFIES THE EXISTING liquid_videos_data VARIABLE
        get_token_embedding(liquid_videos_data, id, num_sentences_per_token, f'{run_dir}/embeddings')

        # Cluster the embeddings and record each token's cluster.
        # MODIFIES THE EXISTING liquid_videos_data VARIABLE
        get_clusterings(liquid_videos_data, id, n_clusters)

        # Get the topic title (string) for each cluster and store it
        liquid_videos_data[id]['cluster_data'] = get_cluster_topic(liquid_videos_data, id)

        # Identify how to segment the video, then create a clip per segment.
        # NOTE(review): the video is assumed to have been saved as .mp4;
        # confirm youtube_dl always picks that container here.
        get_cluster_clips_windows(liquid_videos_data, id)
        create_cluster_clips(liquid_videos_data, id,
                             f'{run_dir}/videos/{id}.mp4',
                             f'{run_dir}/clustor_clips')

        # Save the data dict after every video; the context manager closes
        # the file even if pickling fails.
        dump_path = f'{run_dir}/liquid_videos_data.pkl'
        with open(dump_path, 'wb') as afile:
            pickle.dump(liquid_videos_data, afile)

# Example run: process one video and store the results in the "apple" run folder.
run_liquid_news_pipeline(['https://www.youtube.com/watch?v=HB2mGo_0tgA&ab_channel=ABCNews'], "apple")