In [60]:
import requests
from urllib.parse import urlparse
from urllib.parse import parse_qs
import random
from io import StringIO
import re
import json
from typing import List, Dict

import webvtt


# Endpoint templates and sample links used throughout this notebook.
# Templates containing {placeholders} are meant to be filled in with str.format().

# Mediator media-item endpoint; takes a language code and a "lank" (the media key
# that also appears as the `lank` query parameter of the sample share link below).
API_MEDIATOR = 'https://b.jw-cdn.org/apis/mediator/v1/media-items/{lang_code}/{lank}?clientType=www'
# GETPUBMEDIALINKS endpoint addressed by document id (track is fixed to 1 in this template).
API_DOCID = 'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?output=json&fileformat=mp4&alllangs=1&langwritten={lang_code}&txtCMSLang={lang_code}&docid={docid}&track=1'
# GETPUBMEDIALINKS endpoint addressed by publication symbol and track.
API_PUB =   'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?output=json&fileformat=mp4&alllangs=1&langwritten={lang_code}&txtCMSLang={lang_code}&pub={pub}&track={track}'
# NOTE(review): not referenced in the visible cells; name does not follow the UPPER_CASE convention of its neighbours.
wol_link_api_url = "https://b.jw-cdn.org/apis/wol-link"
# Mediator category endpoint; takes a language code and a category key.
API_MEDIATOR_CAT = 'https://b.jw-cdn.org/apis/mediator/v1/categories/{lang_code}/{category}?detailed=1&clientType=www'
# Fixed example URL (no placeholders); not referenced in the visible cells.
_GET_CATEGORY = 'https://www.jw.org/es/biblioteca/videos/?item=pub-jwb-082_1_VIDEO&appLanguage=S'
# Search endpoint; not referenced in the visible cells.
SEARCH = 'https://data.jw-api.org/search/query'
# Sample "finder" share link: language in `wtlocale`, media key in `lank`.
SAMPLE_URL = 'https://www.jw.org/finder?srcid=jwlshare&wtlocale=S&lank=pub-jwb-082_1_VIDEO'
# Sample video-library link: locale and lank are carried in the URL fragment.
SAMPLE_URL2 = 'https://www.jw.org/es/biblioteca/videos/#es/mediaitems/StudioMonthlyPrograms/pub-jwb-082_1_VIDEO'
# Markdown-formatted link to the video section (Spanish text).
SECCION_DE_VIDEOS = '[sección de videos](https://www.jw.org/es/biblioteca/videos)'
# Endpoint queried below to build the language-code -> vernacular-name cache.
URL_LANGUAGES = 'https://data.jw-api.org/mediator/v1/languages/S/all'



In [56]:
# Languages
# Download the language catalogue and keep a {code: vernacular name} mapping,
# cached on disk as languages.json and loaded back into `all_langs`.
languages = requests.get(URL_LANGUAGES).json()['languages']
custom = {lang['code']: lang['vernacular'] for lang in languages}

# The names are written un-escaped (ensure_ascii=False), so the file encoding is
# pinned to UTF-8 instead of relying on the platform default; `with` guarantees
# the handle is flushed and closed before the file is read back.
with open('languages.json', 'w', encoding='utf-8') as languages_file:
    json.dump(custom, languages_file, ensure_ascii=False, indent=2)

with open('languages.json', encoding='utf-8') as languages_file:
    all_langs = json.load(languages_file)

In [63]:
class JWVideo:
    """Plain record describing one downloadable rendition of a video.

    Every field is optional and defaults to ``None``; the constructor simply
    stores the values it is given on the instance.
    """

    def __init__(
            self,
            title: str = None,
            url_vtt: str = None,
            url_video: str = None,
            url_image: str = None,
            duration: float = None,
            label: str = None,
            subtitled: bool = None,
            pub: str = None,
            track: int = None
            ):
        # Descriptive fields.
        self.title, self.label, self.duration = title, label, duration
        # Resource locations.
        self.url_vtt, self.url_video, self.url_image = url_vtt, url_video, url_image
        # Subtitle flag and publication coordinates.
        self.subtitled, self.pub, self.track = subtitled, pub, track

    def __str__(self):
        """Return a one-line summary; the pub is appended only when it is truthy."""
        summary = f'JWVideo {self.title!r}, label: {self.label!r}, subtitled: {self.subtitled!r}'
        if self.pub:
            summary += f', pub: {self.pub!r}'
        return summary


class Media:
    """Base class that lazily fetches and caches the JSON document of an API URL.

    The class itself does not define ``url_api``; the ``data`` getter reads
    ``self.url_api``, so subclasses have to provide it.
    """

    def __init__(self):
        self._data = None

    @property
    def data(self):
        """Return the cached JSON document, downloading it on first access."""
        if self._data is None:
            # The assignment goes through the setter below.
            self.data = requests.get(self.url_api).json()
        return self._data

    @data.setter
    def data(self, value):
        # A truthy cached value is never replaced; only an empty/None cache
        # accepts the new value.
        if not self._data:
            self._data = value


class MediaLank(Media):
    """Media item looked up through the mediator endpoint by language code and lank.

    Args:
        lang_code: language code inserted into the API URL.
        lank: media key inserted into the API URL.
    """

    def __init__(self, lang_code, lank):
        # Let the base class initialise the lazy-download cache.
        super().__init__()
        self._lang_code = lang_code
        self._lank = lank

    @property
    def url_api(self) -> str:
        """URL of the mediator endpoint for this language/lank pair."""
        return API_MEDIATOR.format(lang_code=self._lang_code, lank=self._lank)

    @property
    def jwvideos(self) -> List[JWVideo]:
        """One ``JWVideo`` per entry in the first media item's ``files`` list.

        Title and image are taken from the media item and therefore shared by
        all renditions. ``url_vtt`` is ``None`` for a file that carries no
        subtitles entry (other code in this notebook treats subtitles as
        optional), instead of raising ``KeyError``.
        """
        media_item = self.data['media'][0]
        return [
            JWVideo(
                title=media_item['title'],
                # 'subtitles' may be absent or empty for a given file.
                url_vtt=(file.get('subtitles') or {}).get('url'),
                url_video=file['progressiveDownloadURL'],
                url_image=media_item['images']['lss']['lg'],
                duration=file['duration'],
                label=file['label'],
                subtitled=file['subtitled']
            )   for file in media_item['files']
        ]

    @property
    def available_languages(self) -> Dict[str, str]:
        """Map each available language code to its name in the ``all_langs`` cache.

        Raises:
            KeyError: if a reported code is missing from ``all_langs``.
        """
        return {code: all_langs[code] for code in self.data['media'][0]['availableLanguages']}



class MediaPub(Media):
    """Media looked up through the pub-media endpoint by publication symbol and track.

    Args:
        lang_code: language code inserted into the API URL; also used as the
            key into the ``files`` section of the response.
        pub: publication symbol inserted into the API URL.
        track: track value inserted into the API URL (empty string by default).
    """

    def __init__(self, lang_code, pub, track=''):
        self._data = None
        self._lang_code = lang_code
        self._pub = pub
        self._track = track

    @property
    def url_api(self) -> str:
        """URL of the pub-media endpoint for this language/pub/track."""
        params = dict(lang_code=self._lang_code, pub=self._pub, track=self._track)
        return API_PUB.format(**params)

    @property
    def jwvideos(self) -> List[JWVideo]:
        """One ``JWVideo`` per MP4 entry listed for this language.

        ``url_vtt`` is ``None`` when an entry has no ``subtitles`` key or no
        ``url`` inside it; ``track`` is ``None`` when the entry has no track.
        """
        videos = []
        for entry in self.data['files'][self._lang_code]['MP4']:
            videos.append(
                JWVideo(
                    title=entry['title'],
                    url_vtt=entry.get('subtitles', {}).get('url'),
                    url_video=entry['file']['url'],
                    url_image=entry['trackImage']['url'],
                    duration=entry['duration'],
                    label=entry['label'],
                    subtitled=entry['subtitled'],
                    pub=self.data['pub'],
                    track=entry.get('track'),
                )
            )
        return videos

    @property
    def available_languages(self) -> Dict[str, str]:
        """Map each language code in the response to its ``name`` field."""
        names = {}
        for code, info in self.data['languages'].items():
            names[code] = info['name']
        return names
        


# https://www.jw.org/finder?srcid=share&wtlocale=S&lank=pub-jwbcov_201705_5_VIDEO
# Example: look up publication 'apv' with language code 'S' through the pub-media
# endpoint and display its {language code: name} mapping (the cell's last
# expression is rendered as the output).
media = MediaPub('S', 'apv')
media.available_languages



{'S': 'español',
 'E': 'inglés',
 'ABB': 'abé',
 'ABK': 'abjasio',
 'AU': 'abua',
 'ACH': 'achí',
 'AC': 'acholi',
 'AF': 'afrikáans',
 'AHN': 'ahanta',
 'AP': 'aimara',
 'AKA': 'akha',
 'AL': 'albanés',
 'X': 'alemán',
 'XSW': 'alemán suizo',
 'ALT': 'altaico',
 'ALU': 'aluro',
 'AM': 'amárico',
 'AI': 'amis',
 'AMG': 'amuzgo de Guerrero',
 'A': 'árabe',
 'AEY': 'árabe de Egipto',
 'ASN': 'árabe sudanés',
 'REA': 'armenio',
 'R': 'armenio occidental',
 'KRB': 'armenio oriental',
 'AE': 'asamés',
 'AHK': 'asháninka',
 'IE': 'ateso',
 'ATI': 'atié',
 'AKN': 'aucano',
 'AJG': 'ayá',
 'AJR': 'azerí',
 'LWX': 'bajo alemán',
 'BAK': 'bashkir',
 'BS': 'bassa de Camerún',
 'BA': 'bassa de Liberia',
 'BTK': 'batak simalunguno',
 'AK': 'batako karo',
 'BT': 'batako toba',
 'AO': 'baulé',
 'BI': 'becol',
 'BE': 'bengalí',
 'ET': 'bete',
 'IK': 'biak',
 'BU': 'birmano',
 'LM': 'bislama',
 'BO': 'boulou',
 'BL': 'búlgaro',
 'KBV': 'caboverdés',
 'CQ': 'cakchiquel occidental',
 'CB': 'camboyano',
 

In [65]:
# Example: the same kind of lookup through the mediator endpoint, addressed by lank.
# Here the names come from the `all_langs` cache, i.e. the vernacular names.
lank = MediaLank('S', 'pub-jwbcov_201705_5_VIDEO')
lank.available_languages

{'A': 'العربية',
 'ABB': 'Abɛ',
 'ABK': 'аԥсуа',
 'AC': 'Acholi',
 'ACH': 'achi',
 'AF': 'Afrikaans',
 'AH': 'Kachin',
 'AHN': 'Aɣɩnda',
 'AJG': 'Aja',
 'AJR': 'Azərbaycan',
 'AK': 'Batak (Karo)',
 'AKA': 'Akha',
 'AKN': 'Okanisitongo',
 'AL': 'shqip',
 'ALU': 'Alur',
 'AM': 'አማርኛ',
 'AMG': 'ñoomndaa',
 'AN': 'català',
 'AO': 'Wawle',
 'AP': 'Aymara',
 'AR': 'Bamanankan',
 'ASL': 'American Sign Language',
 'ATI': 'Akie',
 'AU': 'Abua',
 'AUS': 'Auslan (Australian Sign Language)',
 'AW': 'ciTshwa',
 'AZ': 'қазақ',
 'B': 'čeština',
 'BA': 'Ɓǎsɔ́ɔ̀ (Ɖàbíɖà)',
 'BE': 'বাংলা',
 'BI': 'Bicol',
 'BIM': 'Bahasa Isyarat Malaysia',
 'BL': 'български',
 'BM': 'Dschang',
 'BO': 'Bulu',
 'BQ': 'Euskara',
 'BS': 'Basaa (Kamerun)',
 'BSL': 'British Sign Language',
 'BT': 'Batak (Toba)',
 'BU': 'မြန်မာ',
 'BVL': 'lengua de señas boliviana',
 'BZK': 'Bileez Kriol',
 'C': 'hrvatski',
 'CA': 'Shona',
 'CB': 'ខ្មែរ',
 'CBS': 'lenguaje de señas cubano',
 'CE': 'Kreol Morisien',
 'CG': 'Chitonga',
 'CGM

In [None]:

class Subtitles:
    """Locate, download and transcribe the subtitles of a jw.org video.

    The video is identified either by a jw.org link (``jwurl``) or by explicit
    identifiers (``lang_code`` together with ``lank`` or ``docid``).

    Attributes set by the constructor:
        url, lang_code, lank, docid, pub, track: the identifiers in use.
        title: title reported by the API (``None`` until a lookup succeeds).
        data: raw JSON document of the last successful lookup (or ``None``).
        url_subtitles: URL of the subtitle file, or ``None`` if no media was found.
        text_subtitles: raw subtitle text, or ``None`` without a subtitle URL.
        text_transcription: plain-text transcription, or ``None`` likewise.
        photo: always initialised to ``None``.

    NOTE(review): several names used below are not defined in the visible part
    of this notebook (``LANGUAGES``, ``API_PUBMEDIA``, ``PubMediaNotFound``,
    ``SubtitleNotFound``, ``logger``, ``mechanicalsoup``). They must exist
    before the methods that use them can run; ``API_PUBMEDIA`` takes the same
    format arguments as ``API_DOCID`` -- confirm whether that constant was meant.
    """

    def __init__(self, jwurl=None, lang_code=None, lank=None, docid=None, pub=None, track=None):
        """Resolve the identifiers, then fetch the subtitles and build the transcription.

        When ``jwurl`` is given, the identifiers parsed from it replace the
        explicit keyword arguments. ``pub`` and ``track`` are stored, but no
        lookup in this class uses them yet.
        """
        # parse_jwurl() reads self.url, so it has to be stored before the call.
        self.url = jwurl
        if jwurl:
            lang_code, lank, docid, pub, track = self.parse_jwurl()

        # The lookup helpers read the identifiers from the instance.
        self.lang_code = lang_code
        self.lank = lank
        self.docid = docid
        self.pub = pub
        self.track = track
        self.title = None
        self.data = None

        self.url_subtitles = self.get_url_subtitles()
        if self.url_subtitles:
            self.text_subtitles = self.get_text_subtitles(self.url_subtitles)
            self.text_transcription = self.get_text_transcription(self.text_subtitles, self.title)
        else:
            # get_url_subtitles() returns None when no media was found.
            self.text_subtitles = None
            self.text_transcription = None
        self.photo = None

    def parse_jwurl(self):
        """Extract the video identifiers from ``self.url``.

        Three link shapes are recognised: ``/finder`` share links (language in
        ``wtlocale``, key in ``lank`` or ``docid``), links whose fragment has
        the form ``<locale>/mediaitems/<category>/<lank>``, and any other
        jw.org link, from which only the locale (first path segment) is taken.
        For the last two, the locale is also stored in ``self.locale``.

        Returns:
            tuple: ``(lang_code, lank, docid, pub, track)``. Values that the
            link does not carry are ``None``, except ``lank``/``docid`` of a
            finder link, which default to ``''``.

        Raises:
            KeyError: a finder link without ``wtlocale``.
            ValueError: the link is not recognised, a mediaitems fragment does
                not have exactly four segments, or the locale is unknown.
        """
        up = urlparse(self.url)
        pq = parse_qs(up.query)
        lang_code = lank = docid = pub = track = None
        if up.path == '/finder':
            lang_code = pq['wtlocale'][0]
            lank = pq.get('lank', [''])[0]
            docid = pq.get('docid', [''])[0]
        elif '/mediaitems/' in up.fragment:
            locale, _, _, lank = up.fragment.split('/')
            self.locale = locale
            lang_code = self.get_lang_code(locale)
        elif 'jw.org' in up.netloc:
            self.locale = up.path.split('/')[1]
            lang_code = self.get_lang_code(self.locale)
        else:
            raise ValueError(f'{self.url!r} no es un enlace válido')
        return lang_code, lank, docid, pub, track

    @staticmethod
    def get_lang_code(locale):
        """Return the ``code`` of the ``LANGUAGES`` entry whose ``locale`` matches.

        Raises:
            ValueError: no entry has that locale.
        """
        # NOTE(review): LANGUAGES is not defined in the visible cells.
        for language in LANGUAGES:
            if language['locale'] == locale:
                return language['code']
        raise ValueError(f'No existe el idioma locale={locale}')

    def get_language_name(self):
        """Return the ``name`` of the ``LANGUAGES`` entry matching ``self.lang_code``.

        Raises:
            ValueError: no entry has that code.
        """
        for language in LANGUAGES:
            if language['code'] == self.lang_code:
                return language['name']
        raise ValueError(f'No existe el idioma code={self.lang_code}')

    def get_language_vernacular(self):
        """Return the capitalised ``vernacular`` name matching ``self.lang_code``.

        Raises:
            ValueError: no entry has that code.
        """
        for language in LANGUAGES:
            if language['code'] == self.lang_code:
                return language['vernacular'].capitalize()
        raise ValueError(f'No existe el idioma code={self.lang_code}')

    def generate_jwurl(self):
        """Build a finder share link from ``lang_code`` and ``lank`` (or ``docid``).

        ``lank`` is used when it is truthy, otherwise ``docid``.
        """
        base = f'https://www.jw.org/finder?srcid=share&wtlocale={self.lang_code}'
        # The conditional is evaluated on its own so the base URL is always
        # included; a conditional expression binds looser than `+`.
        query = f'&lank={self.lank}' if self.lank else f'&docid={self.docid}'
        return base + query

    def _subs_from_json(self):
        """Query the JSON API and return the URL of the first subtitle file found.

        Also stores the response in ``self.data`` and the title in ``self.title``.

        Raises:
            PubMediaNotFound: neither ``lank`` nor ``docid`` is set, or the
                response lists no media.
            SubtitleNotFound: none of the files has subtitles.
        """
        if self.lank:
            url = API_MEDIATOR.format(lang_code=self.lang_code, lank=self.lank)
        elif self.docid:
            # NOTE(review): API_PUBMEDIA is not defined in the visible cells.
            url = API_PUBMEDIA.format(lang_code=self.lang_code, docid=self.docid)
        else:
            raise PubMediaNotFound
        data = requests.get(url).json()
        if not data['media']:
            raise PubMediaNotFound
        logger.info('[%s %s %s](%s)', self.lang_code, self.lank, self.docid, url)
        # Kept on the instance because the image/language helpers read self.data.
        self.data = data
        self.title = data['media'][0]['title']
        for file in data['media'][0]['files']:
            if file.get('subtitles'):
                return file['subtitles']['url']
        raise SubtitleNotFound

    def _subs_from_scrappy(self):
        """Unfinished scraping fallback: opens ``self.url`` in a browser session.

        NOTE(review): returns nothing and is not called by any method of this
        class; ``mechanicalsoup`` is not imported in the visible cells.
        """
        browser = mechanicalsoup.StatefulBrowser(user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36')
        response = browser.open(self.url)
        response = browser.page.find

    def get_url_subtitles(self):
        """Return the subtitle URL, or ``None`` when no media could be found.

        ``SubtitleNotFound`` (media exists but has no subtitles) propagates.
        """
        try:
            return self._subs_from_json()
        except PubMediaNotFound:
            # Best effort: there is no working fallback yet, so report "nothing found".
            return None

    def get_any_image(self):
        """Return a randomly chosen image URL of the first media item.

        Raises:
            IndexError: the media item lists no images.
        """
        images = [
            url
            for sizes in self.data['media'][0]['images'].values()
            for url in sizes.values()
        ]
        return random.choice(images)

    def get_image(self):
        """Return the ``pnr``/``lg`` image URL, falling back to any available image."""
        try:
            return self.data['media'][0]['images']['pnr']['lg']
        except KeyError:
            return self.get_any_image()

    def get_availables_langs(self):
        """Return the ``availableLanguages`` value of the first media item."""
        return self.data['media'][0]['availableLanguages']

    @staticmethod
    def get_text_subtitles(url_subs):
        """Download ``url_subs`` and return its body decoded with the default codec (UTF-8)."""
        return requests.get(url_subs).content.decode()

    @staticmethod
    def get_text_transcription(subtitles='', title=''):
        """Turn WebVTT subtitle text into a plain-text transcription.

        Each caption becomes one line (inner line breaks replaced by spaces); a
        blank line is inserted after every caption that ends with a period. A
        non-empty ``title`` is placed first, followed by two blank lines.
        """
        transcription = title + '\n\n\n' if title else ''
        for caption in webvtt.read_buffer(StringIO(subtitles)):
            transcription += caption.text.replace('\n', ' ') + '\n'
            if caption.text.strip().endswith('.'):
                transcription += '\n'
        return transcription
