In [3]:
import nltk
import pandas as pd
import re

from collections import Counter
from gensim import utils
from gensim.models import doc2vec
from nltk.corpus import stopwords
#from pydocumentdb import document_client
from pymongo import MongoClient

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/venug/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data loading

In [4]:
client = MongoClient('mongodb://localhost:27017/')
db = client['ted']
coll = db['talks']

# Exploratory data analysis

Transcript for the first talk.

In [6]:
all_talks = list(coll.find({}))
all_talks[0]['transcript']

[{'text': 'Visible from space,', 'time': 687},
 {'text': 'the Okavango Delta', 'time': 1866},
 {'text': "is Africa's largest remaining\nintact wetland wilderness.",
  'time': 3517},
 {'text': 'This shining delta in landlocked Botswana\nis the jewel of the Kalahari,',
  'time': 8255},
 {'text': "more valuable than diamonds\nto the world's largest diamond producer",
  'time': 14120},
 {'text': 'and celebrated in 2014', 'time': 18573},
 {'text': "as our planet's 1000th\nUNESCO World Heritage Site.",
  'time': 20986},
 {'text': 'Now, what you see here\nare the two major tributaries,',
  'time': 24495},
 {'text': 'the Cuito and the Cubango,', 'time': 27424},
 {'text': 'disappearing up north\ninto the little-known Angolan highlands.',
  'time': 29465},
 {'text': 'This is the largest undeveloped\nriver basin on the planet,',
  'time': 33615},
 {'text': 'spanning an area larger than California.', 'time': 37938},
 {'text': 'These vast, undeveloped Angolan\nwatersheds were frozen in time',
  'ti

Extract and join text only.

In [7]:
print(' '.join([x['text'].replace('\n', ' ') for x in all_talks[0]['transcript']]))

Visible from space, the Okavango Delta is Africa's largest remaining intact wetland wilderness. This shining delta in landlocked Botswana is the jewel of the Kalahari, more valuable than diamonds to the world's largest diamond producer and celebrated in 2014 as our planet's 1000th UNESCO World Heritage Site. Now, what you see here are the two major tributaries, the Cuito and the Cubango, disappearing up north into the little-known Angolan highlands. This is the largest undeveloped river basin on the planet, spanning an area larger than California. These vast, undeveloped Angolan watersheds were frozen in time by 27 years of civil war. In fact, Africa's largest tank battle since World War II was fought over a bridge crossing the Okavango's Cuito River. There on the right, disappearing off into the unknown, into the "Terra do fim do mundo" -- the land at the end of the earth, as it was known by the first Portuguese explorers. In 2001, at the age of 22, I took a job as head of housekeepin

## Top 20 lines

In [8]:
line_counter = Counter(line['text'] for talk in all_talks for line in talk.get('transcript', []))

In [9]:
line_counter.most_common(20)

[('(Laughter)', 3398),
 ('(Applause)', 2133),
 ('Thank you.', 820),
 ('Thank you very much.', 140),
 ('(Music)', 118),
 ('(Laughter) (Applause)', 52),
 ('Thank you. (Applause)', 50),
 ('Why?', 38),
 ('(Cheers)', 30),
 ('Right?', 29),
 ('No.', 25),
 ('Thank you so much.', 25),
 ('Yeah.', 23),
 ('For example,', 21),
 ('So --', 17),
 ("But here's the thing:", 17),
 ('Thanks.', 16),
 ('(Applause ends)', 16),
 ('OK.', 15),
 ('around the world.', 15)]

Extract all "comments" (in round brackets) and find the most common.

In [10]:
all_transcripts_joined = ' '.join(
    ' '.join(line['text'].replace('\n', ' ') for line in talk.get('transcript', []))
    for talk in all_talks
)

In [11]:
comment_counter = Counter(re.findall(r'(\(.+?\))', all_transcripts_joined))

In [12]:
comment_counter.most_common(20)

[('(Laughter)', 3864),
 ('(Applause)', 2366),
 ('(Music)', 135),
 ('(Video)', 132),
 ('(Audio)', 50),
 ('(Cheers)', 33),
 ('(Audience)', 26),
 ('(Laughs)', 22),
 ('(Singing)', 17),
 ('(Recording)', 17),
 ('(Applause ends)', 16),
 ('(English)', 14),
 ('(Guitar)', 11),
 ('(Arabic)', 10),
 ('(Music ends)', 9),
 ('(Clicking)', 8),
 ('(Cheering)', 7),
 ('(Sigh)', 7),
 ('(Ball squeaks)', 7),
 ('(Cheers and applause)', 6)]

# Data preparation

Extract all transcripts.

In [13]:
all_transcripts = [
    (talk['id'], ' '.join(line['text'].replace('\n', ' ') for line in talk.get('transcript', [])))
    for talk in all_talks
]

Filter out talks with no transcript.

In [14]:
all_transcripts = [(talk_id, transcript)
                   for talk_id, transcript in all_transcripts
                   if len(transcript) != 0]

In [15]:
sorted(all_transcripts, key=lambda x: len(x[1]))[:10]

[('1677', '(Mechanical noises) (Music) (Applause)'),
 ('2366',
  '(Guitar music starts) (Music ends) (Applause) (Distorted guitar music starts) (Music ends) (Applause) (Ambient/guitar music starts) (Music ends) (Applause)'),
 ('2242',
  'Isadora Duncan -- (Music) -- crazy, long-legged woman from San Francisco, got tired of this country, and she wanted to get out. Isadora was famous somewhere around 1908 for putting up a blue curtain, and she would stand with her hands over her solar plexus and she would wait, and she would wait, and then, she would move. (Music) Josh and I and Somi call this piece "The Red Circle and the Blue Curtain." Red circle. Blue curtain. But, this is not the beginning of the 20th century. This is a morning in Vancouver in 2015. (Music) (Singing) Come on, Josh! (Music) (Singing) Go! Are we there yet? I don\'t think so. Hey, yeah! (Music) What time is it? (Music) Where are we? Josh. Somi. Bill T. Josh. Somi. Bill T. (Applause) Yeah, yeah!'),
 ('2611',
  '(Music) I

Filter out music.

In [16]:
all_transcripts = [(talk_id, transcript)
                   for talk_id, transcript in all_transcripts
                   if '(Music)' not in transcript and '(Music ends)' not in transcript]

In [17]:
sorted(all_transcripts, key=lambda x: len(x[1]))[:10]

[('2701',
  '(Guitar) (Singing) Rollercoaster, carousel. Where the highs are heaven, but the lows, oh, they can be hell. You can grab the ring, you can ring that bell, when the ride is over, you can never tell. People tell you this one thing -- will make your life complete. So you, you give it everything you got and you wind up on the street. Then one day you wake up, and they tell you "you\'re a queen," but then you find that someone else is pulling on the strings. Rollercoaster, carousel. Where the highs are heaven, but the lows, oh, they can be hell. You can grab the ring, you can ring that bell, when the ride is over, you can never tell. The one you love, they love you -- oh yeah -- until the end of time. But lose your edge or lose your cool, they will drop you like a dime. Everyone is crowding \'round when fortune is your friend. When your luck is running out, you\'re all alone again. Rollercoaster, carousel. Where the highs are heaven but the lows, oh, they can be hell. You can g

Almost there...

In [18]:
all_transcripts = [(talk_id, transcript)
                   for talk_id, transcript in all_transcripts
                   if '♫' not in transcript]

In [19]:
sorted(all_transcripts, key=lambda x: len(x[1]))[:10]

[('2701',
  '(Guitar) (Singing) Rollercoaster, carousel. Where the highs are heaven, but the lows, oh, they can be hell. You can grab the ring, you can ring that bell, when the ride is over, you can never tell. People tell you this one thing -- will make your life complete. So you, you give it everything you got and you wind up on the street. Then one day you wake up, and they tell you "you\'re a queen," but then you find that someone else is pulling on the strings. Rollercoaster, carousel. Where the highs are heaven, but the lows, oh, they can be hell. You can grab the ring, you can ring that bell, when the ride is over, you can never tell. The one you love, they love you -- oh yeah -- until the end of time. But lose your edge or lose your cool, they will drop you like a dime. Everyone is crowding \'round when fortune is your friend. When your luck is running out, you\'re all alone again. Rollercoaster, carousel. Where the highs are heaven but the lows, oh, they can be hell. You can g

Remove all "comments".

In [20]:
comment_re = '|'.join([k.replace('(', '\(').replace(')', '\)') for k, v in comment_counter.most_common(20)])

In [21]:
comment_re

'\\(Laughter\\)|\\(Applause\\)|\\(Music\\)|\\(Video\\)|\\(Audio\\)|\\(Cheers\\)|\\(Audience\\)|\\(Laughs\\)|\\(Singing\\)|\\(Recording\\)|\\(Applause ends\\)|\\(English\\)|\\(Guitar\\)|\\(Arabic\\)|\\(Music ends\\)|\\(Clicking\\)|\\(Cheering\\)|\\(Sigh\\)|\\(Ball squeaks\\)|\\(Cheers and applause\\)'

In [22]:
all_transcripts = [
    (talk_id, re.sub(comment_re, ' ', transcript))
    for talk_id, transcript in all_transcripts
]

Define a function to extract words.

In [23]:
english_stopwords = set(stopwords.words('english'))

def extract_words(text):
    text = re.sub(r'[^A-Za-z/\'\-]', ' ', text)
    text = [utils.to_unicode(w)
            for w in text.lower().split()
            if w not in english_stopwords]
    return text

In [24]:
extract_words(all_transcripts[0][1])

['visible',
 'space',
 'okavango',
 'delta',
 "africa's",
 'largest',
 'remaining',
 'intact',
 'wetland',
 'wilderness',
 'shining',
 'delta',
 'landlocked',
 'botswana',
 'jewel',
 'kalahari',
 'valuable',
 'diamonds',
 "world's",
 'largest',
 'diamond',
 'producer',
 'celebrated',
 "planet's",
 'th',
 'unesco',
 'world',
 'heritage',
 'site',
 'see',
 'two',
 'major',
 'tributaries',
 'cuito',
 'cubango',
 'disappearing',
 'north',
 'little-known',
 'angolan',
 'highlands',
 'largest',
 'undeveloped',
 'river',
 'basin',
 'planet',
 'spanning',
 'area',
 'larger',
 'california',
 'vast',
 'undeveloped',
 'angolan',
 'watersheds',
 'frozen',
 'time',
 'years',
 'civil',
 'war',
 'fact',
 "africa's",
 'largest',
 'tank',
 'battle',
 'since',
 'world',
 'war',
 'ii',
 'fought',
 'bridge',
 'crossing',
 "okavango's",
 'cuito',
 'river',
 'right',
 'disappearing',
 'unknown',
 'terra',
 'fim',
 'mundo',
 '--',
 'land',
 'end',
 'earth',
 'known',
 'first',
 'portuguese',
 'explorers',
 '

# Modelling

Create a list of `TaggedDocument`s.

In [25]:
tagged_docs = [
    doc2vec.TaggedDocument(extract_words(transcript), [talk_id])
    for talk_id, transcript in all_transcripts
]

In [26]:
tagged_docs[0]

TaggedDocument(words=['visible', 'space', 'okavango', 'delta', "africa's", 'largest', 'remaining', 'intact', 'wetland', 'wilderness', 'shining', 'delta', 'landlocked', 'botswana', 'jewel', 'kalahari', 'valuable', 'diamonds', "world's", 'largest', 'diamond', 'producer', 'celebrated', "planet's", 'th', 'unesco', 'world', 'heritage', 'site', 'see', 'two', 'major', 'tributaries', 'cuito', 'cubango', 'disappearing', 'north', 'little-known', 'angolan', 'highlands', 'largest', 'undeveloped', 'river', 'basin', 'planet', 'spanning', 'area', 'larger', 'california', 'vast', 'undeveloped', 'angolan', 'watersheds', 'frozen', 'time', 'years', 'civil', 'war', 'fact', "africa's", 'largest', 'tank', 'battle', 'since', 'world', 'war', 'ii', 'fought', 'bridge', 'crossing', "okavango's", 'cuito', 'river', 'right', 'disappearing', 'unknown', 'terra', 'fim', 'mundo', '--', 'land', 'end', 'earth', 'known', 'first', 'portuguese', 'explorers', 'age', 'took', 'job', 'head', 'housekeeping', 'vundumtiki', 'camp',

Define and train a Doc2Vec model.

In [27]:
model = doc2vec.Doc2Vec(
    documents=tagged_docs,
    epochs=100,
    min_count=2,
    seed=42,
    vector_size=100,
    window=10,
    workers=8
)

Define a function to retrieve the title of a talk.

In [28]:
get_title = lambda talk_id: next(x['title'] for x in all_talks if x['id'] == str(talk_id))

Define a function to retrieve the `n` most similar talks to a given talk.

In [29]:
def most_similar(talk_id, n=10):
    return pd.DataFrame([
        (similar_id, get_title(similar_id), similarity)
        for similar_id, similarity in model.docvecs.most_similar([str(talk_id)], topn=n)
    ], columns=['id', 'title', 'similarity'])

In [33]:
get_title('19330')

"How we're saving one of Earth's last wild places"

In [34]:
most_similar('19330')

Unnamed: 0,id,title,similarity
0,14485,The discoveries awaiting us in the ocean's twi...,0.461475
1,2673,Help discover ancient ruins -- before it's too...,0.46028
2,2127,Humble plants that hide surprising secrets,0.451526
3,2359,"Deep under the Earth's surface, discovering be...",0.450523
4,12370,Why I choose humanism over faith,0.434969
5,2088,The magic of the Amazon: A river that flows in...,0.430794
6,2334,Stunning photos of the endangered Everglades,0.428524
7,2663,"To solve old problems, study new species",0.424646
8,2872,The fascinating secret lives of giant clams,0.42098
9,2367,The coolest animal you know nothing about ... ...,0.4197


## Storing model results

Define a function to compute all pairwise similarities for a given talk.

In [35]:
talk_ids = [talk_id for talk_id, _ in all_transcripts]

def similarities(talk_id):
    distances = model.docvecs.distances(talk_id)
    return [
        (id_, float(similarity))
        for id_, similarity in zip(talk_ids, 1 - distances)
        if id_ != talk_id
    ]

In [36]:
similarities('19330')[:10]

[('19756', 0.16095280647277832),
 ('16782', 0.19947850704193115),
 ('19461', 0.3286566734313965),
 ('17697', 0.18824529647827148),
 ('17711', -0.048494815826416016),
 ('17922', 0.07260465621948242),
 ('17846', 0.0731816291809082),
 ('17713', 0.21339309215545654),
 ('5451', 0.07169729471206665),
 ('17851', 0.23655039072036743)]

Create a list of documents to upload to Cosmos DB.

In [37]:
similarity_docs = [
    {
        'id': talk_id,
        'similarities': [{
            'other_id': other_talk_id,
            'similarity': similarity,
        } for other_talk_id, similarity in similarities(talk_id)],
    }
    for talk_id in talk_ids
]

Create a new Cosmos DB collection for the documents.

In [42]:
dbcoll = db.list_collection_names()
if 'similarities' in dbcoll:
    db.similarities.drop()
#db.createcollection("similarities")
#similarities_coll = client.CreateCollection(db['_self'], {'id': 'similarities'})


Upload documents to Mongo DB.

In [43]:
for doc in similarity_docs:
    db['similarities'].insert_one(dict(doc))
    #client.CreateDocument(similarities_coll['_self'], doc)