In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text

In [2]:
cleaned_df = pd.read_csv('cleaned data/vaishak_data.csv')

In [3]:
cleaned_df.shape

(4016, 12)

# Recommendation using transcript

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [4]:
Text=cleaned_df['transcript'].tolist()
tfidf=text.TfidfVectorizer(input=Text,stop_words="english")
matrix=tfidf.fit_transform(Text)
print(matrix.shape)

(4016, 68448)


## Cosine Similarity

### Unigrams

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
sim_unigram=cosine_similarity(matrix)
sim_unigram.shape

(4016, 4016)

In [6]:
#https://www.kaggle.com/code/gunnvant/building-content-recommender-tutorial
#finding the most similar 5 ted talks using the correlation matrix of words created using cosine similarity
def get_similar_articles(x):
    return "-,".join((cleaned_df['talk']).loc[x.argsort()[-6:-1]])
cleaned_df['most_similar_transcript_unigrams']=[get_similar_articles(x) for x in sim_unigram]

In [7]:
cleaned_df.head(5)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,Can clouds buy us more time to solve climate c...
1,The exploitation of US college athletes,Tim Nevius,College sports lawyer,Tim Nevius is a leading sports lawyer and coll...,Colleges and universities in the US make billi...,TEDxDayton,438573,611,"['sports', 'law', 'education', 'United States'...",2020-11-10T00:00:00.000+00:00,1617634131,"In college sports, American universities are e...","The ""opportunity gap"" in US public education -..."
2,How does ultrasound work?,Jacques Abramowicz,Unknown,Unknown,"In a dark cave, bats can't see much. But even ...",TED-Ed,140874,295,"['education', 'technology', 'animation', 'TED-...",2021-04-01T00:00:00.000+00:00,1617290223,"In a pitch-black cave, bats can’t see much. Bu...","The sonic boom problem-,How LIGO discovered gr..."
3,"An honest history of an ancient and ""nasty"" word",Kate Lister,Sex historian,Kate Lister is a sex historian and lecturers a...,"With candor and cunning, sex historian Kate Li...",TEDxUniversityofGlasgow,569477,1148,"['sex', 'language', 'ancient world', 'history'...",2020-03-06T00:00:00.000+00:00,1617289565,"First, a warning. As far as offensive words go...","Go ahead, make up new words!-,Beautiful new wo..."
4,The electrical blueprints that orchestrate life,Michael Levin,Bioelectric explorer,Michael Levin's research could give rise to ad...,DNA isn't the only builder in the biological w...,TED2020,122682,1176,"['biology', 'science', 'invention', 'robots', ...",2020-05-18T00:00:00.000+00:00,1617213773,"Chris Anderson: Mike, welcome. It's good to se...","Printing a human kidney-,The next software rev..."


In [8]:
example_talk = cleaned_df.iloc[0,0]
example_tags = cleaned_df.iloc[0,8]
example_recommendations_unigram = cleaned_df.iloc[0,12].split('-,')
print(example_recommendations_unigram)

['Can clouds buy us more time to solve climate change?', '100 solutions to reverse global warming', "How we could change the planet's climate future", 'Emergency medicine for our climate fever', 'Why I must speak out about climate change']


In [45]:
example_tags = cleaned_df.iloc[0,8]
result = [''.join(c for c in word if c.isalpha()) for word in example_tags.split(',')]
result

['education', 'psychology', 'animation', 'TEDEd', 'brain', 'climatechange']

In [43]:
print("Example : Recommendation using Transcript\n")
print("Talk name ->",example_talk)
print("Talk tags ->",example_tags)
print()
listA = [''.join(c for c in word if c.isalpha()) for word in example_tags.split(',')]
setA = set(listA)
print("Recommendations Unigram : ")
i = 0
print(setA)
for each in example_recommendations_unigram[::-1]:
    tagsB = cleaned_df['tags'][cleaned_df.talk==each].values
#     print(tagsB)
    listB = [''.join(c for c in word if c.isalpha()) for word in tagsB[0].split()]
    setB = set(listB)
    overlap = setA & setB
    universe = setA | setB

    result1 = round(float(len(overlap)) / len(setA) * 100,1)
    result2 = round(float(len(overlap)) / len(setB) * 100,1)
    result3 = round(float(len(overlap)) / len(universe) * 100,1)
    print(each)
#     print(result1, result2, result3)

  
# print(each,'->\t', cleaned_df['tags'][cleaned_df.talk==each].values,'\n')
print()

Example : Recommendation using Transcript

Talk name -> An election system that puts voters (not politicians) first
Talk tags -> ['education', 'psychology', 'animation', 'TED-Ed', 'brain', 'climate change']

Recommendations Unigram : 
{'education', 'TEDEd', 'animation', 'psychology', 'brain', 'climate', 'change'}
A bold idea to replace politicians
Can you solve the fantasy election riddle?
How (and why) Russia hacked the US election
How the new generation of Latinx voters could change US elections



In [18]:
cleaned_df[cleaned_df.talk=="Depression, the secret we share"]

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams
2486,"Depression, the secret we share",Andrew Solomon,Writer,"Andrew Solomon writes about politics, culture ...","""The opposite of depression is not happiness, ...",TEDxMet,11733589,1761,"['TEDx', 'culture', 'depression', 'medicine', ...",2013-10-23T00:00:00.000+00:00,1387382509,"""I felt a Funeral, in my Brain, and Mourners t...","How do antidepressants work?-,Don't suffer fro..."


### Bigrams

In [19]:
tfidf_bigrams=text.TfidfVectorizer(input=Text,stop_words="english",ngram_range=(2,2))
matrix_bigrams=tfidf_bigrams.fit_transform(Text)
matrix_bigrams.shape

(4016, 1881347)

In [20]:
sim_bigram=cosine_similarity(matrix_bigrams)
sim_bigram.shape

(4016, 4016)

In [21]:
cleaned_df['most_similar_transcript_bigrams']=[get_similar_articles(x) for x in sim_bigram]

In [22]:
cleaned_df.head(1)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams,most_similar_transcript_bigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,Can clouds buy us more time to solve climate c...,"The case for optimism-,Can clouds buy us more ..."


### Trigrams

In [23]:
tfidf_trigrams=text.TfidfVectorizer(input=Text,stop_words="english",ngram_range=(3,3))
matrix_trigrams=tfidf_trigrams.fit_transform(Text)
matrix_trigrams.shape

(4016, 2893949)

In [24]:
sim_trigram=cosine_similarity(matrix_trigrams)
sim_trigram.shape

(4016, 4016)

In [25]:
cleaned_df['most_similar_transcript_trigrams']=[get_similar_articles(x) for x in sim_trigram]

In [26]:
example_talk = cleaned_df.iloc[9,0]
example_tags = cleaned_df.iloc[9,8]
example_recommendations_unigram = cleaned_df.iloc[9,12].split(',')
example_recommendations_bigram = cleaned_df.iloc[9,13].split(',')
example_recommendations_trigram = cleaned_df.iloc[9,14].split(',')

In [27]:
print("Example : Recommendation using Transcript\n")
print("Talk name ->",example_talk)
print("Talk tags ->",example_tags)
print()
print("Recommendations Unigram : ")
for each in example_recommendations_unigram[::-1]:
    print(each)
print()
print("Recommendations Bigram : ")
for each in example_recommendations_bigram[::-1]:
    print(each)
print()
print("Recommendations Trigram : ")
for each in example_recommendations_trigram[::-1]:
    print(each)
print()

    

Example : Recommendation using Transcript

Talk name -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
What's needed to bring the US voting system into the 21st century
E-voting without fraud-
The fight for the right to vote in the United States-
Which voting system is the best?-
The unexpected challenges of a country's first election-

Recommendations Bigram : 
What's needed to bring the US voting system into the 21st century
E-voting without fraud-
How to gain control of your free time-
How much sleep do you really need?-
It’s our city. Let’s fix it-

Recommendations Trigram : 
What's needed to bring the US voting system into the 21st century
What makes life worth living in the face of death-
What if every satellite suddenly disappeared?-
The science of friction -- and its surprising impact on our lives-
The quest for the coronavirus vaccine-



In [28]:
cleaned_df.head(5)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams,most_similar_transcript_bigrams,most_similar_transcript_trigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,Can clouds buy us more time to solve climate c...,"The case for optimism-,Can clouds buy us more ...",The material that could change the world... fo...
1,The exploitation of US college athletes,Tim Nevius,College sports lawyer,Tim Nevius is a leading sports lawyer and coll...,Colleges and universities in the US make billi...,TEDxDayton,438573,611,"['sports', 'law', 'education', 'United States'...",2020-11-10T00:00:00.000+00:00,1617634131,"In college sports, American universities are e...","The ""opportunity gap"" in US public education -...","Help for kids the education system ignores-,An...","An ultra-low-cost college degree-,The boost st..."
2,How does ultrasound work?,Jacques Abramowicz,Unknown,Unknown,"In a dark cave, bats can't see much. But even ...",TED-Ed,140874,295,"['education', 'technology', 'animation', 'TED-...",2021-04-01T00:00:00.000+00:00,1617290223,"In a pitch-black cave, bats can’t see much. Bu...","The sonic boom problem-,How LIGO discovered gr...",Why we need to stop obsessing over World War I...,"Averting the climate crisis-,Color-coded surge..."
3,"An honest history of an ancient and ""nasty"" word",Kate Lister,Sex historian,Kate Lister is a sex historian and lecturers a...,"With candor and cunning, sex historian Kate Li...",TEDxUniversityofGlasgow,569477,1148,"['sex', 'language', 'ancient world', 'history'...",2020-03-06T00:00:00.000+00:00,1617289565,"First, a warning. As far as offensive words go...","Go ahead, make up new words!-,Beautiful new wo...","The extraordinary power of ordinary people-,Wh...","The extraordinary power of ordinary people-,Do..."
4,The electrical blueprints that orchestrate life,Michael Levin,Bioelectric explorer,Michael Levin's research could give rise to ad...,DNA isn't the only builder in the biological w...,TED2020,122682,1176,"['biology', 'science', 'invention', 'robots', ...",2020-05-18T00:00:00.000+00:00,1617213773,"Chris Anderson: Mike, welcome. It's good to se...","Printing a human kidney-,The next software rev...","The radical possibilities of man-made DNA-,A c...","My creations, a new form of life-,It's time to..."


# Recommendation using Talk Description

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [29]:
Text=cleaned_df['talk_desc'].tolist()
tfidf=text.TfidfVectorizer(input=Text,stop_words="english")
matrix=tfidf.fit_transform(Text)
print(matrix.shape)

## Cosine Similarity

### Unigrams

from sklearn.metrics.pairwise import cosine_similarity
sim_unigram=cosine_similarity(matrix)
sim_unigram.shape

#https://www.kaggle.com/code/gunnvant/building-content-recommender-tutorial
#finding the most similar 5 ted talks using the correlation matrix of words created using cosine similarity
def get_similar_articles(x):
    return ",".join((cleaned_df['talk']).loc[x.argsort()[-5:-1]])
cleaned_df['most_similar_desc_unigrams']=[get_similar_articles(x) for x in sim_unigram]

cleaned_df.head(5)

### Bigrams

tfidf_bigrams=text.TfidfVectorizer(input=Text,stop_words="english",ngram_range=(2,2))
matrix_bigrams=tfidf_bigrams.fit_transform(Text)
matrix_bigrams.shape

sim_bigram=cosine_similarity(matrix_bigrams)
sim_bigram.shape

cleaned_df['most_similar_desc_bigrams']=[get_similar_articles(x) for x in sim_bigram]

cleaned_df.head(1).most_similar_desc_unigrams

### Trigrams

tfidf_trigrams=text.TfidfVectorizer(input=Text,stop_words="english",ngram_range=(3,3))
matrix_trigrams=tfidf_trigrams.fit_transform(Text)
matrix_trigrams.shape

sim_trigram=cosine_similarity(matrix_trigrams)
sim_trigram.shape

cleaned_df['most_similar_desc_trigrams']=[get_similar_articles(x) for x in sim_trigram]

example_talk = cleaned_df.iloc[9,0]
example_tags = cleaned_df.iloc[9,8]
example_recommendations_unigram = cleaned_df.iloc[9,15].split(',')
example_recommendations_bigram = cleaned_df.iloc[9,16].split(',')
example_recommendations_trigram = cleaned_df.iloc[9,17].split(',')

print("Example : Recommendation using Talk Descritption\n")
print("Talk name ->",example_talk)
print("Talk tags ->",example_tags)
print()
print("Recommendations Unigram : ")
for each in example_recommendations_unigram[::-1]:
    print(each)
print()
print("Recommendations Bigram : ")
for each in example_recommendations_bigram[::-1]:
    print(each)
print()
print("Recommendations Trigram : ")
for each in example_recommendations_trigram[::-1]:
    print(each)
print()

    

(4016, 21448)
Example : Recommendation using Talk Descritption

Talk name -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
The unexpected challenges of a country's first election
Can democracy exist without trust?
What's needed to bring the US voting system into the 21st century
Does your vote count? The Electoral College explained

Recommendations Bigram : 
The fight for the right to vote in the United States
Does racism affect how you vote?
Averting the climate crisis
Why must artists be poor?

Recommendations Trigram : 
Averting the climate crisis
"my mama" / "BLACK BANANA"
How to inspire every child to be a lifelong reader
Why we should end animal agriculture



# Recommendation using Talk title

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [30]:
Text=cleaned_df['talk'].tolist()
tfidf=text.TfidfVectorizer(input=Text,stop_words="english")
matrix=tfidf.fit_transform(Text)
print(matrix.shape)

## Cosine Similarity

### Unigrams

from sklearn.metrics.pairwise import cosine_similarity
sim_unigram=cosine_similarity(matrix)
sim_unigram.shape

#https://www.kaggle.com/code/gunnvant/building-content-recommender-tutorial
#finding the most similar 5 ted talks using the correlation matrix of words created using cosine similarity
def get_similar_articles(x):
    return ",".join((cleaned_df['talk']).loc[x.argsort()[-5:-1]])
cleaned_df['most_similar_title_unigrams']=[get_similar_articles(x) for x in sim_unigram]

cleaned_df.head(5)

### Bigrams

tfidf_bigrams=text.TfidfVectorizer(input=Text,stop_words="english",ngram_range=(2,2))
matrix_bigrams=tfidf_bigrams.fit_transform(Text)
matrix_bigrams.shape

sim_bigram=cosine_similarity(matrix_bigrams)
sim_bigram.shape

cleaned_df['most_similar_title_bigrams']=[get_similar_articles(x) for x in sim_bigram]

cleaned_df.head(1).most_similar_title_unigrams

### Trigrams

tfidf_trigrams=text.TfidfVectorizer(input=Text,stop_words="english",ngram_range=(3,3))
matrix_trigrams=tfidf_trigrams.fit_transform(Text)
matrix_trigrams.shape

sim_trigram=cosine_similarity(matrix_trigrams)
sim_trigram.shape

cleaned_df['most_similar_title_trigrams']=[get_similar_articles(x) for x in sim_trigram]


example_talk = cleaned_df.iloc[9,0]
example_tags = cleaned_df.iloc[9,8]
example_recommendations_unigram = cleaned_df.iloc[9,18].split(',')
example_recommendations_bigram = cleaned_df.iloc[9,19].split(',')
example_recommendations_trigram = cleaned_df.iloc[9,20].split(',')

print("Example : Recommendation using Talk Title\n")
print("Talk name ->",example_talk)
print("Talk tags ->",example_tags)
print()
print("Recommendations Unigram : ")
for each in example_recommendations_unigram[::-1]:
    print(each)
print()
print("Recommendations Bigram : ")
for each in example_recommendations_bigram[::-1]:
    print(each)
print()
print("Recommendations Trigram : ")
for each in example_recommendations_trigram[::-1]:
    print(each)
print()


(4016, 4900)
Example : Recommendation using Talk Title

Talk name -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
A bold idea to replace politicians
Can you solve the fantasy election riddle?
How (and why) Russia hacked the US election
How the new generation of Latinx voters could change US elections

Recommendations Bigram : 
Averting the climate crisis
"my mama" / "BLACK BANANA"
How to inspire every child to be a lifelong reader
Why we should end animal agriculture

Recommendations Trigram : 
Averting the climate crisis
"my mama" / "BLACK BANANA"
How to inspire every child to be a lifelong reader
Why we should end animal agriculture

