In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Load the pre-cleaned talk table; the path is relative to the notebook's working directory.
DATA_PATH = 'cleaned data/vaishak_data.csv'
cleaned_df = pd.read_csv(DATA_PATH)

In [31]:
# Sanity check: display (rows, columns) of the loaded table.
cleaned_df.shape

(4016, 12)

# Recommendation using transcript

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [32]:
# Build the unigram Tf-Idf document-term matrix from the talk transcripts, dropping
# English stop words. One row per talk, one column per vocabulary term.
# The documents are handed to fit_transform(); the vectorizer's `input` option is left
# at its default ("content") because it only accepts "filename", "file" or "content" --
# passing the document list there is rejected by recent scikit-learn releases.
Text=cleaned_df['transcript'].tolist()
tfidf=text.TfidfVectorizer(stop_words="english")
matrix=tfidf.fit_transform(Text)
print(matrix.shape)

(4016, 68448)


## Cosine Similarity

### Unigrams

In [33]:
# Pairwise cosine similarity between the unigram Tf-Idf rows of `matrix`
# (entry [i, j] compares talk i with talk j); the trailing expression shows its shape.
# cosine_similarity is imported with the other dependencies in the first cell.
sim_unigram=cosine_similarity(matrix)
sim_unigram.shape

(4016, 4016)

In [34]:
#https://www.kaggle.com/code/gunnvant/building-content-recommender-tutorial
# For every talk, find the 4 other talks that are most similar to it according to one
# row of a document-by-document cosine-similarity matrix.
def get_similar_articles(x, top_n=4, self_idx=None):
    """Return the titles of the talks most similar to one talk, comma-joined.

    Parameters
    ----------
    x : 1-D numpy array
        One row of a cosine-similarity matrix: the similarity of a single talk to
        every talk, in the same row order as ``cleaned_df``.
    top_n : int, default 4
        Number of titles to return.
    self_idx : int or None, default None
        Row position of the talk that ``x`` belongs to. When given, that position is
        removed explicitly before ranking, so the talk can never recommend itself even
        if another talk ties with it (duplicates) or the whole row is zero.
        When None, the single best-ranked entry is dropped instead, on the assumption
        that it is the talk itself.

    Returns
    -------
    str
        Titles taken from ``cleaned_df['talk']``, joined with ",", ordered from the
        least to the most similar of the selected talks. Titles that themselves
        contain a comma are not escaped.
    """
    order = x.argsort()  # ascending similarity; values are row *positions*
    if self_idx is None:
        picks = order[-(top_n + 1):-1]
    else:
        kept = order[order != self_idx]
        picks = kept[max(len(kept) - top_n, 0):]
    # argsort yields positions, so index positionally (.iloc) rather than by label (.loc).
    return ",".join(cleaned_df['talk'].iloc[picks])
cleaned_df['most_similar_transcript_unigrams']=[get_similar_articles(x, self_idx=i) for i, x in enumerate(sim_unigram)]

In [35]:
# Preview the first five rows, now including the unigram recommendation column.
cleaned_df.head(5)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,"100 solutions to reverse global warming,How we..."
1,The exploitation of US college athletes,Tim Nevius,College sports lawyer,Tim Nevius is a leading sports lawyer and coll...,Colleges and universities in the US make billi...,TEDxDayton,438573,611,"['sports', 'law', 'education', 'United States'...",2020-11-10T00:00:00.000+00:00,1617634131,"In college sports, American universities are e...","Protecting the brain against concussion,A summ..."
2,How does ultrasound work?,Jacques Abramowicz,Unknown,Unknown,"In a dark cave, bats can't see much. But even ...",TED-Ed,140874,295,"['education', 'technology', 'animation', 'TED-...",2021-04-01T00:00:00.000+00:00,1617290223,"In a pitch-black cave, bats can’t see much. Bu...",How LIGO discovered gravitational waves -- and...
3,"An honest history of an ancient and ""nasty"" word",Kate Lister,Sex historian,Kate Lister is a sex historian and lecturers a...,"With candor and cunning, sex historian Kate Li...",TEDxUniversityofGlasgow,569477,1148,"['sex', 'language', 'ancient world', 'history'...",2020-03-06T00:00:00.000+00:00,1617289565,"First, a warning. As far as offensive words go...",Beautiful new words to describe obscure emotio...
4,The electrical blueprints that orchestrate life,Michael Levin,Bioelectric explorer,Michael Levin's research could give rise to ad...,DNA isn't the only builder in the biological w...,TED2020,122682,1176,"['biology', 'science', 'invention', 'robots', ...",2020-05-18T00:00:00.000+00:00,1617213773,"Chris Anderson: Mike, welcome. It's good to se...","The next software revolution,Growing new organ..."


### Bigrams

In [36]:
# Tf-Idf matrix over word bigrams only (ngram_range=(2,2)) of the same documents.
# `input` is left at its default ("content"): it only accepts "filename", "file" or
# "content", and recent scikit-learn releases reject a document list there.
tfidf_bigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(2,2))
matrix_bigrams=tfidf_bigrams.fit_transform(Text)
matrix_bigrams.shape

(4016, 1881347)

In [37]:
# Pairwise cosine similarity between the rows of the bigram Tf-Idf matrix;
# the trailing expression displays the shape of the result.
sim_bigram=cosine_similarity(matrix_bigrams)
sim_bigram.shape

(4016, 4016)

In [38]:
# One comma-joined recommendation string per talk, derived from the bigram similarity rows.
cleaned_df['most_similar_transcript_bigrams'] = list(map(get_similar_articles, sim_bigram))

In [39]:
# Show the first row to compare the unigram and bigram recommendation columns.
cleaned_df.head(1)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams,most_similar_transcript_bigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,"100 solutions to reverse global warming,How we...",Can clouds buy us more time to solve climate c...


### Trigrams

In [40]:
# Tf-Idf matrix over word trigrams only (ngram_range=(3,3)) of the same documents.
# `input` is left at its default ("content"): it only accepts "filename", "file" or
# "content", and recent scikit-learn releases reject a document list there.
tfidf_trigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(3,3))
matrix_trigrams=tfidf_trigrams.fit_transform(Text)
matrix_trigrams.shape

(4016, 2893949)

In [41]:
# Pairwise cosine similarity between the rows of the trigram Tf-Idf matrix;
# the trailing expression displays the shape of the result.
sim_trigram=cosine_similarity(matrix_trigrams)
sim_trigram.shape

(4016, 4016)

In [42]:
# One comma-joined recommendation string per talk, derived from the trigram similarity rows.
cleaned_df['most_similar_transcript_trigrams'] = list(map(get_similar_articles, sim_trigram))

In [43]:
# Pull one illustrative talk (row position 9) and its three recommendation lists.
# Columns are addressed by name rather than by position so the lookup keeps working
# if the column order of the CSV changes or extra columns are added earlier.
# NOTE: the recommendation strings are comma-joined, so a title that itself contains
# a comma (e.g. "Doodlers, unite!") is split into several fragments here.
EXAMPLE_ROW = 9
example_talk = cleaned_df['talk'].iloc[EXAMPLE_ROW]
example_tags = cleaned_df['tags'].iloc[EXAMPLE_ROW]
example_recommendations_unigram = cleaned_df['most_similar_transcript_unigrams'].iloc[EXAMPLE_ROW].split(',')
example_recommendations_bigram = cleaned_df['most_similar_transcript_bigrams'].iloc[EXAMPLE_ROW].split(',')
example_recommendations_trigram = cleaned_df['most_similar_transcript_trigrams'].iloc[EXAMPLE_ROW].split(',')

In [44]:
# Print the example talk followed by its recommendations for each n-gram model.
# The stored lists run from least to most similar, so each is shown reversed
# (best match first).
print("Example : Recommendation using Transcript\n")
print("Talk name ->",example_talk)
print("Talk tags ->",example_tags)
print()
for heading, titles in (
    ("Recommendations Unigram : ", example_recommendations_unigram),
    ("Recommendations Bigram : ", example_recommendations_bigram),
    ("Recommendations Trigram : ", example_recommendations_trigram),
):
    print(heading)
    for title in reversed(titles):
        print(title)
    print()

    

Example : Recommendation using Transcript

Talk name -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
What's needed to bring the US voting system into the 21st century
E-voting without fraud
The fight for the right to vote in the United States
Which voting system is the best?

Recommendations Bigram : 
What's needed to bring the US voting system into the 21st century
E-voting without fraud
How to gain control of your free time
How much sleep do you really need?

Recommendations Trigram : 
What's needed to bring the US voting system into the 21st century
What makes life worth living in the face of death
What if every satellite suddenly disappeared?
The science of friction -- and its surprising impact on our lives



In [45]:
# Preview the first five rows with all three transcript-based recommendation columns.
cleaned_df.head(5)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams,most_similar_transcript_bigrams,most_similar_transcript_trigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,"100 solutions to reverse global warming,How we...",Can clouds buy us more time to solve climate c...,"Art made of the air we breathe,Swim with the g..."
1,The exploitation of US college athletes,Tim Nevius,College sports lawyer,Tim Nevius is a leading sports lawyer and coll...,Colleges and universities in the US make billi...,TEDxDayton,438573,611,"['sports', 'law', 'education', 'United States'...",2020-11-10T00:00:00.000+00:00,1617634131,"In college sports, American universities are e...","Protecting the brain against concussion,A summ...","An ultra-low-cost college degree,Our dangerous...","The boost students need to overcome obstacles,..."
2,How does ultrasound work?,Jacques Abramowicz,Unknown,Unknown,"In a dark cave, bats can't see much. But even ...",TED-Ed,140874,295,"['education', 'technology', 'animation', 'TED-...",2021-04-01T00:00:00.000+00:00,1617290223,"In a pitch-black cave, bats can’t see much. Bu...",How LIGO discovered gravitational waves -- and...,"A census of the ocean,The beautiful, mysteriou...","Color-coded surgery,A new weapon in the fight ..."
3,"An honest history of an ancient and ""nasty"" word",Kate Lister,Sex historian,Kate Lister is a sex historian and lecturers a...,"With candor and cunning, sex historian Kate Li...",TEDxUniversityofGlasgow,569477,1148,"['sex', 'language', 'ancient world', 'history'...",2020-03-06T00:00:00.000+00:00,1617289565,"First, a warning. As far as offensive words go...",Beautiful new words to describe obscure emotio...,"What makes a word ""real""?,Where did English co...","Doodlers, unite!,Where did English come from?,..."
4,The electrical blueprints that orchestrate life,Michael Levin,Bioelectric explorer,Michael Levin's research could give rise to ad...,DNA isn't the only builder in the biological w...,TED2020,122682,1176,"['biology', 'science', 'invention', 'robots', ...",2020-05-18T00:00:00.000+00:00,1617213773,"Chris Anderson: Mike, welcome. It's good to se...","The next software revolution,Growing new organ...","A computer that works like the brain,How do ca...","It's time to question bio-engineering,The unhe..."


# Recommendation using Talk Description

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [46]:
# Recommendations based on the talk description.
# Tf-Idf vectors are built from `talk_desc` for unigrams, bigrams and trigrams, the
# talks are compared pairwise with cosine similarity, and get_similar_articles()
# (defined once, in the transcript section) turns each similarity row into a
# comma-joined string of talk titles stored in a new column.
Text = cleaned_df['talk_desc'].tolist()

# The documents are handed to fit_transform(); `input` stays at its default
# ("content") because it only accepts "filename", "file" or "content".
tfidf = text.TfidfVectorizer(stop_words="english")
matrix = tfidf.fit_transform(Text)
print(matrix.shape)

## Cosine Similarity

### Unigrams

sim_unigram = cosine_similarity(matrix)
cleaned_df['most_similar_desc_unigrams'] = [get_similar_articles(x) for x in sim_unigram]

### Bigrams

tfidf_bigrams = text.TfidfVectorizer(stop_words="english", ngram_range=(2, 2))
matrix_bigrams = tfidf_bigrams.fit_transform(Text)
sim_bigram = cosine_similarity(matrix_bigrams)
cleaned_df['most_similar_desc_bigrams'] = [get_similar_articles(x) for x in sim_bigram]

### Trigrams

tfidf_trigrams = text.TfidfVectorizer(stop_words="english", ngram_range=(3, 3))
matrix_trigrams = tfidf_trigrams.fit_transform(Text)
sim_trigram = cosine_similarity(matrix_trigrams)
cleaned_df['most_similar_desc_trigrams'] = [get_similar_articles(x) for x in sim_trigram]

# Example talk (row position 9). Columns are looked up by name so the cell does not
# depend on how many columns were added before it.
# NOTE: the stored strings are comma-joined, so a title containing a comma is split
# into several fragments by split(',').
EXAMPLE_ROW = 9
example_talk = cleaned_df['talk'].iloc[EXAMPLE_ROW]
example_tags = cleaned_df['tags'].iloc[EXAMPLE_ROW]
example_recommendations_unigram = cleaned_df['most_similar_desc_unigrams'].iloc[EXAMPLE_ROW].split(',')
example_recommendations_bigram = cleaned_df['most_similar_desc_bigrams'].iloc[EXAMPLE_ROW].split(',')
example_recommendations_trigram = cleaned_df['most_similar_desc_trigrams'].iloc[EXAMPLE_ROW].split(',')

print("Example : Recommendation using Talk Description\n")
print("Talk name ->", example_talk)
print("Talk tags ->", example_tags)
print()
# Lists are stored least-to-most similar; print them best match first.
for heading, titles in (
    ("Recommendations Unigram : ", example_recommendations_unigram),
    ("Recommendations Bigram : ", example_recommendations_bigram),
    ("Recommendations Trigram : ", example_recommendations_trigram),
):
    print(heading)
    for each in titles[::-1]:
        print(each)
    print()

    

(4016, 21448)
Example : Recommendation using Talk Descritption

Talk name -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
The unexpected challenges of a country's first election
Can democracy exist without trust?
What's needed to bring the US voting system into the 21st century
Does your vote count? The Electoral College explained

Recommendations Bigram : 
The fight for the right to vote in the United States
Does racism affect how you vote?
Averting the climate crisis
Why must artists be poor?

Recommendations Trigram : 
Averting the climate crisis
"my mama" / "BLACK BANANA"
How to inspire every child to be a lifelong reader
Why we should end animal agriculture



# Recommendation using Talk title

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [47]:
# Recommendations based on the talk title.
# Tf-Idf vectors are built from `talk` (the title) for unigrams, bigrams and trigrams,
# the talks are compared pairwise with cosine similarity, and get_similar_articles()
# (defined once, in the transcript section) turns each similarity row into a
# comma-joined string of talk titles stored in a new column.
Text = cleaned_df['talk'].tolist()

# The documents are handed to fit_transform(); `input` stays at its default
# ("content") because it only accepts "filename", "file" or "content".
tfidf = text.TfidfVectorizer(stop_words="english")
matrix = tfidf.fit_transform(Text)
print(matrix.shape)

## Cosine Similarity

### Unigrams

sim_unigram = cosine_similarity(matrix)
cleaned_df['most_similar_title_unigrams'] = [get_similar_articles(x) for x in sim_unigram]

### Bigrams

# NOTE(review): for the example row the recorded bigram and trigram recommendations are
# identical and unrelated to the talk. Presumably a short title leaves no n-gram
# features after stop-word removal, so its similarity row is all zeros and the
# ranking is arbitrary -- confirm, and consider skipping zero-similarity matches.
tfidf_bigrams = text.TfidfVectorizer(stop_words="english", ngram_range=(2, 2))
matrix_bigrams = tfidf_bigrams.fit_transform(Text)
sim_bigram = cosine_similarity(matrix_bigrams)
cleaned_df['most_similar_title_bigrams'] = [get_similar_articles(x) for x in sim_bigram]

### Trigrams

tfidf_trigrams = text.TfidfVectorizer(stop_words="english", ngram_range=(3, 3))
matrix_trigrams = tfidf_trigrams.fit_transform(Text)
sim_trigram = cosine_similarity(matrix_trigrams)
cleaned_df['most_similar_title_trigrams'] = [get_similar_articles(x) for x in sim_trigram]

# Example talk (row position 9). Columns are looked up by name so the cell does not
# depend on how many columns were added before it.
# NOTE: the stored strings are comma-joined, so a title containing a comma is split
# into several fragments by split(',').
EXAMPLE_ROW = 9
example_talk = cleaned_df['talk'].iloc[EXAMPLE_ROW]
example_tags = cleaned_df['tags'].iloc[EXAMPLE_ROW]
example_recommendations_unigram = cleaned_df['most_similar_title_unigrams'].iloc[EXAMPLE_ROW].split(',')
example_recommendations_bigram = cleaned_df['most_similar_title_bigrams'].iloc[EXAMPLE_ROW].split(',')
example_recommendations_trigram = cleaned_df['most_similar_title_trigrams'].iloc[EXAMPLE_ROW].split(',')

print("Example : Recommendation using Talk Title\n")
print("Talk name ->", example_talk)
print("Talk tags ->", example_tags)
print()
# Lists are stored least-to-most similar; print them best match first.
for heading, titles in (
    ("Recommendations Unigram : ", example_recommendations_unigram),
    ("Recommendations Bigram : ", example_recommendations_bigram),
    ("Recommendations Trigram : ", example_recommendations_trigram),
):
    print(heading)
    for each in titles[::-1]:
        print(each)
    print()


(4016, 4900)
Example : Recommendation using Talk Title

Talk name -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
A bold idea to replace politicians
Can you solve the fantasy election riddle?
How (and why) Russia hacked the US election
How the new generation of Latinx voters could change US elections

Recommendations Bigram : 
Averting the climate crisis
"my mama" / "BLACK BANANA"
How to inspire every child to be a lifelong reader
Why we should end animal agriculture

Recommendations Trigram : 
Averting the climate crisis
"my mama" / "BLACK BANANA"
How to inspire every child to be a lifelong reader
Why we should end animal agriculture

