In [60]:
pip install prettytable

Collecting prettytable
  Downloading prettytable-3.3.0-py3-none-any.whl (26 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.3.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text
from prettytable import PrettyTable

In [2]:
# Load the talks dataset from a path relative to the notebook's working directory.
# NOTE(review): the folder name suggests this CSV was cleaned upstream — confirm.
cleaned_df = pd.read_csv('cleaned data/vaishak_data.csv')

In [3]:
# Row and column count of the loaded frame.
cleaned_df.shape

(4016, 12)

In [4]:
# Column dtypes and non-null counts of the loaded frame.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4016 entries, 0 to 4015
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   talk          4016 non-null   object
 1   speaker       4016 non-null   object
 2   speaker_occ   4016 non-null   object
 3   speaker_bio   4016 non-null   object
 4   talk_desc     4016 non-null   object
 5   event         4016 non-null   object
 6   views         4016 non-null   int64 
 7   duration      4016 non-null   int64 
 8   tags          4016 non-null   object
 9   recorded_at   4016 non-null   object
 10  published on  4016 non-null   int64 
 11  transcript    4016 non-null   object
dtypes: int64(3), object(9)
memory usage: 376.6+ KB


# Recommendation using transcript

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [232]:
# Build the unigram TF-IDF matrix over the talk transcripts (one row per talk).
Text=cleaned_df['transcript'].tolist()
# The documents are passed to fit_transform. The constructor's `input` option
# only selects how documents are read ('filename', 'file' or 'content'), so it
# is left at its default instead of being handed the document list.
tfidf=text.TfidfVectorizer(stop_words="english")
matrix=tfidf.fit_transform(Text)
print(matrix.shape)

(4016, 68448)


## Cosine Similarity

### Unigrams

In [233]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of `matrix`. With a single
# argument every row is compared with every other row, so the result is square.
sim_unigram=cosine_similarity(matrix)
sim_unigram.shape

(4016, 4016)

In [234]:
#https://www.kaggle.com/code/gunnvant/building-content-recommender-tutorial
# For every talk, record the titles of its 5 most similar talks according to
# the talk-by-talk cosine-similarity matrix.
def get_similar_articles(x, self_idx=None, top_n=5):
    """Return the titles of the talks most similar to one talk, joined by "-,".

    Parameters
    ----------
    x : 1-D numpy array
        Similarity scores of one talk against every row of ``cleaned_df``.
    self_idx : int, optional
        Position of the talk that ``x`` belongs to. When given, that position
        is removed explicitly, so the talk is never recommended to itself even
        if another talk ties with it or the whole row is zero. When omitted,
        the single highest-ranked entry is dropped instead (original behavior).
    top_n : int, default 5
        Number of titles to return.

    Returns
    -------
    str
        The selected titles ordered from least to most similar, separated by "-,".
    """
    order = x.argsort()
    if self_idx is None:
        picks = order[-(top_n + 1):-1]
    else:
        candidates = order[order != self_idx]
        picks = candidates[max(len(candidates) - top_n, 0):]
    # argsort yields positions, so index positionally rather than by label.
    return "-,".join(cleaned_df['talk'].iloc[picks])
cleaned_df['most_similar_transcript_unigrams']=[get_similar_articles(x, i) for i, x in enumerate(sim_unigram)]

In [235]:
# Preview the first rows together with the new recommendation column.
cleaned_df.head(5)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,Can clouds buy us more time to solve climate c...
1,The exploitation of US college athletes,Tim Nevius,College sports lawyer,Tim Nevius is a leading sports lawyer and coll...,Colleges and universities in the US make billi...,TEDxDayton,438573,611,"['sports', 'law', 'education', 'United States'...",2020-11-10T00:00:00.000+00:00,1617634131,"In college sports, American universities are e...","The ""opportunity gap"" in US public education -..."
2,How does ultrasound work?,Jacques Abramowicz,Unknown,Unknown,"In a dark cave, bats can't see much. But even ...",TED-Ed,140874,295,"['education', 'technology', 'animation', 'TED-...",2021-04-01T00:00:00.000+00:00,1617290223,"In a pitch-black cave, bats can’t see much. Bu...","The sonic boom problem-,How LIGO discovered gr..."
3,"An honest history of an ancient and ""nasty"" word",Kate Lister,Sex historian,Kate Lister is a sex historian and lecturers a...,"With candor and cunning, sex historian Kate Li...",TEDxUniversityofGlasgow,569477,1148,"['sex', 'language', 'ancient world', 'history'...",2020-03-06T00:00:00.000+00:00,1617289565,"First, a warning. As far as offensive words go...","Go ahead, make up new words!-,Beautiful new wo..."
4,The electrical blueprints that orchestrate life,Michael Levin,Bioelectric explorer,Michael Levin's research could give rise to ad...,DNA isn't the only builder in the biological w...,TED2020,122682,1176,"['biology', 'science', 'invention', 'robots', ...",2020-05-18T00:00:00.000+00:00,1617213773,"Chris Anderson: Mike, welcome. It's good to se...","Printing a human kidney-,The next software rev..."


In [236]:
# Pick the first talk as a worked example. Columns are addressed by name so the
# lookup does not depend on the order in which columns were added to the frame.
example_talk = cleaned_df['talk'].iloc[0]
example_tags = cleaned_df['tags'].iloc[0]
# The recommendations were stored as one string joined with "-,"; split them back.
example_recommendations_unigram = cleaned_df['most_similar_transcript_unigrams'].iloc[0].split('-,')
print(example_recommendations_unigram)

['Can clouds buy us more time to solve climate change?', '100 solutions to reverse global warming', "How we could change the planet's climate future", 'Emergency medicine for our climate fever', 'Why I must speak out about climate change']


In [237]:
# The tags cell holds the text form of a list. Split it on commas and keep only
# the alphabetic characters of every piece to obtain comparable tag names.
example_tags = cleaned_df.iloc[0,8]
result = [''.join(filter(str.isalpha, piece)) for piece in example_tags.split(',')]
result

['education', 'psychology', 'animation', 'TEDEd', 'brain', 'climatechange']

In [238]:
from prettytable import PrettyTable

In [239]:
# Compare the tags of the example talk (A) with the tags of every recommended
# talk (B) and report the shared share of tags relative to each side.
print("Example : Recommendation using Transcript\n")
print("Talk name(A) ->",example_talk)
print("Talk tags ->",example_tags)
print()
# `example_tags` is the text form of a list; keeping only letters of each
# comma-separated piece turns e.g. 'TED-Ed' into TEDEd so tags compare as sets.
listA = [''.join(c for c in word if c.isalpha()) for word in example_tags.split(',')]
setA = set(listA)
print("Recommendations Unigram : ")
myTable = PrettyTable(["Talk Id.","Recommended Talks(B)", "Overlap/setA", "Overlap/setB"])
# The stored list runs from least to most similar; reverse it to show the best match first.
for each in example_recommendations_unigram[::-1]:
    # Select the matching row(s) once and reuse them for both tags and index.
    matches = cleaned_df[cleaned_df.talk==each]
    tagsB = matches['tags'].values
    listB = [''.join(c for c in word if c.isalpha()) for word in tagsB[0].split(',')]
    setB = set(listB)
    overlap = setA & setB
    result1 = round(float(len(overlap)) / len(setA) * 100,1)
    result2 = round(float(len(overlap)) / len(setB) * 100,1)
    talkid= matches.index.values
    myTable.add_row([talkid,each, result1, result2])
print(myTable)

Example : Recommendation using Transcript

Talk name(A) -> Can you outsmart the apples and oranges fallacy?
Talk tags -> ['education', 'psychology', 'animation', 'TED-Ed', 'brain', 'climate change']

Recommendations Unigram : 
+----------+------------------------------------------------------+--------------+--------------+
| Talk Id. |                 Recommended Talks(B)                 | Overlap/setA | Overlap/setB |
+----------+------------------------------------------------------+--------------+--------------+
|  [3009]  |      Why I must speak out about climate change       |     16.7     |     20.0     |
|  [679]   |       Emergency medicine for our climate fever       |     16.7     |     11.1     |
|  [414]   |   How we could change the planet's climate future    |     16.7     |     12.5     |
|  [1001]  |       100 solutions to reverse global warming        |     33.3     |     13.3     |
|  [1559]  | Can clouds buy us more time to solve climate change? |     16.7     |     

In [240]:
# Look up one talk by its exact title to inspect the recommendations stored for it.
cleaned_df[cleaned_df.talk=="Depression, the secret we share"]

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams
2486,"Depression, the secret we share",Andrew Solomon,Writer,"Andrew Solomon writes about politics, culture ...","""The opposite of depression is not happiness, ...",TEDxMet,11733589,1761,"['TEDx', 'culture', 'depression', 'medicine', ...",2013-10-23T00:00:00.000+00:00,1387382509,"""I felt a Funeral, in my Brain, and Mourners t...","How do antidepressants work?-,Don't suffer fro..."


### Bigrams

In [241]:
# TF-IDF over word pairs only (ngram_range=(2,2)) of the same documents in `Text`.
# `input` accepts only 'filename', 'file' or 'content', so the document list is
# no longer passed to it; the documents are given to fit_transform instead.
tfidf_bigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(2,2))
matrix_bigrams=tfidf_bigrams.fit_transform(Text)
matrix_bigrams.shape

(4016, 1881347)

In [242]:
# Pairwise cosine similarity between the bigram TF-IDF rows.
sim_bigram=cosine_similarity(matrix_bigrams)
sim_bigram.shape

(4016, 4016)

In [243]:
# Store, for every talk, the "-,"-joined titles picked from its bigram similarity row.
# NOTE(review): called this way, get_similar_articles drops only the top-ranked
# entry on the assumption that it is the talk itself; for a row with tied or
# all-zero similarities that assumption may not hold — verify.
cleaned_df['most_similar_transcript_bigrams']=[get_similar_articles(x) for x in sim_bigram]

In [244]:
# First row, now including the bigram recommendation column.
cleaned_df.head(1)

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams,most_similar_transcript_bigrams
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown,It's 1997. The United States Senate has called...,TED-Ed,119359,327,"['education', 'psychology', 'animation', 'TED-...",2021-04-05T00:00:00.000+00:00,1617636753,Baking apple pie? Discount orange warehouse ha...,Can clouds buy us more time to solve climate c...,"The case for optimism-,Can clouds buy us more ..."


### Trigrams

In [245]:
# TF-IDF over word triples only (ngram_range=(3,3)) of the same documents in `Text`.
# `input` accepts only 'filename', 'file' or 'content', so the document list is
# no longer passed to it; the documents are given to fit_transform instead.
tfidf_trigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(3,3))
matrix_trigrams=tfidf_trigrams.fit_transform(Text)
matrix_trigrams.shape

(4016, 2893949)

In [246]:
# Pairwise cosine similarity between the trigram TF-IDF rows.
sim_trigram=cosine_similarity(matrix_trigrams)
sim_trigram.shape

(4016, 4016)

In [247]:
# Store, for every talk, the "-,"-joined titles picked from its trigram similarity row.
# NOTE(review): called this way, get_similar_articles drops only the top-ranked
# entry on the assumption that it is the talk itself; for a row with tied or
# all-zero similarities that assumption may not hold — verify.
cleaned_df['most_similar_transcript_trigrams']=[get_similar_articles(x) for x in sim_trigram]

In [248]:
# Use the talk at position 9 as the worked example. Columns are addressed by
# name so the lookup does not depend on the order in which they were added.
example_talk = cleaned_df['talk'].iloc[9]
example_tags = cleaned_df['tags'].iloc[9]
# Each recommendation column holds titles joined with "-,"; split them back into lists.
example_recommendations_unigram = cleaned_df['most_similar_transcript_unigrams'].iloc[9].split('-,')
example_recommendations_bigram = cleaned_df['most_similar_transcript_bigrams'].iloc[9].split('-,')
example_recommendations_trigram = cleaned_df['most_similar_transcript_trigrams'].iloc[9].split('-,')

In [249]:
# Show every column of the talk at position 9, the row used for the example values.
display(cleaned_df.iloc[9,:])

talk                                An election system that puts voters (not polit...
speaker                                                              Amber McReynolds
speaker_occ                              Voting rights expert, democracy entrepreneur
speaker_bio                         Amber McReynolds's vision is to create electio...
talk_desc                           From hours-long lines and limited polling loca...
event                                                                    TEDxMileHigh
views                                                                          648341
duration                                                                          619
tags                                ['United States', 'democracy', 'politics', 'so...
recorded_at                                             2020-08-29T00:00:00.000+00:00
published on                                                               1616685527
transcript                          Voting can be hard

In [250]:
# Bigram-based recommendations for the example talk, in stored order
# (get_similar_articles lists them from least to most similar).
example_recommendations_bigram

['It’s our city. Let’s fix it',
 'How much sleep do you really need?',
 'How to gain control of your free time',
 'E-voting without fraud',
 "What's needed to bring the US voting system into the 21st century"]

In [251]:
# Compare the tags of the example talk (A) with the tags of each recommended
# talk (B), once per n-gram variant of the transcript recommender.
print("Example : Recommendation using Transcript\n")
print("Talk name(A) ->",example_talk)
print("Talk tags ->",example_tags)
print()
# `example_tags` is the text form of a list; keeping only the letters of each
# comma-separated piece yields tag names that can be compared as sets.
listA = [''.join(c for c in word if c.isalpha()) for word in example_tags.split(',')]
setA = set(listA)
# One pass per variant replaces three copies of the same table-building loop.
for label, recommendations in (
    ("Unigram", example_recommendations_unigram),
    ("Bigram", example_recommendations_bigram),
    ("Trigram", example_recommendations_trigram),
):
    print("Recommendations " + label + " : ")
    myTable = PrettyTable(["Recommended Talks(B)", "(tagsA & tagsB)/tagsA", "(tagsA & tagsB)/tagsB"])
    # Stored lists run from least to most similar; reverse to show the best match first.
    for each in recommendations[::-1]:
        tagsB = cleaned_df['tags'][cleaned_df.talk==each].values
        listB = [''.join(c for c in word if c.isalpha()) for word in tagsB[0].split(',')]
        setB = set(listB)
        overlap = setA & setB
        # Percentage of shared tags relative to A's tags and to B's tags.
        result1 = round(float(len(overlap)) / len(setA) * 100,1)
        result2 = round(float(len(overlap)) / len(setB) * 100,1)
        myTable.add_row([each, result1, result2])
    print(myTable)
    print()



    

Example : Recommendation using Transcript

Talk name(A) -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
+-------------------------------------------------------------------+-----------------------+-----------------------+
|                        Recommended Talks(B)                       | (tagsA & tagsB)/tagsA | (tagsA & tagsB)/tagsB |
+-------------------------------------------------------------------+-----------------------+-----------------------+
| What's needed to bring the US voting system into the 21st century |          71.4         |          62.5         |
|                       E-voting without fraud                      |          42.9         |          42.9         |
|        The fight for the right to vote in the United States       |          42.9         |          37.5         |
|                  Which voting system is the 

In [252]:
# NOTE(review): the compared string looks like several talk titles joined by
# commas; an exact-equality filter on it selects no rows (the recorded output
# is an empty frame). Presumably a leftover debugging lookup — confirm whether
# this cell is still needed.
cleaned_df[cleaned_df.talk=="The genius of the London Tube Map,3 myths about the future of work (and why they're not true),Why we should end animal agriculture,What we can do about the culture of hate"]

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio,talk_desc,event,views,duration,tags,recorded_at,published on,transcript,most_similar_transcript_unigrams,most_similar_transcript_bigrams,most_similar_transcript_trigrams


# Recommendation using Talk Description

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [255]:
# Recommendations from the talk description: TF-IDF + cosine similarity for
# unigrams, bigrams and trigrams, followed by a tag-overlap check on one talk.
from sklearn.metrics.pairwise import cosine_similarity

Text=cleaned_df['talk_desc'].tolist()
# The documents are passed to fit_transform. The constructor's `input` option
# only selects how documents are read ('filename', 'file' or 'content'), so it
# is left at its default instead of being handed the document list.
tfidf=text.TfidfVectorizer(stop_words="english")
matrix=tfidf.fit_transform(Text)
print(matrix.shape)

## Cosine Similarity

#https://www.kaggle.com/code/gunnvant/building-content-recommender-tutorial
def get_similar_articles(x, self_idx=None, top_n=5):
    """Return the titles of the talks most similar to one talk, joined by "-,".

    Parameters
    ----------
    x : 1-D numpy array
        Similarity scores of one talk against every row of ``cleaned_df``.
    self_idx : int, optional
        Position of the talk that ``x`` belongs to. When given, that position
        is removed explicitly, so the talk is never recommended to itself even
        if another talk ties with it or the whole row is zero. When omitted,
        the single highest-ranked entry is dropped instead (original behavior).
    top_n : int, default 5
        Number of titles to return.

    Returns
    -------
    str
        The selected titles ordered from least to most similar, separated by "-,".
    """
    order = x.argsort()
    if self_idx is None:
        picks = order[-(top_n + 1):-1]
    else:
        candidates = order[order != self_idx]
        picks = candidates[max(len(candidates) - top_n, 0):]
    # argsort yields positions, so index positionally rather than by label.
    return "-,".join(cleaned_df['talk'].iloc[picks])

### Unigrams

sim_unigram=cosine_similarity(matrix)
cleaned_df['most_similar_desc_unigrams']=[get_similar_articles(x, i) for i, x in enumerate(sim_unigram)]

### Bigrams

tfidf_bigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(2,2))
matrix_bigrams=tfidf_bigrams.fit_transform(Text)
sim_bigram=cosine_similarity(matrix_bigrams)
cleaned_df['most_similar_desc_bigrams']=[get_similar_articles(x, i) for i, x in enumerate(sim_bigram)]

### Trigrams

tfidf_trigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(3,3))
matrix_trigrams=tfidf_trigrams.fit_transform(Text)
sim_trigram=cosine_similarity(matrix_trigrams)
cleaned_df['most_similar_desc_trigrams']=[get_similar_articles(x, i) for i, x in enumerate(sim_trigram)]

# Worked example on the talk at position 9. Columns are addressed by name so
# the lookup does not depend on how many columns were added before this cell.
example_talk = cleaned_df['talk'].iloc[9]
example_tags = cleaned_df['tags'].iloc[9]
example_recommendations_unigram = cleaned_df['most_similar_desc_unigrams'].iloc[9].split('-,')
example_recommendations_bigram = cleaned_df['most_similar_desc_bigrams'].iloc[9].split('-,')
example_recommendations_trigram = cleaned_df['most_similar_desc_trigrams'].iloc[9].split('-,')

print("Example : Recommendation using Talk Descritption\n")
print("Talk name(A) ->",example_talk)
print("Talk tags ->",example_tags)
print()
# `example_tags` is the text form of a list; keeping only the letters of each
# comma-separated piece yields tag names that can be compared as sets.
listA = [''.join(c for c in word if c.isalpha()) for word in example_tags.split(',')]
setA = set(listA)
# One pass per variant replaces three copies of the same table-building loop.
for label, recommendations in (
    ("Unigram", example_recommendations_unigram),
    ("Bigram", example_recommendations_bigram),
    ("Trigram", example_recommendations_trigram),
):
    print("Recommendations " + label + " : ")
    myTable = PrettyTable(["Recommended Talks(B)", "(tagsA & tagsB)/tagsA", "(tagsA & tagsB)/tagsB"])
    # Stored lists run from least to most similar; reverse to show the best match first.
    for each in recommendations[::-1]:
        tagsB = cleaned_df['tags'][cleaned_df.talk==each].values
        listB = [''.join(c for c in word if c.isalpha()) for word in tagsB[0].split(',')]
        setB = set(listB)
        overlap = setA & setB
        # Percentage of shared tags relative to A's tags and to B's tags.
        result1 = round(float(len(overlap)) / len(setA) * 100,1)
        result2 = round(float(len(overlap)) / len(setB) * 100,1)
        myTable.add_row([each, result1, result2])
    print(myTable)
    print()

    

(4016, 21448)
Example : Recommendation using Talk Descritption

Talk name(A) -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
+-------------------------------------------------------------------+-----------------------+-----------------------+
|                        Recommended Talks(B)                       | (tagsA & tagsB)/tagsA | (tagsA & tagsB)/tagsB |
+-------------------------------------------------------------------+-----------------------+-----------------------+
|      The unexpected challenges of a country's first election      |          14.3         |          14.3         |
|                 Can democracy exist without trust?                |          28.6         |          28.6         |
| What's needed to bring the US voting system into the 21st century |          71.4         |          62.5         |
|       Does your vote co

# Recommendation using Talk title

## Term Frequency-Inverse Document Frequency (Tf-Idf)

In [256]:
# Recommendations from the talk title: TF-IDF + cosine similarity for
# unigrams, bigrams and trigrams, followed by a tag-overlap check on one talk.
from sklearn.metrics.pairwise import cosine_similarity

Text=cleaned_df['talk'].tolist()
# The documents are passed to fit_transform. The constructor's `input` option
# only selects how documents are read ('filename', 'file' or 'content'), so it
# is left at its default instead of being handed the document list.
tfidf=text.TfidfVectorizer(stop_words="english")
matrix=tfidf.fit_transform(Text)
print(matrix.shape)

## Cosine Similarity

#https://www.kaggle.com/code/gunnvant/building-content-recommender-tutorial
def get_similar_articles(x, self_idx=None, top_n=5):
    """Return the titles of the talks most similar to one talk, joined by "-,".

    Parameters
    ----------
    x : 1-D numpy array
        Similarity scores of one talk against every row of ``cleaned_df``.
    self_idx : int, optional
        Position of the talk that ``x`` belongs to. When given, that position
        is removed explicitly, so the talk is never recommended to itself even
        if another talk ties with it or the whole row is zero. When omitted,
        the single highest-ranked entry is dropped instead (original behavior).
    top_n : int, default 5
        Number of titles to return.

    Returns
    -------
    str
        The selected titles ordered from least to most similar, separated by "-,".
    """
    order = x.argsort()
    if self_idx is None:
        picks = order[-(top_n + 1):-1]
    else:
        candidates = order[order != self_idx]
        picks = candidates[max(len(candidates) - top_n, 0):]
    # argsort yields positions, so index positionally rather than by label.
    return "-,".join(cleaned_df['talk'].iloc[picks])

### Unigrams

sim_unigram=cosine_similarity(matrix)
cleaned_df['most_similar_title_unigrams']=[get_similar_articles(x, i) for i, x in enumerate(sim_unigram)]

### Bigrams

tfidf_bigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(2,2))
matrix_bigrams=tfidf_bigrams.fit_transform(Text)
sim_bigram=cosine_similarity(matrix_bigrams)
cleaned_df['most_similar_title_bigrams']=[get_similar_articles(x, i) for i, x in enumerate(sim_bigram)]

### Trigrams

tfidf_trigrams=text.TfidfVectorizer(stop_words="english",ngram_range=(3,3))
matrix_trigrams=tfidf_trigrams.fit_transform(Text)
sim_trigram=cosine_similarity(matrix_trigrams)
cleaned_df['most_similar_title_trigrams']=[get_similar_articles(x, i) for i, x in enumerate(sim_trigram)]

# Worked example on the talk at position 9. Columns are addressed by name so
# the lookup does not depend on how many columns were added before this cell.
example_talk = cleaned_df['talk'].iloc[9]
example_tags = cleaned_df['tags'].iloc[9]
example_recommendations_unigram = cleaned_df['most_similar_title_unigrams'].iloc[9].split('-,')
example_recommendations_bigram = cleaned_df['most_similar_title_bigrams'].iloc[9].split('-,')
example_recommendations_trigram = cleaned_df['most_similar_title_trigrams'].iloc[9].split('-,')

print("Example : Recommendation using Talk Title\n")
print("Talk name(A) ->",example_talk)
print("Talk tags ->",example_tags)
print()
# `example_tags` is the text form of a list; keeping only the letters of each
# comma-separated piece yields tag names that can be compared as sets.
listA = [''.join(c for c in word if c.isalpha()) for word in example_tags.split(',')]
setA = set(listA)
ratio_headers = ["Recommended Talks(B)","(tagsA & tagsB)/tagsA", "(tagsA & tagsB)/tagsB"]
# The trigram table keeps its own shorter header labels so the printed output
# of this cell stays exactly as before.
overlap_headers = ["Recommended Talks(B)", "Overlap/setA", "Overlap/setB"]
# One pass per variant replaces three copies of the same table-building loop.
for label, recommendations, headers in (
    ("Unigram", example_recommendations_unigram, ratio_headers),
    ("Bigram", example_recommendations_bigram, ratio_headers),
    ("Trigram", example_recommendations_trigram, overlap_headers),
):
    print("Recommendations " + label + " : ")
    myTable = PrettyTable(headers)
    # Stored lists run from least to most similar; reverse to show the best match first.
    for each in recommendations[::-1]:
        tagsB = cleaned_df['tags'][cleaned_df.talk==each].values
        listB = [''.join(c for c in word if c.isalpha()) for word in tagsB[0].split(',')]
        setB = set(listB)
        overlap = setA & setB
        # Percentage of shared tags relative to A's tags and to B's tags.
        result1 = round(float(len(overlap)) / len(setA) * 100,1)
        result2 = round(float(len(overlap)) / len(setB) * 100,1)
        myTable.add_row([each, result1, result2])
    print(myTable)
    print()

    

(4016, 4900)
Example : Recommendation using Talk Title

Talk name(A) -> An election system that puts voters (not politicians) first
Talk tags -> ['United States', 'democracy', 'politics', 'society', 'TEDx', 'policy', 'government']

Recommendations Unigram : 
+-------------------------------------------------------------------+-----------------------+-----------------------+
|                        Recommended Talks(B)                       | (tagsA & tagsB)/tagsA | (tagsA & tagsB)/tagsB |
+-------------------------------------------------------------------+-----------------------+-----------------------+
|                 A bold idea to replace politicians                |          71.4         |          38.5         |
|             Can you solve the fantasy election riddle?            |          14.3         |          16.7         |
|            How (and why) Russia hacked the US election            |          57.1         |          23.5         |
| How the new generation of Latin