In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
from nltk.tokenize import sent_tokenize

In [12]:
class TedTalk:
    """Explore a TED talk dataset: statistics, themes and recommendations.

    Three CSV files (speakers, talks, transcripts) are loaded and merged into
    ``preprocessed_data``, one row per talk title. Every talk gets a
    ``most_similar_transcript_unigrams`` column holding the titles of the
    talks whose transcripts are closest to it (TF-IDF + cosine similarity).
    """

    # Separator used to pack the recommended titles into one string cell.
    _REC_SEPARATOR = "-,"
    _REC_COLUMN = 'most_similar_transcript_unigrams'

    def __init__(self, data_dir='dataset'):
        """Load the three CSV files from ``data_dir`` and preprocess them.

        Args:
            data_dir: Directory containing ``speaker_data.csv``,
                ``talk_data.csv`` and ``transcript_data.csv``.
        """
        import os  # local import: keeps this block independent of the import cell

        self.speaker_data = pd.read_csv(os.path.join(data_dir, 'speaker_data.csv'))
        self.talk_data = pd.read_csv(os.path.join(data_dir, 'talk_data.csv'))
        self.transcript_data = pd.read_csv(os.path.join(data_dir, 'transcript_data.csv'))
        self.preprocessed_data = self.preprocessing()
        # Filled by get_themes(): one row per (talk, tag) pair.
        self.themes_df = pd.DataFrame()

    @staticmethod
    def _show(frame):
        """Render ``frame`` with IPython's ``display`` when available, else print it."""
        try:
            from IPython.display import display
        except ImportError:
            print(frame)
        else:
            display(frame)

    @staticmethod
    def _top_neighbours(sim_row, own_pos, k=5):
        """Return positions of the ``k`` most similar rows, least similar first.

        ``own_pos`` is removed explicitly instead of assuming it sorts last, so
        a talk never recommends itself even when another transcript ties with
        it at the maximum similarity.
        """
        order = np.argsort(sim_row)
        return order[order != own_pos][-k:]

    @staticmethod
    def _tag_set(raw):
        """Turn a stored tag value into a set of tag strings.

        The ``tags`` column holds the textual form of a Python list, so it is
        parsed with ``ast.literal_eval``; this keeps multi-word tags such as
        'global issues' intact. Anything that does not parse to a collection
        falls back to a word-level split that keeps only letters.
        """
        if isinstance(raw, (list, tuple, set)):
            parsed = raw
        else:
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError):
                parsed = None
        if isinstance(parsed, (list, tuple, set)):
            cleaned = (str(tag).strip() for tag in parsed)
        else:
            cleaned = (''.join(c for c in word if c.isalpha())
                       for word in str(raw).split())
        return {tag for tag in cleaned if tag}

    def preprocessing(self):
        """Clean and merge the raw tables and attach transcript recommendations.

        Returns:
            DataFrame with one row per unique talk title, a fresh 0..n-1 index
            (the previous index is kept as the first column, ``index``) and the
            extra ``most_similar_transcript_unigrams`` column.
        """
        # Missing speaker metadata is replaced with the placeholder 'Unknown'.
        for column in ('speaker_occ', 'speaker_bio', 'speaker'):
            self.speaker_data[column] = self.speaker_data[column].fillna('Unknown')
        # 'speaker_title' is mostly NaN and gives no useful insight for this use case.
        self.speaker_data = self.speaker_data.drop(columns=['speaker_title'])

        # A talk without a transcript cannot take part in the similarity step.
        self.transcript_data = self.transcript_data.dropna(subset=['transcript'])

        final_df = pd.merge(self.speaker_data, self.talk_data, how='inner',
                            left_on='talk', right_on='talk_name')
        # 'talk_name' duplicates 'talk' after the merge.
        final_df = final_df.drop(columns=['talk_name'])
        final_df = pd.merge(final_df, self.transcript_data, how='inner',
                            left_on='talk', right_on='title')
        final_df = final_df.drop(columns=['title'])
        # Keep a single row per talk title.
        final_df = final_df.drop_duplicates(subset=['talk'])
        # Other methods address columns by position and rely on the extra
        # leading 'index' column that reset_index() inserts here.
        final_df = final_df.reset_index()

        if final_df.empty:
            # Nothing to vectorise; keep the schema consistent.
            final_df[self._REC_COLUMN] = pd.Series(dtype=object)
            return final_df

        transcripts = final_df['transcript'].tolist()
        # The corpus goes to fit_transform(); the `input` option only selects
        # between 'filename', 'file' and 'content' and must not receive data.
        tfidf = text.TfidfVectorizer(stop_words="english")
        matrix = tfidf.fit_transform(transcripts)
        sim_unigram = cosine_similarity(matrix)

        titles = final_df['talk'].to_numpy()
        final_df[self._REC_COLUMN] = [
            self._REC_SEPARATOR.join(titles[self._top_neighbours(row, pos)])
            for pos, row in enumerate(sim_unigram)
        ]
        return final_df

    def statistics(self):
        """Show every statistics plot/table in sequence."""
        print("\nHere are few interesting insights for you")
        self.videos_per_year_graph()
        self.get_pop_theme_graph()
        self.top_speaker()
        self.most_famous_talks()
        self.top_events()

    def videos_per_year_graph(self):
        """Plot the number of talks per recording year."""
        recorded_at = pd.to_datetime(self.preprocessed_data['recorded_at'])
        talks_per_year = recorded_at.dt.year.value_counts().sort_index()

        fig = plt.figure()
        fig.set_size_inches(8, 6)
        talks_per_year.plot()
        plt.title("Rise in Videos Over the Years")
        plt.xlabel('Years')
        plt.ylabel('Number of Videos Released per year')
        plt.show()
        return

    def most_famous_talks(self):
        """Display the 25 most viewed talks."""
        print("These are our top 25 videos:")
        famous_talks = self.preprocessed_data.sort_values('views', ascending=False)[:25]
        # Columns are selected by position, as laid out by preprocessing().
        self._show(famous_talks.iloc[:, [1, 2, 5, 6, 7]])
        return

    def get_themes(self):
        """Build ``themes_df`` and return the ten most frequent themes.

        ``themes_df`` holds one row per (talk, tag) pair, keeps the talk's
        index label and stores the tag in a new ``theme`` column. The themes
        'TED-Ed' and 'TEDx' are left out of the ranking.

        Returns:
            DataFrame with columns ``theme`` and ``talks`` (talk count), at
            most ten rows, most frequent first.
        """
        parsed_tags = self.preprocessed_data['tags'].apply(ast.literal_eval)
        # assign() returns a new frame, so preprocessed_data keeps its raw tags.
        self.themes_df = (
            self.preprocessed_data
            .assign(tags=parsed_tags, theme=parsed_tags)
            .explode('theme')
        )
        # Naming both axes explicitly avoids depending on how a given pandas
        # version labels the output of value_counts().reset_index().
        pop_themes = (
            self.themes_df['theme']
            .value_counts()
            .rename_axis('theme')
            .reset_index(name='talks')
        )
        is_channel_tag = pop_themes['theme'].isin(['TED-Ed', 'TEDx'])
        return pop_themes[~is_channel_tag].head(10).reset_index(drop=True)

    def get_pop_theme_list(self):
        """Print the popular themes as a numbered list starting at 1."""
        top_10 = self.get_themes()
        print('\nHere are few popular talk themes:')
        for number, theme in enumerate(top_10['theme'], start=1):
            print('\n\t', number, theme)
        return

    def get_pop_theme_graph(self):
        """Draw a bar chart of the popular themes and their talk counts."""
        top_10 = self.get_themes()
        plt.figure(figsize=(15, 5))
        sns.barplot(x='theme', y='talks', data=top_10)
        plt.title("Most popular themes over the years")
        plt.xlabel('Themes')
        plt.ylabel('Number of talks')
        plt.show()
        return

    def get_top_100_talk_theme(self, theme, limit=25):
        """Display and return the most viewed talks tagged with ``theme``.

        Args:
            theme: Theme (tag) to filter on.
            limit: Maximum number of talks returned. The method name and the
                printed banner say 100, but the cutoff has always been 25;
                that default is kept for backward compatibility.

        Returns:
            DataFrame with the talk, speaker, event and views columns, indexed
            by the labels ``talk_details`` accepts.
        """
        print('\nHere are the top 100 talk on ', theme)
        if 'theme' not in self.themes_df.columns:
            # The per-theme table is built lazily by get_themes().
            self.get_themes()
        matching = self.themes_df[self.themes_df['theme'] == theme]
        top100 = matching.sort_values('views', ascending=False).head(limit)
        # Positions 1, 2, 6, 7 are talk, speaker, event and views.
        summary = top100.iloc[:, [1, 2, 6, 7]]
        self._show(summary)
        return summary

    def top_speaker(self):
        """Plot the 20 speakers with the highest average views per talk."""
        # Aggregating per speaker first guarantees 20 distinct speakers; taking
        # the first 20 talk rows could show far fewer.
        ranking = (
            self.preprocessed_data.groupby('speaker')['views']
            .mean()
            .sort_values(ascending=False)
            .head(20)
            .reset_index(name='speaker_avg_views')
        )
        plt.figure(figsize=(15, 5))
        sns.barplot(x='speaker', y='speaker_avg_views', data=ranking)
        plt.title("Most Popular Speakers")
        plt.xticks(rotation=90)
        plt.xlabel('Popular Speakers')
        plt.ylabel('Average Views')
        plt.show()

    def top_events(self):
        """Plot the 20 events with the highest average views per talk."""
        ranking = (
            self.preprocessed_data.groupby('event')['views']
            .mean()
            .sort_values(ascending=False)
            .head(20)
            .reset_index(name='event_avg_views')
        )
        plt.figure(figsize=(15, 5))
        sns.barplot(x='event', y='event_avg_views', data=ranking)
        plt.title("Most Popular Ted Events")
        plt.xticks(rotation=90)
        plt.xlabel('Events')
        # The bars show average views, not a number of shows.
        plt.ylabel('Average Views')
        plt.show()
        return

    def talk_details(self, talk_id):
        """Print the details of one talk followed by its recommendations.

        Args:
            talk_id: Index label of the talk in ``preprocessed_data``.

        Returns:
            ``False`` when ``talk_id`` is unknown, ``True`` otherwise.
        """
        if talk_id not in self.preprocessed_data.index:
            print("Wrong number selected. Please try again")
            return False

        talk = self.preprocessed_data.loc[talk_id]
        print('\033[1mTalk\033[0m : ', talk['talk'])
        print('\033[1mSpeaker\033[0m : ', talk['speaker'])
        print('\033[1mSpeaker Occupation\033[0m : ', talk['speaker_occ'])
        print('\033[1mTalk Description\033[0m : ', talk['talk_desc'])
        print('\033[1mEvent\033[0m : ', talk['event'])
        print('\033[1mViews\033[0m : ', talk['views'])
        print('\033[1mTags\033[0m : ', talk['tags'])
        print("\n\033[1mTranscript\033[0m: \n", talk['transcript'])
        print('\n')
        # get_similar5() prints its own report and returns None, so it is
        # called directly rather than passed to print().
        self.get_similar5(talk_id)
        return True

    def get_similar5(self, talk_index):
        """Print the recommended talks for one talk with their tag overlap.

        Recommendations are listed most similar first. "Matching" is the
        percentage of the selected talk's tags that the recommended talk
        shares.

        Args:
            talk_index: Row position of the talk in ``preprocessed_data``.
        """
        talks = self.preprocessed_data
        selected = talks.iloc[talk_index]
        example_recommendations_unigram = selected[self._REC_COLUMN].split(self._REC_SEPARATOR)
        own_tags = self._tag_set(selected['tags'])

        print(example_recommendations_unigram)
        print("\nRecommendation using Transcript\n")
        rank = 1
        # Titles are stored least similar first; reverse to lead with the best.
        for title in reversed(example_recommendations_unigram):
            tags_b = talks.loc[talks['talk'] == title, 'tags'].values
            if len(tags_b) == 0:
                # No talk with that exact title, e.g. an empty recommendation
                # cell or a title that itself contains the separator.
                continue
            overlap = own_tags & self._tag_set(tags_b[0])
            # A talk without tags matches nothing; avoid dividing by zero.
            matching = round(len(overlap) / len(own_tags) * 100, 1) if own_tags else 0.0
            print(rank, title)
            print('\n\tTags =>', tags_b)
            print('\n\tMatching =>', matching, '\n\n')
            rank += 1
        print()
        return

        
        
        
        

In [13]:
if __name__ == "__main__":
    # Build the explorer once; the interactive cell reuses it as `ttrs`.
    ttrs = TedTalk()


In [15]:
# Interactive entry point: show the statistics dashboard, or let the user pick
# a popular theme and then one talk from it to read.
print("\n\033[1mTED TALK App\033[0m")
print("\nLets see if we can keep you enganged")
print("Here are your options:")
print("\n\t1. View overall statistics of the Ted Talks over the years")
print("\n\t2. No stats pls! I want to see some good videos")
print("\nEnter your choice : ")
x = input()
if x.strip() == '1':
    # Option 1 advertises the statistics; run them rather than a debug query.
    ttrs.statistics()
else:
    # Loop until a talk has been shown, so that invalid input and the 9999
    # "other theme" sentinel return to the theme list instead of crashing.
    while True:
        ttrs.get_pop_theme_list()
        top_10_themes = ttrs.get_themes()
        print("\nWhich one would you like to checkout. Enter number:")
        theme_choice = input()
        try:
            theme_pos = int(theme_choice) - 1
        except ValueError:
            theme_pos = -1
        if not 0 <= theme_pos < len(top_10_themes):
            print("Wrong number selected. Please try again")
            continue
        top100_talks_theme = ttrs.get_top_100_talk_theme(top_10_themes['theme'][theme_pos])
        print('-->If you want to checkout any of these enter the number you see next the Ted Talk name.',
              '\n-->If you want to explore some other theme, enter 9999')
        try:
            talk_choice = int(input())
        except ValueError:
            print("Wrong number selected. Please try again")
            continue
        if talk_choice == 9999:
            # Sentinel announced above: go back to the theme list.
            continue
        status = ttrs.talk_details(talk_choice)
        # talk_details() reports an unknown talk by returning False.
        if status is not False:
            break



[1mTED TALK App[0m

Lets see if we can keep you enganged
Here are your options:

	1. View overall statistics of the Ted Talks over the years

	2. No stats pls! I want to see some good videos

Enter your choice : 
2

Here are few popular talk themes:

	 1 science

	 2 technology

	 3 culture

	 4 animation

	 5 society

	 6 global issues

	 7 social change

	 8 education

	 9 design

	 10 health

Which one would you like to checkout. Enter number:
8

Here are the top 100 talk on  education


Unnamed: 0,talk,speaker,event,views
4012,Do schools kill creativity?,Sir Ken Robinson,TED2006,70176973
2657,Grit,Angela Lee Duckworth,TED Talks Education,23109126
3003,Questions no one knows the answers to,Chris Anderson,TED-Ed,22980323
1117,The language of lying,Noah Zandan,TED-Ed,15462918
2668,Every kid needs a champion,Rita F. Pierson,TED Talks Education,12550857
2223,The power of believing that you can improve,Carol Dweck,TEDxNorrkoping,12423718
3103,A 12-year-old app developer,Thomas Suarez,TEDxManhattanBeach,12177882
961,The secrets of learning a new language,Lýdia Machová,TED Salon Brightline Initiative,11639417
1993,What would happen if you didn't sleep?,Claudia Aguirre,TED-Ed,11228817
1983,"Can you solve ""Einstein's Riddle""?",Dan Vieren,TED-Ed,10910767


-->If you want to checkout any of these enter the number you see next the Ted Talk name. 
-->If you want to explore some other theme, enter 9999
2323
[1mTalk[0m :  An ultra-low-cost college degree
[1mSpeaker[0m :  Shai Reshef
[1mSpeaker Occupation[0m :  Education entrepreneur
[1mTalk Description[0m :  At the online University of the People, anyone with a high school diploma can take classes toward a degree in business administration or computer science — without standard tuition fees (though exams cost money). Founder Shai Reshef hopes that higher education is changing "from being a privilege for the few to a basic right, affordable and accessible for all."
[1mEvent[0m :  TED2014
[1mViews[0m :  6374825
[1mTags[0m :  ['business', 'computers', 'education', 'open-source', 'technology', 'code']

[1mTranscript[0m: 
 I would like to share with you a new model of higher education, a model that, once expanded, can enhance the collective intelligence of millions of creative and 