In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast

In [2]:
class TedTalk:
    """Small interactive explorer for a TED-talk dataset stored as three CSV files.

    On construction the speaker, talk and transcript tables are loaded and
    merged into one frame (``preprocessed_data``) with a single row per talk
    name. The other methods print, plot or return summaries of that frame.

    Attributes:
        speaker_data: speaker table after missing-value cleanup.
        talk_data: talk table exactly as loaded.
        transcript_data: transcript table without rows lacking a transcript.
        preprocessed_data: merged, de-duplicated frame produced by
            ``preprocessing``.
        themes_df: ``preprocessed_data`` expanded to one row per (talk, tag)
            pair with an extra ``theme`` column; empty until first needed.
    """

    # Tags that are skipped when listing the most popular themes.
    _EXCLUDED_THEMES = ('TED-Ed', 'TEDx')

    def __init__(self, data_dir='dataset'):
        """Load the three CSV files and build the merged frame.

        Args:
            data_dir: directory holding ``speaker_data.csv``,
                ``talk_data.csv`` and ``transcript_data.csv``. Defaults to
                ``'dataset'``, the location that was previously hard-coded.
        """
        self.speaker_data = pd.read_csv(f'{data_dir}/speaker_data.csv')
        self.talk_data = pd.read_csv(f'{data_dir}/talk_data.csv')
        self.transcript_data = pd.read_csv(f'{data_dir}/transcript_data.csv')
        self.preprocessed_data = self.preprocessing()
        self.themes_df = pd.DataFrame()

    @staticmethod
    def _show(frame):
        """Render ``frame`` with IPython's ``display`` when present, else ``print``."""
        try:
            # `display` is only a defined name inside IPython/Jupyter sessions.
            renderer = display
        except NameError:
            renderer = print
        renderer(frame)

    def preprocessing(self):
        """Clean the raw tables and merge them into one row per talk.

        Side effects: ``self.speaker_data`` and ``self.transcript_data`` are
        replaced by their cleaned versions.

        Returns:
            DataFrame joining speakers, talks and transcripts on the talk
            name, without the redundant ``talk_name``/``title`` key columns
            and with duplicate talk names removed (first occurrence kept).
        """
        # Missing free-text speaker fields become the literal 'Unknown'.
        text_columns = ['speaker_occ', 'speaker_bio', 'speaker']
        self.speaker_data[text_columns] = self.speaker_data[text_columns].fillna('Unknown')
        # 'speaker_title' is dropped: per the original author's note it is
        # mostly NaN and not useful for this use case.
        self.speaker_data = self.speaker_data.drop(columns=['speaker_title'])

        # A talk without a transcript is excluded from the merged data.
        self.transcript_data = self.transcript_data.dropna(subset=['transcript'])

        # Inner joins: only talks present in all three tables survive.
        final_df = pd.merge(self.speaker_data, self.talk_data, how='inner',
                            left_on='talk', right_on='talk_name')
        final_df = final_df.drop(columns=['talk_name'])  # duplicate of 'talk'
        final_df = pd.merge(final_df, self.transcript_data, how='inner',
                            left_on='talk', right_on='title')
        final_df = final_df.drop(columns=['title'])  # duplicate of 'talk'
        # Keep a single row per talk name.
        return final_df.drop_duplicates(subset=['talk'])

    def statistics(self):
        """Print the overview: talks-per-year plot followed by the top-25 table."""
        print("\nHere are few interesting insights for you")
        self.videos_per_year_graph()
        self.most_famous_talks()

    def videos_per_year_graph(self):
        """Plot how many talks were recorded in each calendar year.

        The year is taken from the ``recorded_at`` column. Nothing is
        returned; the figure is shown via ``plt.show()``.
        """
        # Only the year series is needed, so the frame is neither copied nor
        # extended with helper columns.
        years = pd.to_datetime(self.preprocessed_data['recorded_at']).dt.year
        talks_per_year = years.value_counts().sort_index()

        fig, ax = plt.subplots(figsize=(8, 6))
        talks_per_year.plot(ax=ax)
        ax.set_xlabel('Years')
        ax.set_ylabel('Number of Videos Released per year')
        plt.show()
        return

    def most_famous_talks(self):
        """Show the 25 talks with the highest ``views`` value."""
        print("These are our top 25 videos:")
        famous_talks = self.preprocessed_data.sort_values('views', ascending=False).head(25)
        # Columns are picked by position in the merged frame.
        # NOTE(review): positions depend on the CSV column order - confirm.
        self._show(famous_talks.iloc[:, [0, 1, 2, 5, 6]])
        return

    def _build_themes_df(self):
        """Return ``preprocessed_data`` with one row per (talk, tag) pair.

        The ``tags`` column holds the string form of a Python list, so it is
        parsed with ``ast.literal_eval``; each element then becomes the value
        of a new trailing ``theme`` column (NaN for an empty tag list).
        """
        talks = self.preprocessed_data.copy(deep=True)
        talks['tags'] = talks['tags'].apply(ast.literal_eval)
        # explode() repeats the row (and its index label) once per tag.
        return talks.assign(theme=talks['tags']).explode('theme')

    def get_themes(self):
        """Rebuild ``themes_df`` and print/return the ten most common themes.

        Themes listed in ``_EXCLUDED_THEMES`` are skipped.

        Returns:
            Series (0-based index) of up to ten theme names ordered by the
            number of talks carrying them, most frequent first.
        """
        self.themes_df = self._build_themes_df()
        pop_themes = (
            self.themes_df['theme']
            .value_counts()
            .rename_axis('theme')
            .reset_index(name='talks')
        )
        keep = ~pop_themes['theme'].isin(self._EXCLUDED_THEMES)
        top_10 = pop_themes.loc[keep, 'theme'].head(10).reset_index(drop=True)
        print('\nHere are few popular talk themes:')
        for rank, theme_name in enumerate(top_10, start=1):
            print('\n\t', rank, theme_name)
        return top_10

    def get_top_100_talk_theme(self, theme):
        """Show and return the 100 most viewed talks tagged with ``theme``.

        Args:
            theme: tag value to filter on (exact match).

        Returns:
            DataFrame of at most 100 rows, sorted by ``views`` descending,
            restricted to the columns at positions 0, 1, 5 and 6.
        """
        # themes_df starts out empty; build it on demand so this method also
        # works when get_themes() has not been called first.
        if 'theme' not in self.themes_df.columns:
            self.themes_df = self._build_themes_df()
        print('\nHere are the top 100 talk on ', theme)
        matching = self.themes_df[self.themes_df['theme'] == theme]
        # Previously sliced to 25 rows although name and message promise 100.
        top100 = matching.sort_values('views', ascending=False).head(100)
        # NOTE(review): positional columns depend on the CSV column order - confirm.
        selection = top100.iloc[:, [0, 1, 5, 6]]
        self._show(selection)
        return selection

        
        
        

In [4]:
# Interactive entry point: loads the dataset, then either shows the overall
# statistics or lets the user pick one of the popular themes by number.
if __name__=="__main__":
    ttrs = TedTalk()
    print("\033[1mTED TALK App\033[0m")
    print("\nLets see if we can keep you enganged")
    print("Here are your options:")
    print("\n\t1. View overall statistics of the Ted Talks over the years")
    print("\n\t2. No stats pls! I want to see some good videos")
    print("\nEnter your choice : ")
    x = input().strip()
    if x=='1':
        ttrs.statistics()
    else:
        # Any answer other than '1' leads to the theme browser.
        top_10_themes = ttrs.get_themes()
        print("\nWhich one would you like to checkout. Enter number:")
        theme_choice = input().strip()
        # The menu is 1-based; convert to a 0-based position and validate it so
        # that non-numeric or out-of-range input does not end in a traceback.
        try:
            theme_position = int(theme_choice) - 1
        except ValueError:
            theme_position = -1
        if 0 <= theme_position < len(top_10_themes):
            top100_talks_theme = ttrs.get_top_100_talk_theme(top_10_themes[theme_position])
        else:
            print("\nInvalid choice. Please enter a number between 1 and", len(top_10_themes))
        
        

[1mTED TALK App[0m

Lets see if we can keep you enganged
Here are your options:

	1. View overall statistics of the Ted Talks over the years

	2. No stats pls! I want to see some good videos

Enter your choice : 
2

Here are few popular talk themes:

	 1 science

	 2 technology

	 3 culture

	 4 animation

	 5 society

	 6 global issues

	 7 social change

	 8 education

	 9 design

	 10 health

Which one would you like to checkout. Enter number:
1

Here are the top 100 talk on  science


Unnamed: 0,talk,speaker,event,views
5984,Your body language may shape who you are,Amy Cuddy,TEDGlobal 2012,61030600
6775,10 things you didn't know about orgasm,Mary Roach,TED2009,33320592
6732,The puzzle of motivation,Dan Pink,TEDGlobal 2009,27053706
6192,The happy secret to better work,Shawn Achor,TEDxBloomington,23519204
1634,The future we're building -- and boring,Elon Musk,TED2017,23439607
6154,Questions no one knows the answers to,Chris Anderson,TED-Ed,22980323
2260,What are those floaty things in your eye?,Michael Mauser,TED-Ed,19951792
7143,The surprising science of happiness,Dan Gilbert,TED2004,19503481
1966,A simple way to break a bad habit,Judson Brewer,TEDMED 2015,16643344
7020,Underwater astonishments,David Gallo,TED2007,16196392
