In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go 
import datetime
import ast

In [38]:
# Import data
ted_talk = pd.read_csv('ted_main.csv')
ted_talk['film_date'] = pd.to_datetime(ted_talk['film_date'],unit = 's')
ted_talk['published_date'] = pd.to_datetime(ted_talk['film_date'],unit = 's')
ted_talk.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,2006-02-25,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,2006-02-25,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,2006-02-25,43,Al Gore,Al Gore: Averting the climate crisis,1,2006-02-25,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,2006-02-24,26,David Pogue,David Pogue: Simplicity sells,1,2006-02-24,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,2006-02-26,35,Majora Carter,Majora Carter: Greening the ghetto,1,2006-02-26,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,2006-02-22,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,2006-02-22,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


# EDA

### 1. Find Null value

In [39]:
# Overall Data
ted_talk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   comments            2550 non-null   int64         
 1   description         2550 non-null   object        
 2   duration            2550 non-null   int64         
 3   event               2550 non-null   object        
 4   film_date           2550 non-null   datetime64[ns]
 5   languages           2550 non-null   int64         
 6   main_speaker        2550 non-null   object        
 7   name                2550 non-null   object        
 8   num_speaker         2550 non-null   int64         
 9   published_date      2550 non-null   datetime64[ns]
 10  ratings             2550 non-null   object        
 11  related_talks       2550 non-null   object        
 12  speaker_occupation  2544 non-null   object        
 13  tags                2550 non-null   object      

In [40]:
# Find Null or NA value

ted_talk.isna().sum()

comments              0
description           0
duration              0
event                 0
film_date             0
languages             0
main_speaker          0
name                  0
num_speaker           0
published_date        0
ratings               0
related_talks         0
speaker_occupation    6
tags                  0
title                 0
url                   0
views                 0
dtype: int64

The data has 2550 records, 17 columns. There is no null value in this data set

### 2. Distribution of the data

#### Comment

In [41]:
fig = px.box(ted_talk, y = 'comments', title = 'Distribution of comments', color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')

#### Duration

In [42]:
fig = px.box(ted_talk, y = 'duration', title = 'Distribution of duration', color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')

#### Event

In [43]:
count_talk = ted_talk.groupby('event')['event'].count()
fig = px.box(count_talk, y = count_talk, hover_data = [count_talk.index], title = 'Distribution of event counts', color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')
fig.update_yaxes(showgrid=False,visible=True,title="Event Counts")
fig.show()

Film date and Published Date

In [44]:
ted_talk[ted_talk['published_date'] != ted_talk['film_date']]

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views


Film_date equal to published_date for all records. Thus, we can use any columns to filter the year.

In [45]:
year_count = pd.DataFrame(ted_talk.groupby([ted_talk['film_date'].dt.year]).film_date.count())
year_count.columns = ['count']
year_count = year_count.reset_index()

fig = px.bar(year_count, x = 'film_date', y = 'count', color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')

As year 1972 - 2001 has less than 10 talks, we will exclude this year from our dashboard

In [46]:
ted_talk = ted_talk[ted_talk['film_date'].dt.year >= 2002].reset_index(drop = True)

ted_talk.shape

(2533, 17)

Find the year that does not align with event name

In [47]:
year_name = ted_talk.loc[:,['event','film_date']]
year_name['year_from_name'] = year_name['event'].apply(lambda x: x[-4:])
year_name['year_date'] = year_name['film_date'].dt.year

year_name[(year_name['year_from_name'] > year_name['year_date'].apply(lambda x: str(x))) & (year_name['year_from_name'].apply(lambda x: x.isnumeric()))]

Unnamed: 0,event,film_date,year_from_name,year_date
822,TEDSalon NY2011,2010-11-18,2011,2010


In [48]:
ted_talk[((ted_talk['event'] == 'TEDSalon NY2011') & (ted_talk['film_date'] == '2010-11-18'))]

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
822,272,"In an intimate talk, Barry Schwartz dives into...",1387,TEDSalon NY2011,2010-11-18,32,Barry Schwartz,Barry Schwartz: Using our practical wisdom,1,2010-11-18,"[{'id': 3, 'name': 'Courageous', 'count': 175}...","[{'id': 462, 'hero': 'https://pe.tedcdn.com/im...",Psychologist,"['culture', 'global issues', 'happiness', 'phi...",Using our practical wisdom,https://www.ted.com/talks/barry_schwartz_using...,948622


We will drop row that have film_date before the event date.

In [49]:
ted_talk = ted_talk[~((ted_talk['event'] == 'TEDSalon NY2011') & (ted_talk['film_date'] == '2010-11-18'))].reset_index(drop = True)

ted_talk.shape

(2532, 17)

#### Number of speakers

In [50]:
count_speaker = ted_talk.groupby(['num_speaker'])['num_speaker'].count()
fig = px.bar(count_speaker,x = count_speaker.index, y = count_speaker, title = 'Number of speakers of the talk', color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')
fig.update_yaxes(showgrid=False,visible=True, title="Count")
fig.update_xaxes(title = 'Number of speaker')
fig.show()

#### Rating

In [51]:
for i in range(len(ted_talk)):
    if i == 0:
        rating = pd.DataFrame(ast.literal_eval(ted_talk.ratings[i]))
    else:
        rating = rating.append(pd.DataFrame(ast.literal_eval(ted_talk.ratings[i])))

rating

Unnamed: 0,id,name,count
0,7,Funny,19645
1,1,Beautiful,4573
2,9,Ingenious,6073
3,3,Courageous,3253
4,11,Longwinded,387
...,...,...,...
9,11,Longwinded,9
10,24,Persuasive,5
11,23,Jaw-dropping,3
12,3,Courageous,0


In [52]:
rating_count = pd.DataFrame(rating.groupby(['id','name'])['count'].sum()).reset_index()

fig = px.bar(rating_count, x = 'name', y = 'count', color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')
fig.update_yaxes(showgrid=False,visible=True, title="Count")
fig.update_xaxes(title = 'Ratings')
fig.show()

#### Tags

In [53]:
for i in range(len(ted_talk)):
    if i == 0:
        tag = pd.DataFrame(ast.literal_eval(ted_talk.tags[i]),columns = ['tags'])
    else:
        tag = tag.append(pd.DataFrame(ast.literal_eval(ted_talk.tags[i]),columns = ['tags']))

tag

Unnamed: 0,tags
0,children
1,creativity
2,culture
3,dance
4,education
...,...
4,play
5,public spaces
6,society
7,software


In [54]:
count_tag = pd.DataFrame(tag.groupby('tags')['tags'].count().sort_values(ascending = False))
count_tag.columns = ['count']
count_tag = count_tag.reset_index()

fig = px.box(count_tag, hover_data = ['tags'], title = 'Distribution of tag counts', color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')
fig.update_xaxes(title = 'Tag counts')
fig.show()

In [55]:
fig = px.bar(count_tag.head(10), x = count_tag['tags'].head(10), y = count_tag['count'].head(10), color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')
fig.update_yaxes(showgrid=False,visible=True, title="Count")
fig.update_xaxes(title = 'Tags')
fig.show()

#### View

In [56]:
fig = px.box(ted_talk[['views','title']],hover_data = ['title'], color_discrete_sequence = ['red'])
fig.update_layout(
    template= 'plotly_dark')
fig.update_yaxes(showgrid=False,visible=True, title="Value")
fig.update_xaxes(title = 'Views')
fig.show()

In [57]:
ted_talk.groupby(ted_talk['film_date'].dt.year).views.agg(['count','mean','median']).reset_index().sort_values('film_date')

Unnamed: 0,film_date,count,mean,median
0,2002,27,961821.6,744257.0
1,2003,33,1098728.0,774492.0
2,2004,33,2552288.0,1205867.0
3,2005,66,1795745.0,1000579.0
4,2006,50,3053768.0,1026187.5
5,2007,114,1196743.0,745651.0
6,2008,84,1777958.0,1009386.0
7,2009,232,1679581.0,805672.0
8,2010,266,1447511.0,900168.5
9,2011,270,1491156.0,938716.0


# Engagement Charts

In [58]:
# Create new variables `interaction`: comments / views
ted_talk["interaction"] = round((ted_talk["comments"] / ted_talk["views"]) * 100, 2)

# Create new variable `year`
ted_talk['year'] = ted_talk['film_date'].dt.year

ted_talk.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,interaction,year
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,2006-02-25,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,2006-02-25,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,0.01,2006
1,265,With the same humor and humanity he exuded in ...,977,TED2006,2006-02-25,43,Al Gore,Al Gore: Averting the climate crisis,1,2006-02-25,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,0.01,2006
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,2006-02-24,26,David Pogue,David Pogue: Simplicity sells,1,2006-02-24,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292,0.01,2006
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,2006-02-26,35,Majora Carter,Majora Carter: Greening the ghetto,1,2006-02-26,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550,0.01,2006
4,593,You've never seen data presented like this. Wi...,1190,TED2006,2006-02-22,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,2006-02-22,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869,0.0,2006
