In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
speaker_data = pd.read_csv('dataset/speaker_data.csv')
talk_data = pd.read_csv('dataset/talk_data.csv')
transcript_data = pd.read_csv('dataset/transcript_data.csv')

In [3]:
speaker_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4442 entries, 0 to 4441
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   talk           4301 non-null   object
 1   speaker        4297 non-null   object
 2   speaker_title  9 non-null      object
 3   speaker_occ    3653 non-null   object
 4   speaker_bio    3678 non-null   object
dtypes: object(5)
memory usage: 173.6+ KB


In [4]:
speaker_data.shape

(4442, 5)

In [5]:
talk_data.shape

(4322, 8)

In [6]:
transcript_data.shape

(4442, 2)

In [7]:
speaker_data.isnull().sum()

talk              141
speaker           145
speaker_title    4433
speaker_occ       789
speaker_bio       764
dtype: int64

In [8]:
speaker_data = speaker_data.dropna(subset=['talk'])

In [9]:
speaker_data.isnull().sum()

talk                0
speaker             4
speaker_title    4292
speaker_occ       648
speaker_bio       623
dtype: int64

In [10]:
# filling 648 'speaker_occ'=Nan with value 'Unknown'
speaker_data['speaker_occ'] = speaker_data['speaker_occ'].fillna('Unknown')

In [11]:
speaker_data.isnull().sum()

talk                0
speaker             4
speaker_title    4292
speaker_occ         0
speaker_bio       623
dtype: int64

In [12]:
#filling 623 'speaker_bio'=Nan with value 'Unknown'
speaker_data['speaker_bio'] = speaker_data['speaker_bio'].fillna('Unknown')

In [13]:
speaker_data.isnull().sum()

talk                0
speaker             4
speaker_title    4292
speaker_occ         0
speaker_bio         0
dtype: int64

In [14]:
speaker_data['speaker'] = speaker_data['speaker'].fillna('Unknown')

In [15]:
speaker_data.isnull().sum()

talk                0
speaker             0
speaker_title    4292
speaker_occ         0
speaker_bio         0
dtype: int64

In [16]:
#dropping the 'speaker_title' since majority is Nan and it does not provide any useful insight for our use case
speaker_data = speaker_data.drop(columns=['speaker_title'])

In [17]:
speaker_data

Unnamed: 0,talk,speaker,speaker_occ,speaker_bio
0,Can you outsmart the apples and oranges fallacy?,Elizabeth Cox,Unknown,Unknown
1,The exploitation of US college athletes,Tim Nevius,College sports lawyer,Tim Nevius is a leading sports lawyer and coll...
2,How does ultrasound work?,Jacques Abramowicz,Unknown,Unknown
3,"An honest history of an ancient and ""nasty"" word",Kate Lister,Sex historian,Kate Lister is a sex historian and lecturers a...
4,The electrical blueprints that orchestrate life,Michael Levin,Bioelectric explorer,Michael Levin's research could give rise to ad...
...,...,...,...,...
4437,The best stats you've ever seen,Hans Rosling,Global health expert; data visionary,"In Hans Rosling’s hands, data sings. Global tr..."
4438,Do schools kill creativity?,Sir Ken Robinson,"Author, educator",Creativity expert Sir Ken Robinson challenged ...
4439,Greening the ghetto,Majora Carter,Activist for environmental justice,Majora Carter redefined the field of environme...
4440,Simplicity sells,David Pogue,Technology columnist,David Pogue is the personal technology columni...


In [18]:
talk_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4322 entries, 0 to 4321
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   talk_desc     4322 non-null   object
 1   event         4322 non-null   object
 2   talk_name     4322 non-null   object
 3   views         4322 non-null   int64 
 4   duration      4322 non-null   int64 
 5   tags          4322 non-null   object
 6   recorded_at   4322 non-null   object
 7   published on  4322 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 270.2+ KB


In [19]:
talk_data.isnull().sum()

talk_desc       0
event           0
talk_name       0
views           0
duration        0
tags            0
recorded_at     0
published on    0
dtype: int64

In [20]:
transcript_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4442 entries, 0 to 4441
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       4442 non-null   object
 1   transcript  4298 non-null   object
dtypes: object(2)
memory usage: 69.5+ KB


In [21]:
transcript_data.isnull().sum()

title           0
transcript    144
dtype: int64

In [22]:
transcript_data = transcript_data.dropna(subset=['transcript'])

In [23]:
transcript_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4298 entries, 0 to 4441
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       4298 non-null   object
 1   transcript  4298 non-null   object
dtypes: object(2)
memory usage: 100.7+ KB


In [24]:
speaker_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4301 entries, 0 to 4441
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   talk         4301 non-null   object
 1   speaker      4301 non-null   object
 2   speaker_occ  4301 non-null   object
 3   speaker_bio  4301 non-null   object
dtypes: object(4)
memory usage: 168.0+ KB


In [25]:
talk_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4322 entries, 0 to 4321
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   talk_desc     4322 non-null   object
 1   event         4322 non-null   object
 2   talk_name     4322 non-null   object
 3   views         4322 non-null   int64 
 4   duration      4322 non-null   int64 
 5   tags          4322 non-null   object
 6   recorded_at   4322 non-null   object
 7   published on  4322 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 270.2+ KB


In [26]:
final_df = pd.merge(speaker_data, talk_data, how='inner', left_on = 'talk', right_on = 'talk_name')

In [27]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4418 entries, 0 to 4417
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   talk          4418 non-null   object
 1   speaker       4418 non-null   object
 2   speaker_occ   4418 non-null   object
 3   speaker_bio   4418 non-null   object
 4   talk_desc     4418 non-null   object
 5   event         4418 non-null   object
 6   talk_name     4418 non-null   object
 7   views         4418 non-null   int64 
 8   duration      4418 non-null   int64 
 9   tags          4418 non-null   object
 10  recorded_at   4418 non-null   object
 11  published on  4418 non-null   int64 
dtypes: int64(3), object(9)
memory usage: 448.7+ KB


In [28]:
#Dropping the duplicate column 'talk_name'
final_df = final_df.drop(columns=['talk_name'])

In [29]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4418 entries, 0 to 4417
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   talk          4418 non-null   object
 1   speaker       4418 non-null   object
 2   speaker_occ   4418 non-null   object
 3   speaker_bio   4418 non-null   object
 4   talk_desc     4418 non-null   object
 5   event         4418 non-null   object
 6   views         4418 non-null   int64 
 7   duration      4418 non-null   int64 
 8   tags          4418 non-null   object
 9   recorded_at   4418 non-null   object
 10  published on  4418 non-null   int64 
dtypes: int64(3), object(8)
memory usage: 414.2+ KB


In [30]:
final_df = pd.merge(final_df, transcript_data, how='inner', left_on = 'talk', right_on = 'title')

In [31]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7167 entries, 0 to 7166
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   talk          7167 non-null   object
 1   speaker       7167 non-null   object
 2   speaker_occ   7167 non-null   object
 3   speaker_bio   7167 non-null   object
 4   talk_desc     7167 non-null   object
 5   event         7167 non-null   object
 6   views         7167 non-null   int64 
 7   duration      7167 non-null   int64 
 8   tags          7167 non-null   object
 9   recorded_at   7167 non-null   object
 10  published on  7167 non-null   int64 
 11  title         7167 non-null   object
 12  transcript    7167 non-null   object
dtypes: int64(3), object(10)
memory usage: 783.9+ KB


In [32]:
final_df = final_df.drop(columns=['title'])

In [33]:
#Removing all rows which have duplicated talk name
final_df = final_df.drop_duplicates(subset=['talk'])

In [34]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4016 entries, 0 to 7166
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   talk          4016 non-null   object
 1   speaker       4016 non-null   object
 2   speaker_occ   4016 non-null   object
 3   speaker_bio   4016 non-null   object
 4   talk_desc     4016 non-null   object
 5   event         4016 non-null   object
 6   views         4016 non-null   int64 
 7   duration      4016 non-null   int64 
 8   tags          4016 non-null   object
 9   recorded_at   4016 non-null   object
 10  published on  4016 non-null   int64 
 11  transcript    4016 non-null   object
dtypes: int64(3), object(9)
memory usage: 407.9+ KB


In [35]:
final_df.speaker.value_counts()

Alex Gendler                     35
Iseult Gillespie                 27
Daniel Finkel                    12
Elizabeth Cox                    11
Emma Bryce                       10
                                 ..
Christian Rudder                  1
Alex Kipman                       1
Martine Rothblatt                 1
Paul Snelgrove                    1
Elizabeth Camarillo Gutierrez     1
Name: speaker, Length: 3292, dtype: int64

In [None]:
final_df.to_csv('cleaned data/vaishak_data.csv',index=False)

In [None]:
final_df.iloc[3][8]