Import libraries and dataset

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime

In [31]:
import os

# Location of the TED talks CSV.
# The default below is an absolute path inside one Windows user profile, so it
# will not resolve on other machines. Set the TED_DATA_PATH environment variable
# to point at a local copy of the file; when it is unset the original path is
# used unchanged.
file_path = os.environ.get(
    "TED_DATA_PATH",
    r"C:\Users\Tanishq\Documents\stuttgart\Study\winter 25\DS and AI for industry work1\project\data_science_project\ted_main_v2.csv\ted_main_v2.csv",
)

In [32]:
import os

# Confirm that the dataset path resolves before attempting to read it.
path_found = os.path.exists(file_path)
print(path_found)

True


In [33]:
# Read the CSV at `file_path` into the notebook's working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [34]:
# Peek at the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,speaker_name,title,posted_date,duration,Link,about_speaker,about_talk,views,tags
0,Alex Gendler,The Egyptian myth of the death of Osiris,Jul 2020,3:56,https://www.ted.com/talks/alex_gendler_the_egy...,,"Long jealous of his older brother Osiris, the ...",208703,"education,ancient world,TED-Ed"
1,Shari Davis,What if you could help decide how the governme...,Jul 2020,10:28,https://www.ted.com/talks/shari_davis_what_if_...,As a leader of the Participatory Budgeting Pro...,What if you could help decide how the governme...,425688,"democracy,leadership,community"
2,Nita Mosby Tyler,Want a more just world? Be an unlikely ally,Jul 2020,10:15,https://www.ted.com/talks/nita_mosby_tyler_wan...,Nita Mosby Tyler specializes in the developmen...,A more equal world starts with you. Citing a f...,460269,"activism,inequality,race"
3,Susan Lupack,The race to decode a mysterious language,Jul 2020,4:24,https://www.ted.com/talks/susan_lupack_the_rac...,,"In the early 1900s, archaeologist Sir Arthur E...",350202,"TED-Ed,education,language"
4,Ariel Waldman,The colorful critter world of microbes in Anta...,Jul 2020,5:56,https://www.ted.com/talks/ariel_waldman_the_co...,"An artist who's pivoted to science, Ariel Wald...","In this tour of the microscopic world, explore...",333482,"science,animals,exploration"


In [35]:
# Show the two columns that are cleaned next, as they were read from the CSV.
preview_columns = ['views', 'duration']
print("Raw Data Preview:")
print(df[preview_columns].head(10))


Raw Data Preview:
     views duration
0  208,703     3:56
1  425,688    10:28
2  460,269    10:15
3  350,202     4:24
4  333,482     5:56
5   50,273    14:51
6  169,272     5:10
7  575,657    10:58
8      NaN    26:32
9  262,067     4:54


In [36]:
# Clean 'views' column: placeholder entries become NaN, comma separators are
# stripped from the text form, and the result is parsed as a number
# (anything that still cannot be parsed is coerced to NaN).
views_as_text = (
    df['views']
    .replace(['', 'N/A', 'None'], np.nan)
    .astype(str)
    .str.replace(r',', '', regex=True)
)
df['views'] = pd.to_numeric(views_as_text, errors='coerce')

# Clean 'duration' column
def convert_duration_to_seconds(duration):
    """Convert a clock-style duration string to a total number of seconds.

    Accepts 'M:SS' (e.g. '10:28' -> 628) and 'H:MM:SS' (e.g. '1:02:03' -> 3723).

    Parameters
    ----------
    duration : str
        Colon-separated duration. Any other value (NaN, None, numbers, text
        that is not made of integer fields) is treated as invalid.

    Returns
    -------
    int or float
        Total seconds as an int, or ``np.nan`` when the value cannot be parsed.
    """
    try:
        fields = [int(field) for field in duration.split(':')]
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a text value (e.g. a NaN float or None).
        # ValueError: a field is not an integer.
        # Only these are caught so that KeyboardInterrupt and other unrelated
        # errors are no longer silently swallowed.
        return np.nan

    # Only M:SS and H:MM:SS are meaningful; anything else is an invalid format.
    if len(fields) not in (2, 3):
        return np.nan

    # Fold left to right in base 60: ((h * 60) + m) * 60 + s.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + field
    return total_seconds

# Convert duration strings to seconds. The conversion is skipped when the column
# is already numeric: the helper returns NaN for anything that is not a string,
# so re-running this cell would otherwise replace every duration with NaN.
if not pd.api.types.is_numeric_dtype(df['duration']):
    df['duration'] = df['duration'].apply(convert_duration_to_seconds)

# Verify cleaned data
print("Cleaned Data Preview:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df[['views', 'duration']].info()
print(df[['views', 'duration']].describe())

# Step 1: Number of unique videos (distinct values in 'title')
num_unique_videos = df['title'].nunique()

# Step 2: Number of unique speakers (distinct values in 'speaker_name')
num_unique_speakers = df['speaker_name'].nunique()

# Step 3: Top 5 viewed videos
top_5_videos = df[['title', 'views']].sort_values(by='views', ascending=False).head(5)

# Display results
result = {
    "Number of Unique Videos": num_unique_videos,
    "Number of Unique Speakers": num_unique_speakers,
    "Top 5 Viewed Videos": top_5_videos
}
# Print each entry separately: print(result) embeds the DataFrame's multi-line
# repr inside the dict repr, which misaligns the table and truncates long titles.
for label, value in result.items():
    if isinstance(value, pd.DataFrame):
        print(f"{label}:")
        print(value.to_string())  # to_string() does not shorten cell contents
    else:
        print(f"{label}: {value}")

Cleaned Data Preview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   views     2155 non-null   float64
 1   duration  2155 non-null   float64
dtypes: float64(2)
memory usage: 33.9 KB
None
              views     duration
count  2.155000e+03  2155.000000
mean   2.060034e+06   659.196752
std    2.944472e+06   340.526498
min    1.023100e+04    60.000000
25%    9.884700e+05   343.500000
50%    1.496189e+06   682.000000
75%    2.128472e+06   863.500000
max    6.023746e+07  3503.000000
{'Number of Unique Videos': 2160, 'Number of Unique Speakers': 1915, 'Top 5 Viewed Videos':                                                   title       views
1855  This is what happens when you reply to spam email  60237459.0
1800         Inside the mind of a master procrastinator  40135933.0
2056                 The next outbreak? We're not ready  36342453.0
1136     

In [37]:
# Remove rows where 'views' is NaN
df = df.dropna(subset=['views'])
assert df['views'].notna().all(), "rows with missing 'views' should have been dropped"

# Verify the updated data
print("Updated Data Info:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df[['views', 'duration']].info()

# Step 1: Number of unique videos (distinct values in 'title')
num_unique_videos = df['title'].nunique()

# Step 2: Number of unique speakers (distinct values in 'speaker_name')
num_unique_speakers = df['speaker_name'].nunique()

# Step 3: Top 5 viewed videos
top_5_videos = df[['title', 'views']].sort_values(by='views', ascending=False).head(5)

# Display results
result = {
    "Number of Unique Videos": num_unique_videos,
    "Number of Unique Speakers": num_unique_speakers,
    "Top 5 Viewed Videos": top_5_videos
}
# Print each entry separately: print(result) embeds the DataFrame's multi-line
# repr inside the dict repr, which misaligns the table and truncates long titles.
for label, value in result.items():
    if isinstance(value, pd.DataFrame):
        print(f"{label}:")
        print(value.to_string())  # to_string() does not shorten cell contents
    else:
        print(f"{label}: {value}")


Updated Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2155 entries, 0 to 2159
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   views     2155 non-null   float64
 1   duration  2150 non-null   float64
dtypes: float64(2)
memory usage: 50.5 KB
None
{'Number of Unique Videos': 2155, 'Number of Unique Speakers': 1912, 'Top 5 Viewed Videos':                                                   title       views
1855  This is what happens when you reply to spam email  60237459.0
1800         Inside the mind of a master procrastinator  40135933.0
2056                 The next outbreak? We're not ready  36342453.0
1136                     My philosophy for a happy life  35114993.0
1860  What makes a good life? Lessons from the longe...  34095862.0}
