# imports

In [1]:
import datetime
import json
import os
import urllib
import urllib.parse
import urllib.request

import altair as alt
import isodate
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from dateutil.relativedelta import relativedelta
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the NLTK resources up front; `stopwords` backs the English stop-word list used
# when the video titles are cleaned.
nltk.download('stopwords')
# NOTE(review): nothing visible in this notebook uses `punkt` — confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ghant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Keys

In [2]:
# Read the Google API key from the environment so it never has to be pasted into (and
# committed with) the notebook. Set YOUTUBE_API_KEY before starting Jupyter; the empty
# string fallback keeps the previous default, and the API cells below need a real key to work.
key = os.environ.get("YOUTUBE_API_KEY", "")

channel_id = "UC0RhatS1pyxInC00YKjjBqQ"  # id for geeksforgeeks channel

# Fetching the ID of the channel's uploads playlist

In [3]:
# Ask the channels endpoint for the id of the channel's "uploads" playlist.
# urlencode builds the query string so the channel id and key are always escaped correctly.
uploads_url = "https://www.googleapis.com/youtube/v3/channels?" + urllib.parse.urlencode({
    "id": channel_id,
    "key": key,
    "part": "contentDetails",
})

# The context manager closes the HTTP connection once the body has been decoded.
with urllib.request.urlopen(uploads_url) as request:
    response = json.load(request)

# Fail with a readable message instead of an IndexError/KeyError when nothing comes back.
channel_items = response.get('items', [])
if not channel_items:
    raise ValueError("no channel found for id " + repr(channel_id))

uploads_id = channel_items[0]['contentDetails']['relatedPlaylists']['uploads']

# fetching video ids for all the videos within 6 months duration using pagination

In [4]:
# Timestamp layout of the `publishedAt` field on each playlist item.
PUBLISHED_AT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# NOTE(review): the first request starts from this hard-coded page token rather than from the
# first page of the playlist — confirm the offset is intentional.
FIRST_PAGE_TOKEN = "EAAaBlBUOkNESQ"


def _fetch_playlist_page(playlist_id, api_key, page_token):
    """Return one decoded page (at most 25 items) of the playlistItems listing."""
    query = urllib.parse.urlencode({
        "playlistId": playlist_id,
        "key": api_key,
        "part": "snippet",
        "pageToken": page_token,
        "maxResults": 25,
    })
    page_url = "https://www.googleapis.com/youtube/v3/playlistItems?" + query
    # Close the connection as soon as the JSON body has been read.
    with urllib.request.urlopen(page_url) as page_request:
        return json.load(page_request)


def _published_at(item):
    """Parse the `publishedAt` timestamp of a playlist item into a datetime."""
    return datetime.datetime.strptime(item['snippet']['publishedAt'], PUBLISHED_AT_FORMAT)


response = _fetch_playlist_page(uploads_id, key, FIRST_PAGE_TOKEN)

items = list(response.get('items', []))
if not items:
    raise ValueError("the uploads playlist returned no videos")

# The six-month window is measured back from the first video on the first page.
end_date = _published_at(items[0])
start_date = end_date - relativedelta(months=6)

# A single-page playlist has no nextPageToken; treat that as "nothing more to fetch".
pageToken = response.get('nextPageToken', "")

while pageToken != "":
    response = _fetch_playlist_page(uploads_id, key, pageToken)
    pageToken = response.get('nextPageToken', "")

    page_items = response.get('items', [])
    items.extend(page_items)

    # Stop paging once the last video of a page is at or before the start of the window.
    if page_items and _published_at(page_items[-1]) <= start_date:
        break

# creating a simple dictionary list, to store url, title, date and id of the videos

## Task 1 solution

In [5]:
videos_data = list()

for item in items:
    snippet = item['snippet']
    # Parse the timestamp once and reuse it for both the cut-off test and the stored value.
    published = datetime.datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
    # Stop at the first video that is older than the start of the window.
    if published < start_date:
        break

    video_id = snippet['resourceId']['videoId']
    videos_data.append({
        'video_id': video_id,
        'video_title': snippet['title'],
        # The key spelling is kept as-is because the chart cell looks the column up by this name.
        'pubish_date': published,
        'url': "https://www.youtube.com/watch?v=" + video_id,
    })

# Guard the summary so an empty window reports itself instead of raising IndexError.
if videos_data:
    print("videos current date:",videos_data[0]['pubish_date'])
    print("videos starting date (6 months before):",videos_data[-1]['pubish_date'])
else:
    print("no videos found in this duration")
print("total videos in this duration: ", len(videos_data))

videos current date: 2023-07-11 02:48:41
videos starting date (6 months before): 2023-01-11 12:30:19
total videos in this duration:  161


# fetching statistics data for each video id

In [6]:
videos_data_main = list()

# The videos endpoint accepts a comma-separated list of ids, so the videos are looked up in
# batches instead of with one HTTP request each.
# NOTE(review): 50 is taken to be the maximum number of ids per call — confirm against the API reference.
ids_per_request = 50

for batch_start in range(0, len(videos_data), ids_per_request):
    batch = videos_data[batch_start:batch_start + ids_per_request]

    # Only the two parts that are read below are requested.
    video_stats_url = "https://youtube.googleapis.com/youtube/v3/videos?" + urllib.parse.urlencode({
        "part": "contentDetails,statistics",
        "id": ",".join(video['video_id'] for video in batch),
        "key": key,
    })

    # Close the connection as soon as the JSON body has been read.
    with urllib.request.urlopen(video_stats_url) as request:
        response = json.load(request)

    # Index the answer by video id: the response may omit a requested video, and matching by
    # id does not rely on the order of the returned entries.
    stats_by_id = {entry['id']: entry for entry in response.get('items', [])}

    for video in batch:
        entry = stats_by_id.get(video['video_id'])
        if entry is None:
            print("skipping video without statistics:", video['video_id'])
            continue

        # The dictionaries from `videos_data` are updated in place, as before.
        video['views'] = entry['statistics']['viewCount']
        video['duration'] = entry['contentDetails']['duration']

        videos_data_main.append(video)

# custom dataframe with all the information (duration converted to seconds)

## Task 2 solution

In [7]:
gfofg_data = pd.DataFrame(videos_data_main)

# The id is no longer needed once the statistics are attached; the url column still identifies the video.
gfofg_data = gfofg_data.drop(columns='video_id')

# total_seconds() covers the whole duration. `.seconds` is only the part left over after whole
# days are removed, so a video of 24 hours or more would be reported far too short.
gfofg_data['duration'] = gfofg_data['duration'].apply(
    lambda x: int(isodate.parse_duration(x).total_seconds())
)
# The API returns the view count as a string; store it as a 64-bit integer.
gfofg_data['views'] = gfofg_data['views'].astype('int64')

gfofg_data.head()

Unnamed: 0,video_title,pubish_date,url,views,duration
0,Day 10 | Machine Learning 101 | Geek-O-Lympics...,2023-07-11 02:48:41,https://www.youtube.com/watch?v=34Ady4FD79k,1785,4131
1,60 Seconds TRICK to Optimize your PYTHON Code ...,2023-07-10 15:19:00,https://www.youtube.com/watch?v=cd0iFrCDm7M,921,45
2,Day 9 | The Art of Data Storytelling | How Num...,2023-07-10 01:39:22,https://www.youtube.com/watch?v=baNfsN0SWWk,715,3569
3,Roadmap for Test Automation Engineering | Nite...,2023-07-09 18:46:11,https://www.youtube.com/watch?v=DW8r1fACCLY,1030,3462
4,Day 8 | Power of Python in Data Science | Ashi...,2023-07-09 03:31:19,https://www.youtube.com/watch?v=YPTkw7rr408,1453,3326


# Bag-of-words counts for all the titles (unigrams only), custom stop-word removal, and LDA for topic discovery

# Only 10 topics are used because the dataset is small

In [8]:
vectorizer = CountVectorizer()

stop_words = list(stopwords.words('english'))
stop_words_custom = ['geeksforgeeks','geek','geeks','#shorts', 'shorts','comments','link','day']
stop_words = stop_words + stop_words_custom
# A set gives constant-time membership tests in the per-word filter below.
stop_word_set = set(stop_words)

# Drop stop words (compared in lower case) from each whitespace-separated title.
gfofg_data['title'] = gfofg_data['video_title'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_word_set)
)

X = vectorizer.fit_transform(gfofg_data['title'])

num_topics = 10
num_top_words = 5  # how many of the highest-weighted words describe a topic
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
topics = lda.fit_transform(X)

feature_names = vectorizer.get_feature_names_out()

# Describe every topic once (ids are 1-based) by its highest-weighted words. Building the map
# for all topics, not only those that win a title, means a lookup by topic id cannot miss.
topics_key_map = {
    topic_idx + 1: [feature_names[j] for j in weights.argsort()[::-1][:num_top_words]]
    for topic_idx, weights in enumerate(lda.components_)
}

# Each title is assigned the topic with the highest value in its row of `topics`.
topics_list = (topics.argmax(axis=1) + 1).tolist()

gfofg_data['cluster'] = topics_list

gfofg_data.head(5)

Unnamed: 0,video_title,pubish_date,url,views,duration,title,cluster
0,Day 10 | Machine Learning 101 | Geek-O-Lympics...,2023-07-11 02:48:41,https://www.youtube.com/watch?v=34Ady4FD79k,1785,4131,10 | Machine Learning 101 | Geek-O-Lympics 2023,5
1,60 Seconds TRICK to Optimize your PYTHON Code ...,2023-07-10 15:19:00,https://www.youtube.com/watch?v=cd0iFrCDm7M,921,45,60 Seconds TRICK Optimize PYTHON Code |,7
2,Day 9 | The Art of Data Storytelling | How Num...,2023-07-10 01:39:22,https://www.youtube.com/watch?v=baNfsN0SWWk,715,3569,9 | Art Data Storytelling | Numbers speak Loud...,9
3,Roadmap for Test Automation Engineering | Nite...,2023-07-09 18:46:11,https://www.youtube.com/watch?v=DW8r1fACCLY,1030,3462,Roadmap Test Automation Engineering | Nitesh Jain,4
4,Day 8 | Power of Python in Data Science | Ashi...,2023-07-09 03:31:19,https://www.youtube.com/watch?v=YPTkw7rr408,1453,3326,8 | Power Python Data Science | Ashish Jangra ...,9


# topics associated with highest views (all views are combined for each topic and calculated)

## Task 3 solution

In [9]:
# Total views per topic id, for every id from 1 to num_topics (0 when a topic has no videos).
cluster_views = {
    cid: gfofg_data.loc[gfofg_data['cluster'] == cid, 'views'].sum()
    for cid in range(1, num_topics + 1)
}

# Keep the topic with the largest total; on a tie the later topic id wins.
highest_topic, highest_id = 0, 0
for cid, total in cluster_views.items():
    if total >= highest_topic:
        highest_topic, highest_id = total, cid

top_viewed_topics = topics_key_map[highest_id]

print("topic viewed topics of last 6 months duration are", top_viewed_topics)

topic viewed topics of last 6 months duration are ['master', 'ai', 'coding', 'journey', 'codecamp']


# Topics associated with the lengthiest video

## Task 4 Solution

In [10]:
# Position of the row with the greatest duration, then the topic id stored on that row.
longest_position = gfofg_data['duration'].argmax()
largest_video_topics_id = gfofg_data['cluster'].iloc[longest_position]

# Look up the words that describe that topic.
largest_video_topics = topics_key_map[largest_video_topics_id]

print("Topics with largest duration video", largest_video_topics)

Topics with largest duration video ['tech', 'gate', 'solving', 'india', 'hackathon']


# Interactive chart for views vs duration (can zoom in, hover and drag the chart to see details clearly)

## Task 5 Solution

In [11]:
# Columns shown when hovering over a point.
hover_columns = ['video_title', 'pubish_date', 'views', 'duration']

# Scatter plot of views (y) against duration (x), one circle per video.
views_vs_duration = alt.Chart(gfofg_data).mark_circle(size=60)
views_vs_duration = views_vs_duration.encode(x='duration', y='views', tooltip=hover_columns)
views_vs_duration = views_vs_duration.properties(width=800, height=300)

# The last expression is what the notebook renders, here as an interactive chart.
views_vs_duration.interactive()