In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
df = pd.read_csv('final_merged.csv')
df['description_video'] = df['description_video'].fillna('')

# Extract video titles, descriptions, and tags
titles = df['title_of_video']
descriptions = df['description_video']
tags = df['tags']

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the titles, descriptions, and tags
tfidf_vectorizer.fit(titles)
tfidf_vectorizer.fit(descriptions)
tfidf_vectorizer.fit(tags)

# Transform the titles, descriptions, and tags into TF-IDF matrices
title_matrix = tfidf_vectorizer.transform(titles)
description_matrix = tfidf_vectorizer.transform(descriptions)
tag_matrix = tfidf_vectorizer.transform(tags)

# Calculate the average TF-IDF score per word in the title, description, and tags
title_scores = title_matrix.mean(axis=0)
description_scores = description_matrix.mean(axis=0)
tag_scores = tag_matrix.mean(axis=0)

# Calculate the top 10 most relevant words in the title, description, and tags
title_top_words = tfidf_vectorizer.get_feature_names()[title_scores.argsort()[0][-10:]]
description_top_words = tfidf_vectorizer.get_feature_names()[description_scores.argsort()[0][-10:]]
tag_top_words = tfidf_vectorizer.get_feature_names()[tag_scores.argsort()[0][-10:]]

# Print the top 10 most relevant words in the title, description, and tags
print('Top 10 most relevant words in video titles:', title_top_words)
print('Top 10 most relevant words in video descriptions:', description_top_words)
print('Top 10 most relevant words in video tags:', tag_top_words)

# Calculate the average length of the video titles and descriptions
title_lengths = titles.str.len().mean()
description_lengths = descriptions.str.len().mean()

# Print the average length of the video titles and descriptions
print('Average length of video titles:', title_lengths)
print('Average length of video descriptions:', description_lengths)

# Calculate the


TypeError: only integer scalar arrays can be converted to a scalar index

In [6]:
import pandas as pd

# Load data
data = pd.read_csv('final_merged.csv')

# Convert duration to seconds
data['duration_video'] = pd.to_timedelta(data['duration_video']).dt.total_seconds()

# Calculate watch time
data['watch_time'] = data['view_count_video'] * data['duration_video']

# Calculate engagement
data['engagement'] = data['likes_video']  + data['Total_Comments']

# Print top 10 videos by watch time
top_10_videos = data[['title_of_video', 'watch_time']].sort_values('watch_time', ascending=False).head(10)
print(top_10_videos)

# Print top 10 videos by engagement
top_10_engagement = data[['title_of_video', 'engagement']].sort_values('engagement', ascending=False).head(10)
print(top_10_engagement)


                                        title_of_video  watch_time
203                             FIRST LOVE @YogiBabaOG    7.294291
204  PEN FIGHT Ft. PUFF TALKS @PuffTalks  | ANIMATI...    6.550916
199              JAVELIN THROW! Ft. SHAADI WAALE UNCLE    6.302207
198                        KENDRIYA VIDYALAYA IS SCARY    5.470834
205      Big Dreams ft. Parents @KirtiChow  @RIYAGOGOI    4.820842
197   ANGER ISSUES | ANIMATION STORY |  RG BUCKET LIST    4.049873
196  PEER PRESSURE FT. CRICKET | ANIMATION STORY | ...    3.071843
193  FIRST LOVE (Part-2) | Official trailer |@Kirti...    1.708102
202  PUBLIC TRANSPORTATION | ANIMATION VIDEO | ASSA...    1.573231
189  My first India’s Premium Luxury Train Journey ...    1.465255
                                        title_of_video  engagement
199              JAVELIN THROW! Ft. SHAADI WAALE UNCLE     1113392
203                             FIRST LOVE @YogiBabaOG      969935
204  PEN FIGHT Ft. PUFF TALKS @PuffTalks  | ANIMATI...      69

In [8]:

import numpy as np

# Fill missing values with empty string
data = data.fillna('')

# Create a new column for SEO keywords
data['seo_keywords'] = data['title_of_video'] + ' ' + data['description_video']

# Create a new column for collaboration count
data['collaboration_count'] = data['tags'].apply(lambda x: len(x.split('|')))

# Calculate average views for each SEO keyword
grouped = data.groupby('seo_keywords').agg({'view_count_video': [np.mean, np.size]})
grouped.columns = ['average_views', 'keyword_count']
grouped = grouped.sort_values('average_views', ascending=False).reset_index()

# Print top 10 SEO keywords
top_10_seo_keywords = grouped[['seo_keywords', 'average_views']].head(10)
print(top_10_seo_keywords)

# Calculate average views for each collaboration count
grouped = data.groupby('collaboration_count').agg({'view_count_video': [np.mean, np.size]})
grouped.columns = ['average_views', 'collaboration_count']
grouped = grouped.sort_values('collaboration_count').reset_index()

# Print collaboration analysis
collaboration_analysis = grouped[['collaboration_count', 'average_views']]
print(collaboration_analysis)


                                        seo_keywords  average_views
0  JAVELIN THROW! Ft. SHAADI WAALE UNCLE Download...     16541225.0
1  PEN FIGHT Ft. PUFF TALKS @PuffTalks  | ANIMATI...     13314870.0
2  FIRST LOVE @YogiBabaOG Download Winzo:\nhttps:...     13095675.0
3  Big Dreams ft. Parents @KirtiChow  @RIYAGOGOI ...      9919427.0
4  KENDRIYA VIDYALAYA IS SCARY Register with Coin...      8809716.0
5  ANGER ISSUES | ANIMATION STORY |  RG BUCKET LI...      8214752.0
6  FIRST LOVE (Part-2) | Official trailer |@Kirti...      7491674.0
7  PEER PRESSURE FT. CRICKET | ANIMATION STORY | ...      6965630.0
8  PUBLIC TRANSPORTATION | ANIMATION VIDEO | ASSA...      3633327.0
9  Who PAYS for the FIRST DATE? When is the last ...      3518414.0


ValueError: 'collaboration_count' is both an index level and a column label, which is ambiguous.