<a href="https://colab.research.google.com/github/virajsoni8899/DataScience/blob/main/Netflix_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


**Data Cleaning**

In [None]:
df = pd.read_csv('/content/netflix_india_yt_data.csv')
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
# Replace missing values so later string and numeric operations never see NaN:
# text columns fall back to an empty string, engagement counters to zero.
fill_defaults = {
    'description': '',
    'tags': '',
    'likeCount': 0,
    'commentCount': 0,
}
for column_name, default_value in fill_defaults.items():
    df[column_name] = df[column_name].fillna(default_value)

In [None]:
df.isnull().sum()

In [None]:
def _count_tags(tag_string):
    """Return how many comma-separated entries ``tag_string`` holds (0 when it is empty)."""
    if not tag_string:
        return 0
    return len(tag_string.split(','))


df['TagsCount'] = df['tags'].apply(_count_tags)

In [None]:
df.shape

In [None]:
!pip install isodate
import isodate

# Convert ISO 8601 duration strings to seconds
def convert_duration(duration_str):
    """Convert an ISO 8601 duration string to a number of seconds.

    Parameters
    ----------
    duration_str
        Value handed to ``isodate.parse_duration`` (e.g. ``'PT3M20S'``).

    Returns
    -------
    float or int
        ``total_seconds()`` of the parsed duration, or ``0`` when the value
        cannot be parsed or converted (best-effort fallback kept from the
        original behaviour).
    """
    try:
        return isodate.parse_duration(duration_str).total_seconds()
    except Exception:
        # Deliberate best-effort: any bad/missing value maps to 0 instead of
        # aborting the whole column. `Exception` (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit propagate normally.
        return 0

df['DurationInSec'] = df['duration'].apply(convert_duration)


In [None]:
# Videos of at most this many seconds are labelled 'Short'; everything else is 'Long'.
SHORT_VIDEO_MAX_SECONDS = 60


def _duration_category(seconds):
    """Label a duration in seconds as 'Short' or 'Long'."""
    if seconds <= SHORT_VIDEO_MAX_SECONDS:
        return 'Short'
    return 'Long'


df['Category'] = df['DurationInSec'].apply(_duration_category)


In [None]:
df.shape

In [None]:
df.head()

**Analysis**

In [None]:
# Does the duration of a video influence its views and comments?
duration_seconds = df['DurationInSec']
correlation_with_views = duration_seconds.corr(df['viewCount'])
correlation_with_comments = duration_seconds.corr(df['commentCount'])

# Report both coefficients (Series.corr defaults to Pearson).
print("Correlation with Views:", correlation_with_views)
print("Correlation with Comments:", correlation_with_comments)

Views vs Duration
The near-zero correlation (~0.0009) suggests that video duration has virtually no impact on how many views a video gets in your dataset. People might be clicking on videos regardless of how long they are.

Comments vs Duration
*A slightly positive correlation (~0.05) means that longer videos might get a few more comments, but the effect is extremely small — too weak to be practically meaningful (no significance test was run here).*

In [None]:
# Is there a relationship between views and comments?
views = df['viewCount']
comments = df['commentCount']
correlation = views.corr(comments)
print("Correlation:", correlation)

Weak positive correlation: As view count increases, the number of comments tends to increase slightly, but not strongly.

In [None]:
# Does the number of tags matter for getting more views?
tag_views_corr = df['TagsCount'].corr(df['viewCount'])

# One point per video: tag count on x, views on y.
tags_vs_views_ax = sns.scatterplot(data=df, x='TagsCount', y='viewCount')
tags_vs_views_ax.set_title('Tags Count vs Views')


There's no strong upward trend suggesting that more tags → more views.

In [None]:
# Calculate the correlation between TagsCount and viewCount
df['TagsCount'].corr(df['viewCount'])


No meaningful correlation between TagsCount and viewCount.

In [None]:
# Group videos into tag-count ranges and compare the average views per range.
# Helps find whether there is a sweet spot like "videos with 10-20 tags tend to perform best."
#
# pd.cut intervals are right-closed and exclude the lowest edge by default, so
# bins starting at 0 silently dropped every video with zero tags, and a top
# edge of 50 dropped anything above it. The edges below give untagged videos
# their own bucket and leave the last bucket open-ended.
tag_bin_edges = [-1, 0, 5, 10, 20, 30, 50, np.inf]
tag_bin_labels = ['0', '1-5', '6-10', '11-20', '21-30', '31-50', '51+']
df['tags_bucket'] = pd.cut(df['TagsCount'], bins=tag_bin_edges, labels=tag_bin_labels)

# observed=False keeps empty buckets on the axis and states the categorical
# grouping behaviour explicitly instead of relying on the library default.
df.groupby('tags_bucket', observed=False)['viewCount'].mean().plot(kind='bar', title='Avg Views per Tags Count Bucket')


In [None]:
# Which tags were used in the best-performing videos?
# Rank by views (highest first) and keep the ten leading rows.
summary_columns = ['title', 'TagsCount', 'tags', 'viewCount']
top_videos = df.sort_values(by='viewCount', ascending=False).iloc[:10]
top_videos[summary_columns]

Here’s what stands out:

1. Commonly Repeating Tags:
'netflix india'

'netflix'

'netflix shows'

These appear across almost all top-performing videos, regardless of TagsCount.

2. Kids/Animation Tags:
'Kids', 'kids series', 'animated shows', 'Mighty Little Bheem'

Suggests that kids/animated content performs extremely well on YouTube.

3. Festival/Culture-Specific Tags:
'Diwali', 'pongal', 'Indian', 'netflix india'

These help boost regional and seasonal engagement.

4. Entertainment-specific Tags:
'Heist', 'Netflix 2023', 'Netflix Drama', 'Bella Ciao', 'Money Heist'

These target specific shows/trends, helping with discoverability.

In [None]:
# Does the publishing day matter?
# Working hypothesis: yes, it can influence engagement significantly, because
# audience availability varies through the week.
#   🏖️ Weekends: more relaxed, more leisure time, which can lead to more views.
#   🧑‍💻 Weekdays: varies by time zone, but evenings often perform better.
#
# NOTE(review): the weekday is taken in whatever timezone the parsed timestamps
# carry. If the CSV stores UTC timestamps, these are not India-local days —
# confirm and convert before drawing conclusions.
df['publishedAt'] = pd.to_datetime(df['publishedAt'])
df['published_day'] = df['publishedAt'].dt.day_name()

# Mean views per weekday, best-performing day first.
mean_views_per_day = df.groupby('published_day')['viewCount'].mean()
views_by_day = mean_views_per_day.sort_values(ascending=False)


In [None]:
views_by_day

In [None]:
# Bar chart of mean views per weekday (bars follow the order of views_by_day).
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=views_by_day.index, y=views_by_day.values, ax=ax)
ax.set_title('Average Views by Day of the Week')
ax.set_ylabel('Average View Count')
ax.set_xlabel('Day of the Week')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# 🕒 Does the publishing time matter?
# Working hypothesis: yes. Posting during peak hours when people are online
# (e.g. evenings, lunch hours) leads to higher initial engagement, and YouTube
# often promotes content with fast early traction.
# For India-based viewers, the best times might be:
#   ⏰ Morning (8 AM–10 AM)
#   🌆 Evening (6 PM–9 PM)
#
# NOTE(review): `.dt.hour` is the hour in the timezone of `publishedAt`; if the
# timestamps are UTC, these are not India-local hours — confirm.
df['published_hour'] = df['publishedAt'].dt.hour

# Mean views per publishing hour, best-performing hour first.
mean_views_per_hour = df.groupby('published_hour')['viewCount'].mean()
views_by_hour = mean_views_per_hour.sort_values(ascending=False)


In [None]:
views_by_hour

In [None]:
# Bar chart of mean views per publishing hour.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=views_by_hour.index, y=views_by_hour.values, ax=ax)
ax.set_title('Average Views by Hour of the Day')
ax.set_ylabel('Average View Count')
ax.set_xlabel('Hour (24hr format)')
plt.show()

In [None]:
# Calendar order for weekday rows (Mon–Sun); later cells reuse this list.
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Mean views for every (weekday, hour) combination, rows in calendar order.
pivot_table = df.pivot_table(
    index='published_day',
    columns='published_hour',
    values='viewCount',
    aggfunc='mean',
).reindex(ordered_days)

fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(pivot_table, cmap='YlGnBu', linewidths=0.5, annot=False, ax=ax)
ax.set_title('Average Views by Day and Hour')
ax.set_xlabel('Hour (24hr)')
ax.set_ylabel('Day of the Week')
plt.show()


In [None]:
df.shape

In [None]:
df.head(3)

In [None]:
### ----------- AVERAGE LIKES BY DAY ------------
# Mean likes per weekday, arranged Monday through Sunday.
mean_likes_per_day = df.groupby('published_day')['likeCount'].mean()
likes_by_day = mean_likes_per_day.reindex(ordered_days)

# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
plt.figure(figsize=(10, 5))
likes_day_ax = sns.barplot(x=likes_by_day.index, y=likes_by_day.values, palette='crest')
likes_day_ax.set(
    title='📅 Average Likes by Day of the Week',
    ylabel='Average Likes',
    xlabel='Day of the Week',
)
plt.xticks(rotation=45)
plt.show()


In [None]:
### ----------- AVERAGE LIKES BY HOUR ------------
# Mean likes for each publishing hour (index is already in ascending hour order).
likes_by_hour = df.groupby('published_hour')['likeCount'].mean()

plt.figure(figsize=(10, 5))
likes_hour_ax = sns.barplot(x=likes_by_hour.index, y=likes_by_hour.values, palette='viridis')
likes_hour_ax.set(
    title='⏰ Average Likes by Hour of the Day',
    ylabel='Average Likes',
    xlabel='Hour (24hr)',
)
plt.show()

In [None]:
comments_by_day = df.groupby('published_day')['commentCount'].mean().reindex(ordered_days)

plt.figure(figsize=(10, 5))
sns.barplot(x=comments_by_day.index, y=comments_by_day.values, palette='rocket')
plt.title('📅 Average Comments by Day of the Week')
plt.ylabel('Average Comments')
plt.xlabel('Day of the Week')
plt.xticks(rotation=45)
plt.show()

### ----------- AVERAGE COMMENTS BY HOUR ------------
comments_by_hour = df.groupby('published_hour')['commentCount'].mean()

plt.figure(figsize=(10, 5))
sns.barplot(x=comments_by_hour.index, y=comments_by_hour.values, palette='magma')
plt.title('⏰ Average Comments by Hour of the Day')
plt.ylabel('Average Comments')
plt.xlabel('Hour (24hr)')
plt.show()

In [None]:
# Which video is the most popular on the channel? The following cells break
# down the reasons why it worked well.

# Row with the highest view count.
top_video = df.loc[df['viewCount'].idxmax()]

print("🎯 Most Popular Video:\n")
summary_lines = [
    f"📌 Title       : {top_video['title']}",
    f"👁️ Views       : {top_video['viewCount']}",
    f"👍 Likes       : {top_video['likeCount']}",
    f"💬 Comments    : {top_video['commentCount']}",
    f"🏷️ Tags        : {top_video['TagsCount']}",
    f"🕒 Published At: {top_video['publishedAt']}",
]
print("\n".join(summary_lines))


# **Break Down WHY It Worked Well**

In [None]:
# Title checklist for the top video:
#   - Does it include a trending topic?
#   - Is it emotional, curious, or click-worthy?
#   - Does it include keywords users may search for?
# Measured here: the number of whitespace-separated words in the title.
title_word_count = len(top_video['title'].split())
print("Title Length:", title_word_count)

In [None]:
# Are tags relevant and popular?
# Report the TagsCount column computed during cleaning instead of re-splitting
# the raw string: splitting an empty tag string on ',' yields [''] (length 1),
# which would report one tag for an untagged video and disagree with TagsCount.
print("Number of Tags:", top_video['TagsCount'])


In [None]:
# Was it short-form or long-form? (Analyze which category performs better on the channel.)
duration_seconds_top = top_video['DurationInSec']  # seconds, derived from the ISO 8601 duration
format_category = top_video['Category']  # 'Short' or 'Long'
print("Video Duration:", duration_seconds_top)
print("Format Category:", format_category)


In [None]:
# When was it posted? Compare against the peak hour/day analysis above.
published_at = top_video['publishedAt']
print("Published Day:", published_at.day_name())
print("Published Hour:", published_at.hour)


In [None]:
# Engagement rate = (likes + comments) / views for the top video.
total_interactions = top_video['likeCount'] + top_video['commentCount']
engagement_rate = total_interactions / top_video['viewCount']
print(f"📈 Engagement Rate: {engagement_rate:.2%}")

# It hit multiple sweet spots all at once:

✅ Used high-performing tags in abundance.

✅ Posted on a strong weekday (Wednesday) at a high-engagement time (1 PM).

✅ Had an optimized title length and likely included trending or search-friendly terms.

✅ Duration was long, but just short enough (~3.3 min) to stay engaging.

✅ Strong engagement with comments and likes, suggesting it resonated with viewers.

In [None]:
df.head()

In [None]:
# 6. Does title length influence views?
# Title length = number of whitespace-separated words (value coerced to str first).
df['TitleLength'] = [len(str(title).split()) for title in df['title']]
df.head(3)

In [None]:
# Scatter of title length (in words) against views, one point per video.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='TitleLength', y='viewCount', data=df, ax=ax)
ax.set_title("Title Length vs. Views")
ax.set_xlabel("Title Length (Words)")
ax.set_ylabel("Views")
ax.grid(True)
plt.show()

In [None]:
# Correlation between title length and raw view counts.
title_lengths = df['TitleLength']
correlation = title_lengths.corr(df['viewCount'])
print(f"Correlation between Title Length and Views: {correlation}")


# *Conclusion: the length of the title does not affect the view count*

In [None]:
# What does the distribution of views look like across all videos?
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['viewCount'], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Video Views")
ax.set_xlabel("Views")
ax.set_ylabel("Number of Videos")
ax.grid(True)
plt.show()

Highly Skewed Distribution (Right Skewed)
Most videos have very low view counts.

A tiny number of videos have extremely high views (long right tail).

✅ This tells us:

You might be relying heavily on a few viral videos.

There’s a consistency gap — most content isn't engaging the audience.

In [None]:
# View counts are heavily right-skewed, so also look at them on a log scale.
df['log_views'] = np.log1p(df['viewCount'])  # log1p(x) = log(1 + x), so zero views map to 0

# The original `plt.subplot(1, 2, 2)` was a leftover from a two-panel layout:
# with no first panel it drew this histogram in the right half of a default
# figure and left the rest blank. Use a dedicated, full-width figure instead.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['log_views'], bins=50, kde=True, color='orange', ax=ax)
ax.set_title('Distribution of Video Views (Log Scale)')
ax.set_xlabel('Log(Views)')
ax.set_ylabel('Number of Videos')

fig.tight_layout()
plt.show()

In [None]:
df.head(2)

In [None]:
# Same correlation as before, but against log-transformed views.
log_view_series = df['log_views']
correlation = df['TitleLength'].corr(log_view_series)
print(f"Correlation between Title Length and  log_Views: {correlation}")

**The length of the title doesn’t significantly impact how many views a video gets.**

**Viewers are not likely driven by just how long the title is — quality of content, tags, and timing likely matter more.**

In [None]:
# Are there topics or themes that consistently perform better than others?
# Split off the top and bottom quartiles of log-views for comparison.
upper_quartile = df['log_views'].quantile(0.75)
lower_quartile = df['log_views'].quantile(0.25)

high_perf = df[df['log_views'] >= upper_quartile]
low_perf = df[df['log_views'] <= lower_quartile]
high_perf


In [None]:
!pip install wordcloud
from wordcloud import WordCloud

def create_wordcloud(tag_series, title):
    """Render a word cloud built from the tag strings in ``tag_series``.

    Parameters
    ----------
    tag_series : pandas.Series of str
        One tag string per video; missing values are dropped.
    title : str
        Title displayed above the figure.

    Returns
    -------
    None
        The figure is shown as a side effect. When no tag text is left after
        cleaning, a short message is printed and nothing is drawn.
    """
    tags = (
        tag_series.dropna()
        .str.lower()
        # regex=False: '|' is a literal pipe here, never a regex alternation.
        .str.replace('|', ' ', regex=False)
        .str.cat(sep=' ')
    )

    # Untagged videos carry empty strings, so a subset can contain no words at
    # all; skip drawing rather than handing the generator empty text.
    if not tags.strip():
        print(f"No tags available for word cloud: {title}")
        return

    wordcloud = WordCloud(width=1000, height=500, background_color='white').generate(tags)
    plt.figure(figsize=(15, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.show()

# One word cloud per performance group: top-quartile ("superstar") videos
# first, then bottom-quartile ("sleepy") videos.
wordcloud_jobs = (
    (high_perf['tags'], "🌟 Top Tags in Superstar Videos"),
    (low_perf['tags'], "😴 Tags in Sleepy Videos"),
)
for group_tags, cloud_title in wordcloud_jobs:
    create_wordcloud(group_tags, cloud_title)


In [None]:
# Videos whose tag string is empty (ignoring surrounding whitespace).
has_no_tags = df['tags'].str.strip().eq('')
no_tags_df = df[has_no_tags]

# "Performed well" = at or above the 75th percentile of views across all videos.
threshold = df['viewCount'].quantile(0.75)

# Untagged videos that still reached that threshold.
reached_threshold = no_tags_df['viewCount'].ge(threshold)
no_tags_high_views = no_tags_df[reached_threshold]

# Report the count, then show the ten most-viewed of them.
print("Number of videos with no tags but high views:", len(no_tags_high_views))
(
    no_tags_high_views[['title', 'viewCount', 'publishedAt']]
    .sort_values(by='viewCount', ascending=False)
    .head(10)
)

In [None]:
df.columns

In [None]:
df.head(3)

In [None]:
df['published_date'] = df['publishedAt'].dt.date

In [None]:
df['published_date']

In [None]:
publish_frequency = df['published_date'].value_counts().sort_index()
publish_frequency

In [None]:
df.columns

In [None]:
# Number of videos published on each calendar date, in chronological order.
video_freq_per_day = df['published_date'].value_counts().sort_index()

# Same data as a two-column frame (Date, Videos_Uploaded) for easy viewing and plotting.
video_freq_df = (
    video_freq_per_day
    .rename_axis('Date')
    .reset_index(name='Videos_Uploaded')
)

In [None]:
# Line chart of uploads per day over the channel's history.
fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(video_freq_df['Date'], video_freq_df['Videos_Uploaded'], marker='o', linestyle='-')
ax.set_title("Video Publishing Frequency Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Videos Uploaded")
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Which months are the busiest?
# Bucket every publish timestamp into its calendar month (monthly Period).
df['month'] = df['publishedAt'].dt.to_period('M')
monthly_uploads = df['month'].value_counts().sort_index()

# One bar per month, in chronological order.
monthly_uploads.plot.bar(figsize=(20, 5), title="Monthly Uploads")

In [None]:
# Number of videos uploaded in each calendar year (a count, not an average).
df['year'] = df['publishedAt'].dt.year
uploads_per_year = df['year'].value_counts().sort_index()
uploads_per_year

In [None]:
# Total views earned by the videos published on each date.
upload_views = df.groupby('published_date')['viewCount'].sum()

# Line up uploads-per-day with views-per-day on the shared date index and
# correlate the two columns.
# NOTE(review): a daily *sum* of views naturally grows with the number of
# uploads, so some positive correlation is expected by construction.
combined = video_freq_per_day.to_frame().join(upload_views)
combined.columns = ['Videos_Uploaded', 'Total_Views']
correlation = combined.corr().loc['Videos_Uploaded', 'Total_Views']
print(f"Correlation between uploads and views per day: {correlation:.2f}")

 Uploading more helps a little, but it’s not the only thing that drives views.

📹 A consistent upload schedule might support channel growth, but quality + content type matter a lot more.

In [None]:
from collections import Counter

# Flatten every video's comma-separated tag string into one list of
# normalised (stripped, lower-cased) tags.
# Untagged videos hold '' (filled in during cleaning) and ''.split(',') yields
# [''], so blank entries are skipped; otherwise the empty string is counted as
# a "tag" once per untagged video and can show up among the top tags.
tags_series = df['tags'].dropna()
all_tags = [
    tag.strip().lower()
    for tags_list in tags_series
    for tag in tags_list.split(',')
    if tag.strip()
]

# Count frequency of each tag
tag_counts = Counter(all_tags)

# Convert to DataFrame for visualization
top_tags_df = pd.DataFrame(tag_counts.most_common(20), columns=['Tag', 'Count'])

# Plot the top 20 tags, most frequent at the top
plt.figure(figsize=(12, 6))
plt.barh(top_tags_df['Tag'], top_tags_df['Count'], color='skyblue')
plt.xlabel('Count')
plt.title('Top 20 Most Used Tags')
plt.gca().invert_yaxis()
plt.show()

In [None]:
import re

# Lower-case each title and strip everything except a-z, 0-9 and whitespace.
non_alphanumeric = re.compile(r'[^a-z0-9\s]')
titles_cleaned = df['title'].dropna().str.lower().apply(lambda text: non_alphanumeric.sub('', text))

# Every word of every cleaned title, in order.
all_words = []
for cleaned_title in titles_cleaned:
    all_words.extend(cleaned_title.split())

# Drop a small set of very common words (stopwords).
stopwords = {'the', 'a', 'an', 'is', 'of', 'in', 'to', 'for', 'and', 'on', 'with', 'how', 'what', 'why'}
filtered_words = [candidate for candidate in all_words if candidate not in stopwords]

# Frequency table of the remaining words; keep the 20 most common.
word_counts = Counter(filtered_words)
top_words_df = pd.DataFrame(word_counts.most_common(20), columns=['Word', 'Count'])

# Horizontal bar chart, most frequent word at the top.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_words_df['Word'], top_words_df['Count'], color='orange')
ax.set_xlabel('Count')
ax.set_title('Top 20 Most Common Words in Video Titles')
ax.invert_yaxis()
plt.show()

In [None]:
print("🔎 Sample of 'viewCount' values:")
print(df['viewCount'].head(10))


In [None]:
# Check type of viewCount column
print("\n🔍 Data type of 'viewCount':", df['viewCount'].dtype)

In [None]:
# Check for commas (string format)
print("\n🔍 Are there any commas in 'viewCount'?")
print(df['viewCount'].astype(str).str.contains(',').any())

In [None]:
# Check how many unique video IDs you have in total
print("\n📊 Total unique video IDs in dataset:")
print(df['video_id'].nunique())

In [None]:
df_millions = df[df['viewCount']>1_000_000]
df_millions

In [None]:
df.columns

In [None]:
dupes = df["video_id"].duplicated().sum()
print(f"Duplicate video IDs: {dupes}")


In [None]:
df_unique = df.drop_duplicates(subset="video_id", keep="first")

In [None]:
# Highest recorded viewCount per video_id.
# Group the full frame `df` rather than `df_unique`: after
# drop_duplicates(keep="first") every id appears exactly once, so max() over
# `df_unique` is a no-op, and any duplicate row carrying a higher view count
# has already been discarded.
df_grouped = df.groupby("video_id")["viewCount"].max().reset_index()

# Count the distinct videos whose best view count exceeds one million.
final_count = df_grouped[df_grouped["viewCount"] > 1_000_000].shape[0]
print("💯 Rechecked final count:", final_count)
