In [None]:
# TEXT MINING on YouTube Comments


import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords

# Uncomment these lines if running for the first time
# import nltk
# nltk.download('stopwords')

# Read the YouTube dataset from disk
file_path = 'path_to_youtube_dataset.csv'  # Replace with your file path
youtube_data = pd.read_csv(file_path)

# Fail fast when the expected 'Comments' column is absent; otherwise keep
# only the rows that actually have a comment.
if 'Comments' not in youtube_data.columns:
    raise ValueError("The dataset does not contain a 'Comments' column.")
comments = youtube_data['Comments'].dropna()

# Text preprocessing
def preprocess_text(text):
    """Normalize a raw comment for word-frequency and sentiment analysis.

    URLs, @mentions and #hashtags are removed as whole tokens, every other
    character that is not an ASCII letter or whitespace becomes a space, and
    the result is lowercased with runs of whitespace collapsed.

    Substituting a space (instead of deleting the match) keeps neighbouring
    words apart when only punctuation or digits separated them, e.g.
    "great,thanks" becomes "great thanks" rather than "greatthanks".

    Args:
        text: The raw comment string.

    Returns:
        The cleaned, lowercase string with single spaces between words
        (empty string if nothing but removable characters was present).
    """
    # Drop URLs, mentions and hashtags as whole tokens.
    text = re.sub(r'http\S+|@\S+|#\S+', ' ', text)
    # Punctuation, digits and other non-letters act as word separators.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Lowercase, then collapse the whitespace left behind by the substitutions.
    return ' '.join(text.lower().split())

# Clean only the non-missing comments. Assigning the result back aligns on the
# index, so rows whose comment was missing hold NaN in 'Cleaned Comments'.
cleaned_series = comments.apply(preprocess_text)
youtube_data['Cleaned Comments'] = cleaned_series

# Combine all comments into one string for word frequency analysis.
# Join the cleaned Series (strings only) rather than the DataFrame column:
# the column contains NaN for missing comments, which makes str.join raise
# TypeError.
all_comments = ' '.join(cleaned_series)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in all_comments.split() if word not in stop_words]

# Most common words
word_counts = Counter(filtered_words)
common_words = word_counts.most_common(10)

# Sentiment analysis
from textblob import TextBlob

# Score only real comments; TextBlob cannot be built from NaN. Rows without a
# comment end up with NaN in 'Sentiment' through index alignment.
youtube_data['Sentiment'] = cleaned_series.apply(lambda x: TextBlob(x).sentiment.polarity)

# Outputs
# 1. Cleaned comments (missing comments excluded)
cleaned_comments = cleaned_series.tolist()

# 2. Most common words
print("Most Common Words and Their Frequencies:")
for word, freq in common_words:
    print(f"{word}: {freq}")

# 3. Sentiments
print("\nSentiments for Comments:")
for i, sentiment in enumerate(youtube_data['Sentiment']):
    # NaN here means the row had no comment; numbering still follows row order.
    if pd.isna(sentiment):
        continue
    print(f"Comment {i + 1}: Sentiment Polarity = {sentiment}")


In [5]:
# Text Mining on Video Publish Time and Day Of Week

import pandas as pd

# Load dataset
file_path = 'youtube_channel_real_performance_analytics.csv'  # Replace with your file path
youtube_data = pd.read_csv(file_path)

# Ensure necessary columns are present
if 'Video Publish Time' not in youtube_data.columns or 'Day of Week' not in youtube_data.columns:
    raise ValueError("The dataset does not contain required columns: 'Video Publish Time' or 'Day of Week'.")

# Parse 'Video Publish Time' once and derive every component from the same
# datetime Series instead of re-parsing the column for each new field.
publish_times = pd.to_datetime(youtube_data['Video Publish Time'])
youtube_data['Publish Hour'] = publish_times.dt.hour
youtube_data['Publish Month'] = publish_times.dt.month_name()
youtube_data['Publish Year'] = publish_times.dt.year

# Output extracted data
video_publish_time_output = youtube_data[['Video Publish Time', 'Publish Hour', 'Publish Month', 'Publish Year', 'Day of Week']]

print(video_publish_time_output)
# Save to a CSV file
# output_file_path = 'youtube_video_publish_time_analysis.csv'
# video_publish_time_output.to_csv(output_file_path, index=False)

# print(f"Results saved to {output_file_path}")


      Video Publish Time  Publish Hour Publish Month  Publish Year Day of Week
0    2016-06-02 00:00:00             0          June          2016    Thursday
1    2016-06-10 00:00:00             0          June          2016      Friday
2    2016-06-14 00:00:00             0          June          2016     Tuesday
3    2016-06-29 00:00:00             0          June          2016   Wednesday
4    2016-07-01 00:00:00             0          July          2016      Friday
..                   ...           ...           ...           ...         ...
359  2024-08-25 00:00:00             0        August          2024      Sunday
360  2024-09-01 00:00:00             0     September          2024      Sunday
361  2024-09-16 00:00:00             0     September          2024      Monday
362  2024-09-25 00:00:00             0     September          2024   Wednesday
363  2024-10-18 00:00:00             0       October          2024      Friday

[364 rows x 5 columns]


In [None]:
# Text Mining on Video Publish Time and Day Of Week Visualization

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# Load dataset
file_path = 'path_to_youtube_dataset.csv'  # Replace with your file path
youtube_data = pd.read_csv(file_path)

# Ensure necessary columns are present (both are used by the plots below);
# fail with a clear message rather than a KeyError further down.
if 'Video Publish Time' not in youtube_data.columns or 'Day of Week' not in youtube_data.columns:
    raise ValueError("The dataset does not contain required columns: 'Video Publish Time' or 'Day of Week'.")

# Parse 'Video Publish Time' once and derive every component from the same
# datetime Series instead of re-parsing the column for each new field.
publish_times = pd.to_datetime(youtube_data['Video Publish Time'])
youtube_data['Publish Hour'] = publish_times.dt.hour
youtube_data['Publish Month'] = publish_times.dt.month_name()
youtube_data['Publish Year'] = publish_times.dt.year

# Generate word clouds
def generate_wordcloud(text, title):
    """Build a word cloud from `text` and show it with `title` as the heading.

    Args:
        text: Whitespace-separated words to feed to WordCloud.generate.
        title: Heading drawn above the rendered image.
    """
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Draw on an explicit Axes: image only, no ticks or frame.
    _, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16)
    plt.show()

# Word Cloud for 'Day of Week'
# Drop missing values and coerce to str first: str.join raises TypeError on NaN.
text_day_of_week = ' '.join(youtube_data['Day of Week'].dropna().astype(str))
generate_wordcloud(text_day_of_week, "Word Cloud: Days of the Week")

# Word Cloud for 'Publish Month'
# month_name() yields a missing value for unparseable timestamps, so guard the
# join the same way.
text_publish_month = ' '.join(youtube_data['Publish Month'].dropna().astype(str))
generate_wordcloud(text_publish_month, "Word Cloud: Publish Months")

# Frequency Analysis: Publish Hour
# Reindex to all 24 hours so every hour gets a bar (zero when no video was
# published then). seaborn's barplot places bars at categorical positions
# 0..k-1, so without this the bar position only equals the hour when every
# hour 0-23 happens to be present in the data.
publish_hour_counts = (
    youtube_data['Publish Hour']
    .value_counts()
    .reindex(range(24), fill_value=0)
)

plt.figure(figsize=(12, 6))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; confirm against the installed version before changing the call.
sns.barplot(x=publish_hour_counts.index, y=publish_hour_counts.values, palette='coolwarm')
plt.title('Frequency of Videos Published by Hour', fontsize=16)
plt.xlabel('Publish Hour', fontsize=12)
plt.ylabel('Number of Videos', fontsize=12)
plt.show()

# Frequency Analysis: Days of the Week
# value_counts() orders the bars from most to least frequent day.
day_counts = youtube_data['Day of Week'].value_counts()

plt.figure(figsize=(12, 6))
sns.barplot(x=day_counts.index, y=day_counts.values, palette='viridis')
plt.title('Frequency of Videos Published by Day of the Week', fontsize=16)
plt.xlabel('Day of the Week', fontsize=12)
plt.ylabel('Number of Videos', fontsize=12)
plt.xticks(rotation=45)
plt.show()
