# YouTube Data Fetch Using the API

In [None]:
import numpy as np              
import pandas as pd        
import IPython.display             #for displaying objects in different formats
import googleapiclient.discovery   #allows interaction with Google APIs


from dateutil import parser        #helps in parsing dates from strings in various formats and converting them into datetime objects in Python
import isodate

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

from googleapiclient.discovery import build        #connection to a particular Google API service
from textblob import TextBlob            # It provides a simple API for common natural language processing (NLP) tasks such as Sentiment analysis


In [None]:
# Developer key used to authenticate the API client built further down.
# 'API ID' is only a placeholder: replace it with your own key before running,
# and avoid committing a real key to version control.
Api_key = 'API ID'

In [None]:
# IDs of the channels to analyse. get_channel_stats joins this list with commas
# and sends it as the `id` filter of channels().list, so each entry has to be a
# channel ID. NOTE(review): the 'Video Id' placeholder text looks misleading --
# confirm and replace it with real channel IDs.
channel_ids = ['Video Id' ]

In [None]:
# Name and version of the Google API the client should talk to.
api_service_name, api_version = "youtube", "v3"

# Create the API client, authenticating with the developer key set above.
youtube = googleapiclient.discovery.build(
    api_service_name,
    api_version,
    developerKey=Api_key,
)



In [None]:
#get the channel stats using the channel id 
def get_channel_stats(youtube,channel_ids):
    """
    Get channel stats

    Params:
    ------
    youtube: build object of Youtube API
    channel_ids: list of channel IDs

    Returns:
    ------
    dataframe with one row per channel returned by the API and the columns
    channelName, subscribers, views, totalVideos and playlistId (the ID of the
    channel's "uploads" playlist). Statistics the API leaves out of a channel
    are stored as None. The columns are present even when no channel is found.

    """
    columns = ['channelName', 'subscribers', 'views', 'totalVideos', 'playlistId']
    channel_ids = list(channel_ids)
    all_data = []

    # Send the IDs in batches of 50 so a long list is not squeezed into a single
    # request (NOTE(review): 50 is assumed to be the per-request ID limit --
    # confirm against the API reference). An empty list sends no request at all.
    for start in range(0, len(channel_ids), 50):
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=','.join(channel_ids[start:start + 50])
        )
        response = request.execute()

        # A response without any match may carry no 'items' key at all.
        for item in response.get('items', []):
            statistics = item.get('statistics', {})
            all_data.append({
                'channelName': item['snippet']['title'],
                # .get(): a statistic can be absent (e.g. a hidden subscriber count).
                'subscribers': statistics.get('subscriberCount'),
                'views': statistics.get('viewCount'),
                'totalVideos': statistics.get('videoCount'),
                'playlistId': item['contentDetails']['relatedPlaylists']['uploads'],
            })

    return pd.DataFrame(all_data, columns=columns)

In [None]:
# Fetch the stats table for the channels listed in channel_ids.
channel_stats = get_channel_stats(youtube,channel_ids)

In [None]:
channel_stats

In [None]:
# Extract the video IDs by playlist ID.
# Only the playlist of the first row in channel_stats is selected here.
playlist_id = channel_stats["playlistId"][0]
def get_video_ids(youtube, playlist_id):
    """
    Get the IDs of all videos in a playlist

    Params:
    ------
    youtube: build object of Youtube API
    playlist_id: ID of the playlist to read

    Returns:
    ------
    list of video IDs in the order the API returns them, collected across
    every result page

    """
    video_ids = []
    next_page_token = None

    # One loop handles the first page and every follow-up page, so the request
    # and the item handling exist only once.
    while True:
        # Only contentDetails is read below, so no other part is requested.
        params = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        # The first request carries no page token.
        if next_page_token is not None:
            params['pageToken'] = next_page_token

        response = youtube.playlistItems().list(**params).execute()

        video_ids.extend(
            item['contentDetails']['videoId']
            for item in response.get('items', [])
        )

        # A missing nextPageToken marks the last page.
        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            break

    return video_ids
        

   

In [None]:
# Collect the IDs of every video in the selected playlist.
video_ids = get_video_ids(youtube,playlist_id)

In [None]:
len(video_ids)

In [None]:
#Extract the video detail by video_id
def get_video_details(youtube, video_ids):
    """
    Get details for a list of videos

    Params:
    ------
    youtube: build object of Youtube API
    video_ids: list of video IDs

    Returns:
    ------
    dataframe with one row per video returned by the API. Columns: video_id,
    channelTitle, title, description, tags, publishedAt, viewCount, likeCount,
    favouriteCount, commentCount, duration, definition, caption. A field that
    is missing from a video's response is stored as None. The columns are
    present even when no video is returned.

    """
    # Response section -> output columns taken from that section.
    stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                     'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                     'contentDetails': ['duration', 'definition', 'caption'],
                    }
    # The API spells this statistic 'favoriteCount'. The output column keeps its
    # existing name so code that refers to 'favouriteCount' still works.
    api_field_names = {'favouriteCount': 'favoriteCount'}

    columns = ['video_id']
    for names in stats_to_keep.values():
        columns.extend(names)

    all_video_info = []

    # The IDs are sent in batches of 50 per request.
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()

        for video in response.get('items', []):
            video_info = {'video_id': video['id']}

            for section, names in stats_to_keep.items():
                # A whole section or a single field may be absent; both become None.
                section_data = video.get(section) or {}
                for name in names:
                    video_info[name] = section_data.get(api_field_names.get(name, name))

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info, columns=columns)

In [None]:
# Get video details for every collected video ID and display the resulting table
video_df = get_video_details(youtube, video_ids)
video_df

In [None]:
# Export the video details to CSV. index=False is not passed, so the dataframe
# index is written as an extra, unnamed first column.
video_df.to_csv('Video_Details(Vlad and Niki).csv')

In [None]:
#getting the comments
def get_comments_in_videos(youtube, video_ids, max_comments=10):
    """
    Get top level comments as text from all videos with given IDs (by default
    only the first 10 comments per video, to stay within the quota limit of
    the Youtube API)

    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    max_comments: non-negative maximum number of comments kept per video.
        Only one result page is requested per video, so at most 100 comments
        can be returned.

    Returns:
    Dataframe with the columns video_id and comments, one row per video whose
    comments could be fetched; comments holds the list of comment texts.
    Videos whose request fails are reported on stdout and skipped.

    """
    all_comments = []
    # The request needs a page size of at least 1 and accepts at most 100.
    page_size = max(1, min(max_comments, 100))

    for video_id in video_ids:
        try:
            # Only the snippet part is read below, so replies are not requested.
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=page_size
            )
            response = request.execute()

            comments_in_video = [
                comment['snippet']['topLevelComment']['snippet']['textOriginal']
                for comment in response.get('items', [])[:max_comments]
            ]
            all_comments.append({'video_id': video_id, 'comments': comments_in_video})

        except Exception as error:
            # Best effort: an error here is most likely caused by comments being
            # disabled on the video. Catching Exception rather than everything
            # lets Ctrl+C interrupt a long run; the reason is printed so that
            # other failures stay visible.
            print('Could not get comments for video ' + video_id + ' (' + str(error) + ')')

    return pd.DataFrame(all_comments, columns=['video_id', 'comments'])

In [None]:
# Fetch the comments for every collected video (one API request per video).
comment_df = get_comments_in_videos(youtube, video_ids)

In [None]:
len(comment_df)

In [None]:
# Export the comments table to CSV. index=False is not passed, so the
# dataframe index is written as an extra, unnamed first column.
comment_df.to_csv('comment(MrBeast).csv')

# Data Cleaning

In [None]:
# Load an exported video-details CSV for cleaning.
# NOTE(review): this file name differs from the one written by the export cell
# ('Video_Details(Vlad and Niki).csv') -- presumably the notebook was re-run
# once per channel; confirm the intended input file.
df = pd.read_csv('Video_Details(PewDiePie).csv',lineterminator='\n')

In [None]:
# Remove the favouriteCount column from the table in place.
df.drop(columns="favouriteCount", inplace=True)

In [None]:
df.isnull().any()

In [None]:
# Fill missing values column by column. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on df[col]: the in-place
# form acts on an intermediate object, is deprecated in recent pandas, and
# leaves df unchanged when Copy-on-Write is enabled.
df['description'] = df['description'].fillna("No description")
df['tags'] = df['tags'].fillna("No tags")

df['commentCount'] = df['commentCount'].fillna(0)
df['likeCount'] = df['likeCount'].fillna(df['likeCount'].median())

In [None]:
df.isnull().any()

In [None]:
# Convert the count columns to numbers; values that cannot be parsed become NaN.
# The conversion runs once per column (the default axis). Running it per row
# with axis=1 calls pd.to_numeric once for every row and forces the three
# values of a row into one common dtype.
cols = ['viewCount', 'likeCount', 'commentCount']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Parse the publish timestamps, then store the weekday name of each upload.
# The output column keeps its existing spelling ('pushblishDayName').
df['publishedAt'] = df['publishedAt'].map(parser.parse)
df['pushblishDayName'] = df['publishedAt'].map(lambda stamp: stamp.strftime("%A"))

In [None]:
# convert duration to seconds
# total_seconds() gives a plain number of seconds on every pandas version.
# Casting with astype('timedelta64[s]') only produced numeric seconds on
# pandas < 2.0; newer versions keep a timedelta dtype instead.
df['durationSecs'] = df['duration'].apply(lambda x: isodate.parse_duration(x).total_seconds())

In [None]:
import ast


def _count_tags(tags):
    """Return the number of tags held in one cell of the 'tags' column."""
    # A real list (data that never went through a CSV file) is counted directly.
    if isinstance(tags, (list, tuple)):
        return len(tags)
    # After a CSV round trip the list is stored as its text form, e.g.
    # "['a', 'b']"; len() on that text would count characters, not tags.
    if isinstance(tags, str):
        try:
            parsed = ast.literal_eval(tags)
        except (ValueError, SyntaxError):
            # Placeholder text such as "No tags" is not a list literal.
            return 0
        return len(parsed) if isinstance(parsed, (list, tuple)) else 0
    # None / NaN: the video has no tags.
    return 0


# Add number of tags
df['tagsCount'] = df['tags'].apply(_count_tags)

In [None]:
# Likes and comments per 1000 views
df['likeRatio'] = df['likeCount'].div(df['viewCount']).mul(1000)
df['commentRatio'] = df['commentCount'].div(df['viewCount']).mul(1000)

In [None]:
# Number of characters in each title
df['titleLength'] = df['title'].map(len)

In [None]:
# Save the cleaned table to CSV. index=False is not passed, so the dataframe
# index is written as an extra, unnamed first column.
df.to_csv('Video_Details_clean_file(PewDiePie).csv')

# Data Merge

In [None]:
import pandas as pd

# List of file paths for the CSV files
# NOTE(review): these are absolute paths on one specific machine -- adjust them
# before running elsewhere.
file_paths = [
  "E:\\Great learning All Data Analytise Material\\final live project\\Comedy_youtube_data.csv",
  "E:\\Great learning All Data Analytise Material\\final live project\\Films_youtube_data.csv",
  "E:\\Great learning All Data Analytise Material\\final live project\\Finance_youtube_data.csv",
  "E:\\Great learning All Data Analytise Material\\final live project\\Shopping_youtube_data.csv",
  "E:\\Great learning All Data Analytise Material\\final live project\\Gaming_youtube_data.csv"
]

# Read each CSV file and store them in a list of dataframes.
# A comprehension is used so that no loop variable named `df` is left behind:
# a plain loop assigning to `df` would replace the `df` table built in the
# Data Cleaning cells of this notebook.
dfs = [pd.read_csv(file_path) for file_path in file_paths]

# Concatenate all dataframes into a single dataframe
concatenated_df = pd.concat(dfs, ignore_index=True)

# Write the concatenated dataframe to a new CSV file
output_file_path = 'merged_file2.csv'
concatenated_df.to_csv(output_file_path, index=False)

print("Merged CSV file has been created successfully at:", output_file_path)


# Sentiment Analysis

In [None]:


import os

# Set up YouTube API key and build service.
# SECURITY: do not hard-code the key in this file. A key that has been committed
# to source control must be treated as leaked and revoked in the Google Cloud
# console; supply a fresh key through the YOUTUBE_API_KEY environment variable.
API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not API_KEY:
    raise RuntimeError(
        'Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key.'
    )
youtube = build('youtube', 'v3', developerKey=API_KEY)

def get_video_comments(video_id):
    """
    Get the text of every top level comment of one video

    Params:
    ------
    video_id: ID of the video whose comment threads are read

    Returns:
    ------
    list with the textDisplay value of each top level comment, collected
    page by page (100 threads are requested per page) through the
    module-level `youtube` client

    """
    collected = []

    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        textFormat="plainText",
        maxResults=100
    )

    # Keep following result pages for as long as a follow-up request exists.
    while request:
        response = request.execute()
        collected.extend(
            thread["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            for thread in response["items"]
        )
        request = youtube.commentThreads().list_next(request, response)

    return collected

def analyze_sentiment(comments):
    """
    Classify comments by sentiment polarity and report the share of each class

    Params:
    ------
    comments: list of comment texts

    Returns:
    ------
    dict with the keys 'positive', 'negative' and 'neutral'; each value is the
    percentage (0-100) of comments whose TextBlob polarity is above, below or
    exactly zero. All three values are 0.0 when `comments` is empty.

    """
    total_comments = len(comments)

    # Without comments there is nothing to divide by; report all-zero shares
    # instead of failing with ZeroDivisionError.
    if total_comments == 0:
        return {'positive': 0.0, 'negative': 0.0, 'neutral': 0.0}

    counts = {'positive': 0, 'negative': 0, 'neutral': 0}

    for comment in comments:
        polarity = TextBlob(comment).sentiment.polarity
        if polarity > 0:
            counts['positive'] += 1
        elif polarity < 0:
            counts['negative'] += 1
        else:
            counts['neutral'] += 1

    # Calculate percentages
    sentiment_percentages = {
        label: (count / total_comments) * 100
        for label, count in counts.items()
    }
    return sentiment_percentages

if __name__ == "__main__":
    # Fetch every comment of one fixed video and print the sentiment shares.
    video_id = "YlvcFJOE-OE"
    comments = get_video_comments(video_id)
    sentiment_percentages = analyze_sentiment(comments)

    print("Sentiment Analysis Results:")
    report_lines = (
        ("Positive Comments Percentage:", 'positive'),
        ("Negative Comments Percentage:", 'negative'),
        ("Neutral Comments Percentage:", 'neutral'),
    )
    for heading, key in report_lines:
        print(heading, sentiment_percentages[key], "%")
 