# Loading the model and libraries

In [1]:
import requests
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax

# Define API key and model details
API_KEY = 'youtube_api'
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'


# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Searching based on either: 
### 1) search term 
### 2) Manually provide links to the youtube videos

In [2]:
# SEARCH-TERM BASED CODE: Enter the search term, specify the max-results(videos) to extract the comments from

params = (
    ('key', 'youtube_api'),
    ('part', 'snippet'),
    ('type', 'video'),
    ('maxResults', 1), # specify the maxresults
    ('q', 'The rise of India'), #enter the search term to search the youtube
    ('relevanceLanguage', 'en'), 
)

response = requests.get('https://www.googleapis.com/youtube/v3/search', params=params)

response_json=response.json()



channel_ids = []
videoid_name = {}
for i in range(len(response_json['items'])):
    channel_ids.append(response_json['items'][i]['snippet']['channelId'])
    videoid_name[response_json['items'][i]['snippet']['title']] = response_json['items'][i]['id']['videoId']

# Print the extracted information
print("Channel IDs:", channel_ids)
print("Videoid Name:", videoid_name)


Channel IDs: ['UCPwnLbOpvwf3hCx9SedMrsw']
Videoid Name: {'The rise of India - Epic version': 'HAzWxZGrqFA'}


In [3]:
# Or manually provide the links yourself

import requests

# Replace with your actual YouTube API key
API_KEY = 'youtube_api'

# Manually provide a list of video URLs
video_urls = ['https://www.youtube.com/watch?v=NZyRGM-VvP4', 'https://www.youtube.com/watch?v=sRX7wNsjAKg']

# Initialize channel_ids and videoid_name
channel_ids = []
videoid_name = {}

# Loop through each video URL
for video_url in video_urls:
    # Parse the video ID from the URL
    video_id = video_url.split('v=')[1]

    # Use the video ID to fetch video details
    params = {
        'key': API_KEY,
        'part': 'snippet',
        'id': video_id,
    }

    response = requests.get('https://www.googleapis.com/youtube/v3/videos', params=params)
    response_json = response.json()

    # Extract channel ID and video name
    channel_id = response_json['items'][0]['snippet']['channelId']
    video_name = response_json['items'][0]['snippet']['title']

    # Append channel ID to channel_ids list
    channel_ids.append(channel_id)

    # Add video name and video ID to videoid_name dictionary
    videoid_name[video_name] = video_id

# Print the extracted information
print("Channel IDs:", channel_ids)
print("Videoid Name:", videoid_name)


Channel IDs: ['UC-sbTMR8bzv0YIdKNhlyLDw', 'UC-sbTMR8bzv0YIdKNhlyLDw']
Videoid Name: {'15 LEGIT Amazon FBA Hacks // From One of the Smartest Amazon Sellers - Jon Derkits': 'NZyRGM-VvP4', '3 Paths to Amazon FBA Success in 2024 (and beyond)': 'sRX7wNsjAKg'}


#### A filter to only select the english videos

In [5]:
import langid
videos_required=[]
for name in videoid_name.keys():
    lang = langid.classify(name)
    #print("Lang: ", lang, "Name: ", name)
    if lang[0] == 'en':
        videos_required.append(videoid_name.get(name))


print("Number of English videos: ", len(videos_required))
print(videos_required)

Number of English videos:  2
['NZyRGM-VvP4', 'sRX7wNsjAKg']


##### Preprocess function

In [8]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

#### Passing each individual comment to this function and the sentiment score is calculated

In [9]:
# Function to analyze sentiment
def analyze_sentiment(text):
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    output = model(**encoded_input)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores, axis=0)

    # Labels corresponding to sentiment classes
    labels = ["negative", "neutral", "positive"]

    json_sent = {
        "label": labels[np.argmax(scores)],
        "probability": {
            "neg": scores[0],  # Negative sentiment score
            "neutral": scores[1],  # Neutral sentiment score
            "pos": scores[2]  # Positive sentiment score
        }
    }

    return json_sent


In [10]:
# Initialize DataFrame
df = pd.DataFrame(columns=['textDisplay', 'video_id', 'label', 'pos', 'neg', 'neutral', 'numbering'])


# Initialize a counter for the DataFrame index
index_counter = 0

# Iterate through videos and comments
for i, video in enumerate(videos_required):
    params_v = {
        'key': API_KEY,
        'part': 'snippet',
        'videoId': video,
        'maxResults': '100',
        
    }

    response_v = requests.get('https://www.googleapis.com/youtube/v3/commentThreads', params=params_v)
    response_json_v = response_v.json()
    print("This is response_v: ", response_json_v)


        # Initialize the numbering counter for each video
    numbering_counter = 1
    
    for j, item in enumerate(response_json_v['items']):
        comment = item['snippet']['topLevelComment']['snippet']['textOriginal']
        vid_id = item['snippet']['topLevelComment']['snippet']['videoId']
        print("Here is the Comment: ", comment)

        json_sent = analyze_sentiment(comment)
        print("SCORE IS RETURNED", json_sent)

        lst = [comment, vid_id, json_sent['label'], json_sent["probability"]["pos"], json_sent["probability"]["neg"], json_sent["probability"]["neutral"], numbering_counter]
        print("Updated lst dict", lst)

        df.loc[index_counter] = lst

        print("appended to the df")
        numbering_counter += 1  # Increment the numbering_counter

        print("printing df", df)
        index_counter += 1  # Increment the index_counter



This is response_v:  {'kind': 'youtube#commentThreadListResponse', 'etag': '5cAwXA7eym_g2AWdCFMGW11lpCA', 'pageInfo': {'totalResults': 18, 'resultsPerPage': 100}, 'items': [{'kind': 'youtube#commentThread', 'etag': 'W7-e8au_b7ukpEK13iWIht3Esus', 'id': 'UgwjeFBtsXjGbeOoIDF4AaABAg', 'snippet': {'channelId': 'UC-sbTMR8bzv0YIdKNhlyLDw', 'videoId': 'NZyRGM-VvP4', 'topLevelComment': {'kind': 'youtube#comment', 'etag': 'kVb9w3V78wp8ycM8ZXUyviEhE38', 'id': 'UgwjeFBtsXjGbeOoIDF4AaABAg', 'snippet': {'channelId': 'UC-sbTMR8bzv0YIdKNhlyLDw', 'videoId': 'NZyRGM-VvP4', 'textDisplay': 'Get the Slides 👉  <a href="https://bit.ly/15amazonfbahacks">https://bit.ly/15amazonfbahacks</a><br>Subscribe to Jon&#39;s Newsletter (seriously...do it)  👉 <a href="https://bit.ly/derkits">https://bit.ly/derkits</a><br>Jon on Twitter 👉 <a href="https://twitter.com/guyfosel">https://twitter.com/guyfosel</a><br>Jon&#39;s Free Course 👉  <a href="https://zaap.bio/jon-derkits/amazon-p..">https://zaap.bio/jon-derkits/amazon-

##### Save the output in a separate csv file

In [11]:
df.to_csv('output.csv', index=True)
df.head(10)

Unnamed: 0,textDisplay,video_id,label,pos,neg,neutral,numbering
0,Get the Slides 👉 https://bit.ly/15amazonfbaha...,NZyRGM-VvP4,neutral,0.426034,0.003782,0.570184,1
1,@adam @JonDerkits ... I told my Mastermind on ...,NZyRGM-VvP4,positive,0.829468,0.009579,0.160953,2
2,I almost gave up on learning anything new on Y...,NZyRGM-VvP4,positive,0.719775,0.059298,0.220927,3
3,Perfect!,NZyRGM-VvP4,positive,0.945496,0.010617,0.043887,4
4,This was a big boy video. I noted down like 8 ...,NZyRGM-VvP4,positive,0.885591,0.010211,0.104197,5
5,"Thanks, great video.\r\nRegarding hack #9. I d...",NZyRGM-VvP4,neutral,0.168167,0.203229,0.628604,6
6,This was another great session! Thank you!\nI ...,NZyRGM-VvP4,positive,0.879581,0.013089,0.10733,7
7,"Amazing tips! And Adam, thanks for another gre...",NZyRGM-VvP4,positive,0.986128,0.004672,0.0092,8
8,"Great interview Adam, lots of valuable stuff. ...",NZyRGM-VvP4,positive,0.901731,0.005078,0.093191,9
9,I’ve been in this game for about 12 years. I o...,NZyRGM-VvP4,positive,0.909592,0.007303,0.083105,10


##### Calculating the overall sentiment of the comment section

In [12]:
# Calculate weighted average for pos, neg, and neutral columns
weighted_avg_pos = df['pos'].mean()
print("weighted_avg_pos ", weighted_avg_pos)
weighted_avg_neg = df['neg'].mean()
print("weighted_avg_neg ", weighted_avg_neg)

weighted_avg_neutral = df['neutral'].mean()
print("weighted_avg_neutral ", weighted_avg_neutral)


# Determine the label based on the greatest weighted average
max_weighted_avg = max(weighted_avg_pos, weighted_avg_neg, weighted_avg_neutral)
print("max_weighted_avg ",max_weighted_avg)
label = None
if max_weighted_avg == weighted_avg_pos:
    label = 'positive'
elif max_weighted_avg == weighted_avg_neg:
    label = 'negative'
else:
    label = 'neutral'

# Print the determined label
print(f"The Overall determined sentiment label is: {label}")


weighted_avg_pos  0.7152702
weighted_avg_neg  0.09695927
weighted_avg_neutral  0.18777053
max_weighted_avg  0.7152702
The Overall determined sentiment label is: positive


##### Calculating the absolute number of positive/negative/neutral comments

In [13]:
# Calculate the total number of comments
total_comments = len(df)

# Count the number of positive, negative, and neutral comments
num_positive_comments = df[df['label'] == 'positive']['textDisplay'].count()
num_negative_comments = df[df['label'] == 'negative']['textDisplay'].count()
num_neutral_comments = df[df['label'] == 'neutral']['textDisplay'].count()

# Print the results
print(f"Total Comments: {total_comments}")
print(f"Number of Positive Comments out of {total_comments} = {num_positive_comments}")
print(f"Number of Negative Comments out of {total_comments} = {num_negative_comments}")
print(f"Number of Neutral Comments out of {total_comments} = {num_neutral_comments}")


Total Comments: 39
Number of Positive Comments out of 39 = 29
Number of Negative Comments out of 39 = 3
Number of Neutral Comments out of 39 = 7


##### Calculating the percentage of positive/negative/neutral comments

In [14]:
# Calculate the percentages of positive, negative, and neutral comments
percentage_positive = (num_positive_comments / total_comments) * 100
percentage_negative = (num_negative_comments / total_comments) * 100
percentage_neutral = (num_neutral_comments / total_comments) * 100

# Print the results
print(f"Percentage of Positive Comments: {percentage_positive:.2f}%")
print(f"Percentage of Negative Comments: {percentage_negative:.2f}%")
print(f"Percentage of Neutral Comments: {percentage_neutral:.2f}%")


Percentage of Positive Comments: 74.36%
Percentage of Negative Comments: 7.69%
Percentage of Neutral Comments: 17.95%


##### The most common words that appeared in the positive comments

In [15]:
from nltk import FreqDist
import operator

import re
#the words that appear he most in positive reviews
import nltk
porter = nltk.PorterStemmer()
list_pos=[]
for i in range(len(df.loc[df['label'] == 'positive'])):
    list_pos.append(df.loc[df['label'] == 'positive']["textDisplay"].iloc[i])
lst_words_pos = []
for line in list_pos:
    text_pos = re.split('\n| |\?|\!|\:|\"|\(|\)|\...|\;',line)
    for word in text_pos:
        if (len(word)>3 and not word.startswith('@') and not word.startswith('#') and word != 'RT'):
            lst_words_pos.append(porter.stem(word.lower()))


dist_pos = FreqDist(lst_words_pos) 
sorted_dist_pos = sorted(dist_pos.items(), key=operator.itemgetter(1), reverse=True)
sorted_dist_pos[:50]


[('great', 10),
 ('video', 9),
 ('thi', 8),
 ('thank', 6),
 ('love', 6),
 ('your', 4),
 ('been', 4),
 ('it’', 4),
 ('that', 4),
 ('anoth', 4),
 ('adam,', 4),
 ('then', 3),
 ('with', 3),
 ('hank', 3),
 ('amazon', 3),
 ('about', 3),
 ('adam', 3),
 ('more', 3),
 ('product', 3),
 ('strike', 2),
 ('thru', 2),
 ('hack', 2),
 ('work', 2),
 ('review', 2),
 ('next', 2),
 ('explain', 2),
 ('miss', 2),
 ('price', 2),
 ('down', 2),
 ('nice', 2),
 ('tri', 2),
 ('awesom', 2),
 ('good', 2),
 ('when', 2),
 ('valuabl', 2),
 ('brand', 2),
 ('have', 2),
 ('such', 2),
 ('these', 2),
 ('what', 2),
 ('couldn’t', 2),
 ('agre', 2),
 ('make', 2),
 ('relaunch', 2),
 ('feel', 2),
 ('let', 2),
 ('fire', 2),
 ('veri', 2),
 ('told', 1),
 ('mastermind', 1)]

##### A list of the common words that appeared in the negative comments

In [16]:
list_neg=[]
for i in range(len(df.loc[df['label'] == 'negative'])):
    list_neg.append(df.loc[df['label'] == 'negative']["textDisplay"].iloc[i])
lst_words_neg = []
for line in list_neg:
    text_neg = re.split('\n| |\?|\!|\:|\"|\(|\)|\...|\;',line)
    for word in text_neg:
        if (len(word)>3 and not word.startswith('@') and not word.startswith('#') and word != 'RT'):
            lst_words_neg.append(porter.stem(word.lower()))
dist_neg = FreqDist(lst_words_neg) 
sorted_dist_neg = sorted(dist_neg.items(), key=operator.itemgetter(1), reverse=True)
sorted_dist_neg[:50]

[('launch', 2),
 ('long', 1),
 ('wait', 1),
 ('fuck', 1),
 ('lord', 1),
 ('boss', 1),
 ('it’', 1),
 ('expens', 1),
 ('days,', 1),
 ('mayb', 1),
 ('design', 1),
 ('patent', 1),
 ('help', 1)]