In [None]:
import os
import googleapiclient.discovery
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import re
from urllib.parse import urlparse
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
import json
import urllib.request
import random
import pickle
import random
from urllib.parse import urlparse
from urllib.parse import parse_qs

In [None]:
# Fetch the NLTK resources used below: the English stop-word list read into
# `stop_words`, and the Punkt tokenizer data that word_tokenize relies on.
nltk.download("stopwords")
nltk.download("punkt")
# NLTK 3.8.2+ loads the tokenizer from "punkt_tab" rather than the pickled
# "punkt" models. On older releases this id is unknown and the call simply
# reports the error and returns False, so requesting it is harmless.
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# English stop words held in a set, so the membership test performed for every
# token in remove_stopwords() is a hash lookup rather than a list scan.
stop_words = {*stopwords.words("english")}

In [None]:
# Shared bag-of-words vectorizer: create_model() fits it on the training
# comments and analyze_video() reuses the same instance to encode new comments.
# A `global` statement has no effect at module scope, so a plain assignment is
# all that is needed to make the name visible to the functions below.
vect = CountVectorizer()

In [None]:
def get_videoId(video_link=None):
    """Extract the YouTube video id from a video URL.

    Recognised forms:
      * watch URLs carrying the id in the query string (``...?v=<id>``)
      * ``youtu.be/<id>`` short links
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths

    Parameters
    ----------
    video_link : str or None
        URL of the video.

    Returns
    -------
    str or None
        The video id, or None (after printing a message) when no link was
        given or no id could be found in it.
    """
    if video_link is None:
        print("Please provide a valid video Id.")
        print("Exiting ....")
        return None

    url_data = urlparse(video_link)

    # Standard watch URL: the id is the first value of the `v` query parameter.
    # .get() avoids the KeyError a missing parameter would otherwise raise.
    query_ids = parse_qs(url_data.query).get('v')
    if query_ids:
        return query_ids[0]

    # Otherwise the id, if any, is carried in the path.
    path_parts = [part for part in url_data.path.split('/') if part]
    host = (url_data.hostname or '').lower()
    if host in ('youtu.be', 'www.youtu.be'):
        if path_parts:
            return path_parts[0]
    elif len(path_parts) >= 2 and path_parts[0] in ('embed', 'shorts', 'live', 'v'):
        return path_parts[1]

    print("Please provide a valid video Id.")
    print("Exiting ....")
    return None

In [None]:
def remove_stopwords(line):
    """Return `line` with every token found in `stop_words` removed.

    The text is split with NLTK's word_tokenize and the surviving tokens are
    re-joined with single spaces. The comparison is case-sensitive, so a token
    only matches if it appears in `stop_words` exactly as tokenized.
    """
    kept_tokens = []
    for token in word_tokenize(line):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
def add_polarity(comments):
    """Score, shuffle and normalise a frame of comments.

    Parameters
    ----------
    comments : pandas.DataFrame
        Must contain a text column named ``Comments``.

    Returns
    -------
    pandas.DataFrame
        A new, row-shuffled frame with a fresh 0..n-1 index (the incoming
        index is dropped) and these columns added:

        * ``polarity``      - TextBlob sentiment polarity of the original text
        * ``pol_cat``       - 1 where polarity > 0, -1 where polarity <= 0
        * ``stop_comments`` - lower-cased text with stop words removed

        ``Comments`` itself is lower-cased in the returned frame. The frame
        passed in is left unmodified.
    """
    # assign() builds a new frame, so the caller's DataFrame is not changed
    # as a side effect of scoring.
    comments = comments.assign(
        polarity=comments['Comments'].apply(lambda x: TextBlob(x).sentiment.polarity)
    )

    # Shuffle the rows; the previous index is discarded.
    comments = comments.sample(frac=1).reset_index(drop=True)

    # Use .loc with a boolean mask rather than chained indexing
    # (frame['col'][mask] = value), which raises SettingWithCopyWarning and
    # does not write through at all when pandas Copy-on-Write is enabled.
    comments['pol_cat'] = 0
    comments.loc[comments['polarity'] > 0, 'pol_cat'] = 1
    comments.loc[comments['polarity'] <= 0, 'pol_cat'] = -1

    # Lower-case the text so it can match the lower-case stop-word list.
    comments['Comments'] = comments['Comments'].str.lower()

    # Removing stopwords
    comments['stop_comments'] = comments['Comments'].apply(remove_stopwords)
    return comments

In [None]:
def create_dataframe(response, video_id, training_data = False):
    """Turn a commentThreads API response into a processed comments frame.

    For every entry of ``response["items"]`` the top-level comment's author
    display name and original text are collected. The texts form a
    single-column ``Comments`` frame indexed by author, which is then passed
    through add_polarity().

    When ``training_data`` is false the processed frame is also written to
    ``video_comments_<video_id>.csv`` in the working directory. The processed
    frame is returned in both cases.
    """
    top_level = [
        thread["snippet"]["topLevelComment"]["snippet"]
        for thread in response["items"]
    ]
    authors = [snippet["authorDisplayName"] for snippet in top_level]
    texts = [snippet["textOriginal"] for snippet in top_level]

    frame = add_polarity(pd.DataFrame(texts, index=authors, columns=["Comments"]))

    # Only per-video analysis runs are cached on disk; training fetches are not.
    if not training_data:
        frame.to_csv(f"video_comments_{video_id}.csv")
    return frame

In [None]:
def get_comments(video_id, training_data = False):
    """Return the processed comments of a video, using an on-disk cache.

    If ``video_comments_<video_id>.csv`` exists in the working directory it is
    loaded and returned. Otherwise up to 1000 top-level comment threads are
    fetched from the YouTube Data API (ordered by relevance) and handed to
    create_dataframe().

    Parameters
    ----------
    video_id : str
        YouTube video id.
    training_data : bool
        Forwarded to create_dataframe().

    Returns
    -------
    pandas.DataFrame

    Notes
    -----
    The API key is read from the ``YOUTUBE_API_KEY`` environment variable,
    falling back to the literal placeholder ``"API_KEY"``.
    """
    name = "video_comments_" + video_id + ".csv"
    # Exact-name match against the directory listing: video ids are
    # case-sensitive, so a case-insensitive filesystem lookup could hit the
    # cache of a different video.
    if name in os.listdir():
        print("You are using video that you previously passed, fetching previous comments.")
        df = pd.read_csv(name)
        # to_csv stored the row index as an unnamed leading column; drop it so
        # the cached frame has the same columns as a freshly built one.
        df = df.drop(columns=[col for col in df.columns if str(col).startswith("Unnamed:")])
        # Empty strings are written as blank fields and read back as NaN,
        # which the vectorizer cannot process; restore them.
        for text_column in ("Comments", "stop_comments"):
            if text_column in df.columns:
                df[text_column] = df[text_column].fillna("")
        return df

    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    api_service_name = "youtube"
    api_version = "v3"
    API_KEY = os.environ.get("YOUTUBE_API_KEY", "API_KEY")

    youtube = googleapiclient.discovery.build(api_service_name, api_version, developerKey = API_KEY)

    # commentThreads.list documents maxResults as 1-100, so the 1000-comment
    # target is reached by following nextPageToken via list_next().
    page_size = 100
    max_comments = 1000

    threads = youtube.commentThreads()
    request = threads.list(
        part="id, snippet",
        maxResults=page_size,
        order="relevance",
        videoId= video_id
    )

    items = []
    response = {}
    while request is not None and len(items) < max_comments:
        response = request.execute()
        items.extend(response.get("items", []))
        # list_next returns None once there are no further pages.
        request = threads.list_next(request, response)

    # Present the merged pages to create_dataframe as one response.
    response = dict(response, items=items[:max_comments])

    return create_dataframe(response, video_id, training_data)

In [None]:
def save_model(model):
    """Pickle `model` to ``youtube_comment_analyzer.pkl`` in the working directory.

    An existing file with that name is overwritten.
    """
    target_path = 'youtube_comment_analyzer.pkl'
    with open(target_path, mode='wb') as sink:
        pickle.dump(model, sink)

In [None]:
def get_random_video_ids(count = 10):
    """Return up to `count` video ids used to gather training comments.

    Despite the name, the ids come from a fixed list and are returned in a
    fixed order, so results are the same on every call.

    Parameters
    ----------
    count : int or None
        Maximum number of ids to return. Values larger than the list return
        the whole list; zero or negative values return an empty list; None
        returns the whole list.

    Returns
    -------
    list of str
    """
    video_ids = ["09k7EUnx5sE", "rW5zJgsZZuk", "gAfYT6Qz_14", "HwLK9dBQn0g", "HgiiY9TLtX8", "eeHLyNFOXM4", "oSLz-iw_Oy4", "AC908sfmPao", "lJCUC0mRkPo", "lkDBImBAmN0"]

    if count is None:
        return video_ids
    # Clamp at zero so a negative count yields nothing rather than slicing
    # from the end of the list.
    return video_ids[:max(count, 0)]

In [None]:
def get_all_comments(video_ids):
    """Collect the processed comments of several videos into one frame.

    Each id is passed to get_comments() with ``training_data=True``. The
    combined frame is written to ``comments.csv`` in the working directory
    and returned.

    Parameters
    ----------
    video_ids : iterable of str

    Returns
    -------
    pandas.DataFrame
        All comments with a fresh 0..n-1 index. When `video_ids` is empty,
        an empty frame with the columns ``Comments``, ``stop_comments``,
        ``polarity`` and ``pol_cat``.
    """
    # Gather the per-video frames first and concatenate once, instead of
    # growing a frame inside the loop.
    frames = [get_comments(video_id, training_data = True) for video_id in video_ids]

    if frames:
        comments = pd.concat(frames, ignore_index=True)
    else:
        comments = pd.DataFrame(columns = ['Comments', 'stop_comments', 'polarity', 'pol_cat'])

    comments.to_csv("comments.csv")
    print("File saved successfully as comments.csv")
    # Callers assign the result, so the frame has to be returned.
    return comments

In [None]:
def create_model():
    """Train the comment-sentiment classifier and pickle it via save_model().

    Training data is read from ``comments.csv`` in the working directory. If
    that file is missing, comments for the ids from get_random_video_ids()
    are fetched first through get_all_comments(), which also writes the file.

    The module-level ``vect`` vectorizer is fitted on the training split as a
    side effect, a LogisticRegression model is fitted on the encoded texts,
    and the accuracy on the held-out 20% split is printed.

    Returns
    -------
    None
    """
    comments_path = "comments.csv"
    if comments_path not in os.listdir():
        print("Comments.csv not found.")
        print("Fetching random comments for training model.")
        comments = get_all_comments(get_random_video_ids(10))
        if comments is None:
            # get_all_comments always writes comments.csv, so fall back to the
            # file if the frame itself was not handed back.
            comments = pd.read_csv(comments_path)
    else:
        print("Comments.csv found")
        print("Reading comments.csv please wait!")
        comments = pd.read_csv(comments_path)

    # Blank texts come back from CSV as NaN, which CountVectorizer rejects.
    texts = comments['stop_comments'].fillna("").astype(str)
    # Labels can arrive as object dtype after concatenation; the classifier
    # needs a proper integer target.
    labels = comments['pol_cat'].astype(int)

    X_train,X_test,y_train,y_test = train_test_split(texts, labels, test_size = 0.2)
    tf_train = vect.fit_transform(X_train)
    tf_test = vect.transform(X_test)
    lr = LogisticRegression()
    lr.fit(tf_train,y_train)
    # Score on the held-out split: accuracy on the data the model was fitted
    # on says little about how it will do on unseen comments.
    print("Accuracy: ", int(lr.score(tf_test, y_test)*100), "%")
    save_model(lr)
    return

In [None]:
def _star_rating(count_positive, total):
    """Map the share of positive comments to a 1-5 star rating (total must be > 0).

    Each full fifth of positive comments earns one star: below 40% -> 1,
    40% up to 60% -> 2, 60% up to 80% -> 3, 80% up to 100% -> 4, 100% -> 5.
    """
    return max(1, min(5, (5 * count_positive) // total))


def analyze_video(video_link=None):
    """Print a sentiment summary and star rating for a YouTube video.

    Steps: resolve the video id with get_videoId(), load its comments with
    get_comments(), make sure a trained model is available (training one with
    create_model() when necessary), classify every comment, and print an
    overall review followed by an ``n/5 star.`` rating.

    Parameters
    ----------
    video_link : str or None
        URL of the video to analyse.

    Returns
    -------
    None
        All results are printed.
    """
    # 'cls' only exists on Windows; use the matching command elsewhere.
    os.system('cls' if os.name == 'nt' else 'clear')
    if video_link is None:
        print("Please provide a valid video Id.")
        print("Exiting ....")
        return None
    video_id = get_videoId(video_link)
    if video_id is None:
        return None

    comments = get_comments(video_id)
    if len(comments) == 0:
        print("No comments found for this video.")
        return None

    model_path = "youtube_comment_analyzer.pkl"
    if model_path not in os.listdir():
        print("Model is not found. Creating a new model. Please wait ...")
        create_model()
        print("Model created successully.")
    elif not hasattr(vect, "vocabulary_"):
        # The pickle holds only the classifier. Without the vectorizer fitted
        # in this session its features cannot be reproduced, so retrain.
        print("Found existing model, but the vectorizer is not fitted in this session. Re-training. Please wait ...")
        create_model()
        print("Model created successully.")
    else:
        print("Found existing model, trying to work on that.")

    # NOTE: pickle.load can execute arbitrary code; only load a model file
    # that was written by this notebook.
    with open(model_path , 'rb') as f:
        model = pickle.load(f)

    # Texts loaded from the CSV cache can contain NaN for comments that were
    # emptied by stop-word removal; the vectorizer needs strings.
    texts = comments["stop_comments"].fillna("").astype(str)
    predicted = list(model.predict(vect.transform(texts)))

    count_positive = predicted.count(1)
    count_negative = predicted.count(-1)

    total = count_positive + count_negative
    if total == 0:
        print("No comments could be classified for this video.")
        return None

    review = ""
    if count_positive > count_negative:
        review = "This youtube video have positive reviews"
    elif count_positive == count_negative:
        review = "This youtube video has average reviews. It is nither bad not good"
    else:
        review = "This youtube video have negative reviews"

    # The rating is computed from the exact positive share, so boundary values
    # and low shares no longer fall through to the top rating.
    stars = _star_rating(count_positive, total)
    print(review + " " + f"{stars}/5 star.")

In [None]:
# Run the full pipeline on a sample video; analyze_video() prints its progress
# messages, the overall review and the star rating rather than returning them.
analyze_video("https://www.youtube.com/watch?v=09k7EUnx5sE")

You are using video that you previously passed, fetching previous comments.
Model is not found. Creating a new model. Please wait ...
Comments.csv found
Reading comments.csv please wait!
Accuracy:  98 %
Model created successully.
This youtube video have positive reviews 4/5 star.
