# YouTube Comments Sentiment Analysis 

### Import packages

In [121]:
# Basics
import pandas as pd; import os
import csv; import numpy as np
import re; import warnings
warnings.filterwarnings('ignore')

## Reading Data

### Reading Testing YouTube Video Comments

data.csv files has comments of youtube video -- https://www.youtube.com/watch?v=kfVsfOSbJY0

In [122]:
prediction_comments = pd.read_csv('Comments.csv', delimiter=",", encoding='utf-8', engine='python')
prediction_comments = prediction_comments.iloc[:,:1]
prediction_comments.columns=['comment']
prediction_comments.head()

Unnamed: 0,comment
0,Get Fact Right: In India School have asked stu...
1,All these laws seem to be created to suppress ...
2,This will change something because its the fir...
3,How is it same in case of India??? Did the sta...
4,Agreed. Keep the fight alive ladies. Remember...


### Reading Pre-Labeled YouTube Video Comments

here we are taking pre-labeled comments of 5 popular youtube videos for training & testing

In [123]:
# training data
okgo = pd.read_csv('OKGO.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
trump = pd.read_csv('trump.csv', delimiter=",", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python')
swift = pd.read_csv('TaylorSwift.csv', delimiter=",", skiprows=2, nrows=180, encoding='utf-8', engine='python')
royal = pd.read_csv('RoyalWedding.csv', delimiter=",", skiprows=2, nrows=61, encoding='utf-8', engine='python')
paul = pd.read_csv('LoganPaul.csv', delimiter=",", skiprows=2, nrows=200, encoding='utf-8', engine='python')

### Reading Pre-Labeled Tweets & Blog Comments

In [124]:
blogs = pd.read_csv('Kagel.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets = pd.read_csv('twitter.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data

## Data Preprocessing

In [125]:
# clean dataframes
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()
tweets.head()

Unnamed: 0,Sentiment,TweetText
0,positive,Now all @Apple has to do is get swype on the i...
1,positive,@Apple will be adding more carrier support to ...
2,positive,Hilarious @youtube video - guy does a duet wit...
3,positive,@RIM you made it too easy for me to switch to ...
4,positive,I just realized that the reason I got into twi...


In [126]:
def fix_cols(DF):
    DF = DF.iloc[:,:2]
    DF.columns = ["label", "comment"]
    return DF

In [127]:
okgo = fix_cols(okgo)
trump = fix_cols(trump)
swift = fix_cols(swift)
royal = fix_cols(royal)
paul = fix_cols(paul)
tweets = fix_cols(tweets)


okgo.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


In [128]:
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')

In [129]:
tweets = fix_cols(tweets)
blogs = fix_cols(blogs)

tweets.head()

Unnamed: 0,label,comment
0,1.0,Now all @Apple has to do is get swype on the i...
1,1.0,@Apple will be adding more carrier support to ...
2,1.0,Hilarious @youtube video - guy does a duet wit...
3,1.0,@RIM you made it too easy for me to switch to ...
4,1.0,I just realized that the reason I got into twi...


### Create Datasets

In [130]:
yt_comments = pd.concat([okgo, trump, swift, royal, paul], ignore_index=True)
yt_comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


In [131]:
non_yt_comments = pd.concat([blogs, tweets], ignore_index=True)
non_yt_comments.head()

Unnamed: 0,label,comment
0,1.0,i liked the Da Vinci Code a lot
1,1.0,i liked the Da Vinci Code a lot
2,1.0,I liked the Da Vinci Code but it ultimatly di...
3,1.0,that's not even an exaggeration ) and at midn...
4,1.0,I loved the Da Vinci Code but now I want some...


In [132]:
comments = pd.concat([yt_comments, non_yt_comments], ignore_index=True)
comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand's papers from.\rBut -No o...
1,0.0,ÒYour paper cut balance is: \r-£25279102771Ó
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


### Remove Non-Alphabetic Characters (including numbers)

In [133]:
def convert_to_string(DF):
    DF["comment"]= DF["comment"].astype(str) 

In [134]:
convert_to_string(comments)

In [135]:
def cleanerFn(b):
    # keeps only words with alphabetic characters in comments
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)

In [136]:
cleanerFn(comments)
comments.head()

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand s papers from But No on...
1,0.0,Your paper cut balance is
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...
3,1.0,Blowing my mind yet again
4,0.0,Should have gone with Dunder Mifflin


### Natural Language Processing

In [137]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [138]:
sw = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

#### Tokenization, Remove Stop Words, Lemmatization & Stemming

In [139]:
def nlpFunction(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_tok_str"] = DF["com_stem"].apply(', '.join)
    DF["com_full"] = DF["com_remv"].apply(' '.join)
    return DF

In [140]:
comments = nlpFunction(comments)
comments.head()

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_tok_str,com_full
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...","everyon, know, brand, paper, one, know, welfar...",everyone knows brand papers one knows welfare ...
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]","paper, cut, balanc",paper cut balance
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","oh, shit, saw, front, page, love, song",oh shit saw front page love song
3,1.0,Blowing my mind yet again,"[blowing, my, mind, yet, again]","[blowing, mind, yet]","[blowing, mind, yet]","[blow, mind, yet]","blow, mind, yet",blowing mind yet
4,0.0,Should have gone with Dunder Mifflin,"[should, have, gone, with, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","gone, dunder, mifflin",gone dunder mifflin


In [141]:
def drop_cols_after_nlp(comments):
    comments = comments.drop(columns = ['comment', 'com_token', 'com_remv', 'com_lemma', 'com_stem', 'com_tok_str'], axis = 1)
    return comments
comments = drop_cols_after_nlp(comments)
comments.head()

Unnamed: 0,label,com_full
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [142]:
comments.rename(columns = {'com_full': 'comment'}, inplace=True)
comments.head()

Unnamed: 0,label,comment
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [143]:
def remove_missing_vals(comments): 
    comments['comment'] = comments['comment'].str.strip()
    comments = comments[comments.comment != 'nan'] # remove nan values from data
    comments = comments[comments.comment != '']
    
remove_missing_vals(comments)

In [144]:
comments.head()

Unnamed: 0,label,comment
0,-1.0,everyone knows brand papers one knows welfare ...
1,0.0,paper cut balance
2,1.0,oh shit saw front page love song
3,1.0,blowing mind yet
4,0.0,gone dunder mifflin


In [145]:
comments['label'].isna().sum()

2355

In [146]:
comments = comments[comments['label'].notna()]
comments['label'].isna().sum()

0

In [147]:
len(comments)

14830

In [148]:
X = comments['comment']
y = comments.label

In [149]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=53, test_size=0.25)

### Vectorize the tweets
<p>We have the training and testing data all set up, but we need to create vectorized representations of the tweets in order to apply machine learning.</p>
<p>To do so, we will utilize the <code>CountVectorizer</code> and <code>TfidfVectorizer</code> classes which we will first need to fit to the data.</p>
<p>Once this is complete, we can start modeling with the new vectorized tweets!</p>

In [150]:
# Initialize count vectorizer
count_vectorizer = CountVectorizer(stop_words='english', 
                                   min_df=0.05, max_df=0.9)

# Create count train and test variables
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Initialize tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', 
                                   min_df=0.05, max_df=0.9)

# Create tfidf train and test variables
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

## Model Building

In [156]:
# Set seed for reproducibility
import random; random.seed(5)

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

### Multinomial Naive-Bayes Model
Training a multinomial naive Bayes model
<p>Now that we have the data in vectorized form, we can train the first model. Investigate using the Multinomial Naive Bayes model with both the <code>CountVectorizer</code> and <code>TfidfVectorizer</code> data. Which do will perform better? How come?</p>
<p>To assess the accuracies, we will print the test sets accuracy scores for both models.</p>

In [157]:
# Create a MulitnomialNB model
tfidf_nb = MultinomialNB()
tfidf_nb.fit(tfidf_train,y_train)
# Run predict on your TF-IDF test data to get your predictions
tfidf_nb_pred = tfidf_nb.predict(tfidf_test)

# Calculate the accuracy of your predictions
tfidf_nb_score = metrics.accuracy_score(y_test,tfidf_nb_pred)

# Create a MulitnomialNB model
count_nb = MultinomialNB()
count_nb.fit(count_train,y_train)

# Run predict on your count test data to get your predictions
count_nb_pred = count_nb.predict(count_test)

# Calculate the accuracy of your predictions
count_nb_score = metrics.accuracy_score(count_nb_pred,y_test)

print('NaiveBayes Tfidf Score: ', tfidf_nb_score)
print('NaiveBayes Count Score: ', count_nb_score)

NaiveBayes Tfidf Score:  0.7909924487594391
NaiveBayes Count Score:  0.7831715210355987


### SVC

In [162]:
# Create a SVM model
tfidf_svc = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo')

tfidf_svc.fit(tfidf_train,y_train)
# Run predict on your tfidf test data to get your predictions
tfidf_svc_pred = tfidf_svc.predict(tfidf_test)

# Calculate your accuracy using the metrics module
tfidf_svc_score = metrics.accuracy_score(y_test,tfidf_svc_pred)

print("LinearSVC Score:   %0.3f" % tfidf_svc_score)

LinearSVC Score:   0.798


In [160]:
X_train[10]

'waste ink paper'

In [155]:
sorted(tfidf_svc_pred)

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [98]:
# Lets use SVC to predict on our youtube video comments
prediction_comments.head()

Unnamed: 0,comment
0,Get Fact Right: In India School have asked stu...
1,All these laws seem to be created to suppress ...
2,This will change something because its the fir...
3,How is it same in case of India??? Did the sta...
4,Agreed. Keep the fight alive ladies. Remember...


In [99]:
len(prediction_comments['comment'])

201

In [100]:
convert_to_string(prediction_comments)
cleanerFn(prediction_comments)
prediction_comments = nlpFunction(prediction_comments)
prediction_comments = drop_cols_after_nlp(prediction_comments)
prediction_comments.head()

Unnamed: 0,com_full
0,get fact right india school asked students wea...
1,laws seem created suppress women shame women t...
2,change something first time persians kurds rai...
3,case india state karnataka govt passed law uni...
4,agreed keep fight alive ladies remember win st...


In [101]:
prediction_comments.rename(columns = {'com_full': 'comment'}, inplace=True)

In [102]:
remove_missing_vals(prediction_comments)
prediction_comments.head()

Unnamed: 0,comment
0,get fact right india school asked students wea...
1,laws seem created suppress women shame women t...
2,change something first time persians kurds rai...
3,case india state karnataka govt passed law uni...
4,agreed keep fight alive ladies remember win st...


In [103]:
tfidf_pred = tfidf_vectorizer.transform(prediction_comments['comment'])
tfidf_svc_pred = tfidf_svc.predict(tfidf_pred)

In [109]:
neutral = (tfidf_svc_pred == 0.0).sum()
positive = (tfidf_svc_pred == 1.0).sum()
negative = (tfidf_svc_pred < 0).sum()

len(tfidf_svc_pred)

201

In [110]:
print(neutral, positive, negative)

196 5 0


In [106]:
print("Good video" if positive > negative else "Bad video")

Good video
