In [7]:
import numpy as np 
import pandas as pd 
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
%matplotlib notebook

import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


from lime.lime_text import LimeTextExplainer
from tqdm import tqdm
import string
import random
import operator
import seaborn as sns
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from statistics import *
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
import nltk


# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
from numpy.random import seed
try:
    # TensorFlow 1.x exposes the seeding helper at the package root
    from tensorflow import set_random_seed
except ImportError:
    # TensorFlow 2.x moved it to tensorflow.random.set_seed
    from tensorflow.random import set_seed as set_random_seed
set_random_seed(2)
seed(1)

import warnings
warnings.filterwarnings("ignore")

# **EDA/Data Cleaning**

In [8]:
#list of data that we have in the workspace

print(os.listdir("../input"))

In [9]:
# Trending-video tables for three countries that use English as an official language
british_youtube, canadian_youtube, us_youtube = map(
    pd.read_csv,
    ("../input/GBvideos.csv", "../input/CAvideos.csv", "../input/USvideos.csv"),
)


In [10]:
canadian_youtube.columns

First of all, let's take a look at data :)

In [11]:
canadian_youtube.head()

In [12]:
british_youtube.head()

In [13]:
us_youtube.head()

In [14]:
# Stack the three country tables (Canada, then Britain, then US) into one frame.
# ignore_index=True gives every row a fresh unique label; without it each source
# frame's own row labels are carried over, so the combined index contains repeats.
three_countries = pd.concat(
    [canadian_youtube, british_youtube, us_youtube],
    ignore_index=True,
)
three_countries.shape


In [15]:
#Check duplicate. It is always good to check whether there are some duplicates in dataset!
three_countries.video_id.value_counts()[:10]

In [16]:
# Keep one row per video_id: the first occurrence in the combined frame's row order
three_countries = three_countries.drop_duplicates(subset='video_id', keep='first')


In [17]:
three_countries.video_id.value_counts()[:10]

In [18]:
#need to be decoded 
three_countries.category_id.head()

In [19]:
import json

def category_name(path):
    """Build a category-id -> category-title lookup from a category JSON file.

    Parameters
    ----------
    path : str
        Path to a JSON file whose top-level ``"items"`` list holds objects with
        an ``"id"`` entry and a ``"snippet"`` object containing ``"title"``.

    Returns
    -------
    dict
        Maps each item's ``id`` to its ``snippet.title``. If an id occurs more
        than once, the last occurrence wins.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as json_file:
        data = json.load(json_file)
    return {row['id']: row['snippet']['title'] for row in data['items']}
        
    

In [20]:
category_name("../input/CA_category_id.json")

In [21]:
category_list=category_name("../input/CA_category_id.json")
# Translate every row's category_id into its title. The ids are stringified before the
# lookup; ids missing from the lookup become None.
# The comprehension keeps its loop variable local, so the `category_name` function is
# not overwritten at module level and this cell can be re-run safely.
category_names = [category_list.get(str(category_id)) for category_id in three_countries.category_id]

three_countries['category_names']=category_names

In [22]:
#now, we have category name :)
three_countries['category_names'].head()

In [23]:
three_countries.info()

# **Finding Outlier Youtube Video**

The dataset already consists of trending YouTube videos. However, I am interested in the most popular ones among them. I will use the number of views as the measure of a video's popularity. In this analysis, a popular video is one whose view count is more than 1.5 interquartile ranges (IQRs) above the third quartile.

In [24]:
#give you a report of distribution of data/correlation
import pandas_profiling as pp

pp.ProfileReport(three_countries[['views','likes','dislikes','comment_count']])

In [25]:

# Quartiles of the view counts and their spread
Q1 = three_countries['views'].quantile(0.25)
Q3 = three_countries['views'].quantile(0.75)
IQR = Q3 - Q1
print(IQR)

# Rows whose views lie above the upper fence Q3 + 1.5 * IQR
# (snapshot taken before the 'popular' column exists)
popular_videos = three_countries[three_countries['views'].gt(Q3 + 1.5 * IQR)]

# 1 = above the upper fence, 0 = everything else
three_countries['popular'] = three_countries['views'].gt(Q3 + 1.5 * IQR).astype('int64')

three_countries['popular'].value_counts()

In [26]:
# like_percentage: likes as a percentage of all votes (likes + dislikes)
three_countries['like_percentage'] = (
    three_countries['likes']
    .div(three_countries['likes'].add(three_countries['dislikes']))
    .mul(100)
)
# store publish_time as a datetime column
three_countries["publish_time"] = three_countries["publish_time"].pipe(pd.to_datetime)

In [27]:
# Top 20 channels by mean views, drawn with the largest bar at the top.
# Only the `views` column is averaged: averaging the whole frame is wasted work and
# raises on non-numeric columns in recent pandas releases.
fig, ax = plt.subplots(figsize=(15, 10))
(
    three_countries.groupby('channel_title')['views']
    .mean()
    .sort_values(ascending=False)[:20]
    .sort_values(ascending=True)
    .plot(kind='barh', colormap='winter', fontsize=20, ax=ax)
)
ax.set_xlabel('mean views', fontsize=20)
ax.set_title('Top 20 channels by mean views', fontsize=20);


<br> <br>

**Among popular video, here are top 10 like percentage video. All of them are k-pop stars MV lol** 

 
<table><tr>
<td> <img src="https://i.ytimg.com/vi/PMEkmiQP5bg/default.jpg" alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/v9ea5VDQfXg/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/c5_LROaHGtw/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/3-FXW0CW_8o/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/-7tSTUR7FG0/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/J41qe-TM1DY/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/nQySbNGu4g0/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/Q48VduIflPk/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/floMqK_yHf8/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/VM-g_bkFdzo/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
</tr></table>



In [28]:
# The 10 popular videos with the highest like percentage (all of them are k-pop MVs lol)

(
    three_countries[three_countries['popular'] == 1]
    .sort_values(by='like_percentage', ascending=False)
    .iloc[:10]
)

<br> <br>

<center>**NO.1 like percentage video among popular videos in the dataset! Congrat to Bangtan :)) A decent song to listen while coding 👩‍💻**</center> <br>
<center><iframe width="560" height="315" src="https://www.youtube.com/embed/PMEkmiQP5bg" frameborder="0" allowfullscreen></iframe></center>



<br> <br>

**Among popular video, here are top 10 dislike percentage video** 


<table><tr>
<td> <img src="https://i.ytimg.com/vi/V5cOvyDpWfM/default.jpg" alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/qu-biRtYEcU/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/Aqx41JrNTSw/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/C-rumHvmqCA/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/ivYp5NMaUY4/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/xZZyckBhCmY/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/zhUmo88gzwg/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/CMA2iF6RuXk/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/wJJqGh2HLM8/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
<td> <img src="https://i.ytimg.com/vi/eT9eWtb7C4c/default.jpg"  alt="Drawing" style="width: 700px;"/> </td>
</tr></table>


In [29]:
# The 10 popular videos with the lowest like percentage, i.e. the most disliked ones

(
    three_countries[three_countries['popular'] == 1]
    .sort_values(by='like_percentage', ascending=True)
    .iloc[:10]
)

<br><br>

<center>**No.1 dislike percentage video among popular videos in the dataset..No offense Fergie fans out there 🙃**</center> <br>
<center><iframe width="560" height="315" src="https://www.youtube.com/embed/V5cOvyDpWfM" frameborder="0" allowfullscreen></iframe></center>





# **Closer look on Titles of Popular Videos**

[spaCy](https://spacy.io/usage/spacy-101) is a free, open-source library for advanced Natural Language Processing (NLP) in Python. 

> "Tokens" are usually individual words (at least in languages like English) and "tokenization" is taking a text or set of text and breaking it up into individual its words. These tokens are then used as the input for other types of analysis or tasks, like parsing (automatically tagging the syntactic relationship between words). We need to tokenize word so that we can use it for our title generating model/other cool analysis ([source](https://www.kaggle.com/rtatman/tokenization-tutorial))

In [30]:
# Minimal tokenization demo: print each token of one sentence on its own line

nlp = spacy.load("en_core_web_sm")
doc = nlp(u"An apple is not a banana")
print("\n".join(token.text for token in doc))


<table>
<tr><td>0 </td><td> 1 </td> <td>2 </td><td>3 </td><td>4 </td> <td>5 </td></tr>
<tr><td>An </td> <td>  apple</td><td>  is</td><td>not </td> <td>  a</td><td> banana</td></tr>
</table>

In [31]:
# Shared resources read by spacy_tokenizer
parser = English()                  # spaCy English pipeline used to split titles
stopwords = [*STOP_WORDS]           # spaCy's English stop words, as a list
punctuations = string.punctuation   # a single str, so `x in punctuations` is a substring test

def spacy_tokenizer(sentence):
    """Normalise a sentence into a space-joined string of filtered tokens.

    Each token from the module-level ``parser`` is replaced by its lower-cased,
    stripped lemma, except tokens whose lemma is ``"-PRON-"``, which keep their
    lower-cased surface form. Tokens found in ``stopwords`` or ``punctuations``
    are then dropped. Because ``punctuations`` is a string, that check is a
    substring test, which also removes empty tokens.
    """
    normalized = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            normalized.append(token.lower_)
        else:
            normalized.append(token.lemma_.lower().strip())
    kept = [tok for tok in normalized if not (tok in stopwords or tok in punctuations)]
    return " ".join(kept)


# Register progress_apply on pandas objects
tqdm.pandas()

# Normalised titles, split by the popularity flag (0 = normal, 1 = popular)
normal = three_countries.loc[three_countries["popular"] == 0, "title"].progress_apply(spacy_tokenizer)
popular = three_countries.loc[three_countries["popular"] == 1, "title"].progress_apply(spacy_tokenizer)

In [32]:
#tokenize words by popularity 

def word_generator(text):
    """Return the whitespace-separated words of ``text`` as a list."""
    return text.split()
def bigram_generator(text):
    """Return every pair of adjacent words in ``text`` as a space-joined string."""
    tokens = text.split()
    return [' '.join(pair) for pair in nltk.bigrams(tokens)]
def trigram_generator(text):
    """Return every run of three adjacent words in ``text`` as a space-joined string."""
    tokens = text.split()
    return [' '.join(triple) for triple in nltk.trigrams(tokens)]


# Per-title word, bigram and trigram lists, computed for the normal titles first and
# the popular titles second at each n-gram size.
normal_words, popular_words = (
    titles.progress_apply(word_generator) for titles in (normal, popular)
)
normal_bigrams, popular_bigrams = (
    titles.progress_apply(bigram_generator) for titles in (normal, popular)
)
normal_trigrams, popular_trigrams = (
    titles.progress_apply(trigram_generator) for titles in (normal, popular)
)

In [33]:
#function that makes a pretty word frequency plot

def word_plot(words,my_color):
    """Draw a horizontal bar chart of the 20 most frequent items.

    ``words`` is an iterable of lists (one list of words or n-grams per title).
    The lists are flattened and counted, and the 20 largest counts are plotted
    in ``my_color`` with the most frequent item at the top.
    """
    flattened = [item for items in words for item in items]
    fig = plt.figure(figsize=(15, 10))
    top_counts = pd.Series(flattened).value_counts()[:20].sort_values(ascending=True)
    top_counts.plot(kind='barh', fontsize=20, color=my_color)
    plt.show()


In [34]:
word_plot(popular_words,'blue')


In [35]:
word_plot(popular_bigrams,'orange')


In [36]:
word_plot(popular_trigrams,'red')

#### For example, the word "banana" appears in all documents, so its idf is the lowest

### TfidfVectorizer

Tf-idf analyzes the impact of tokens (words) throughout the whole documents. For example, the more times a word appears in a document (each title), the more weight it will have. However, the more documents (titles) the word appears in, it is 'penalized' and the weight is diminished because it is empirically less informative than features that occur in a small fraction of the training corpus ([source](https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments))

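* $\mathrm{tf}(t, d)$ = the term frequency: the number of times the term $t$ appears in the document $d$
* $\mathrm{df}(t)$ = the document frequency: the number of documents that contain the term $t$
* $\mathrm{idf}(t) = \ln\left(\frac{n}{\mathrm{df}(t)}\right) + 1$ = the inverse document frequency, where $n$ is the total number of documents (this is scikit-learn's formula when `smooth_idf=False`); the tf-idf weight is $\mathrm{tf}(t, d) \times \mathrm{idf}(t)$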

In [37]:
# Toy corpus of three "documents"; 'banana' occurs in every one of them.
txt1 = ['I like banana', 'An apple is not a banana', 'banana banana oh banana']
# Raw tf-idf: IDF smoothing, sublinear tf scaling and row normalisation are all disabled.
tf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
# The result of fit() is kept as txt_fitted; later cells read tf.vocabulary_, tf.idf_
# and txt_fitted's feature names.
txt_fitted = tf.fit(txt1)
txt_transformed = txt_fitted.transform(txt1)
print ("The text: ", txt1)

In [38]:
tf.vocabulary_

In [39]:
idf = tf.idf_
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when the
# installed version has it, and fall back to the old name otherwise.
if hasattr(txt_fitted, "get_feature_names_out"):
    feature_names = txt_fitted.get_feature_names_out()
else:
    feature_names = txt_fitted.get_feature_names()
# Pair each vocabulary term with its idf weight
print(dict(zip(feature_names, idf)))
print("\nThe token 'banana' appears 5 times but it is also in all documents, so its idf is the lowest")

In [40]:
# Unigram TF-IDF features over every title: accents stripped, English stop words
# removed, sublinear tf scaling, vocabulary capped at 10000 terms.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 1),
    sublinear_tf=True,
    max_features=10000,
)
# Learn the vocabulary and build the feature matrix in one step
word_features = word_vectorizer.fit_transform(three_countries.title)

# Logistic regression on the popular flag, fitted on the full table (no hold-out split)
classifier_popular = LogisticRegression(solver='sag', C=0.1)
classifier_popular.fit(word_features, three_countries.popular)


In [41]:
names=['normal','popular']

In [42]:
# Chain the fitted vectorizer and classifier so c_tf.predict_proba accepts raw title strings.
c_tf = make_pipeline( word_vectorizer,classifier_popular)
# class_names follows the label order: index 0 -> 'normal', index 1 -> 'popular'
explainer_tf = LimeTextExplainer(class_names=names)

# Explain the prediction for the title at row position 10, requesting 4 features for the
# single top label.
exp = explainer_tf.explain_instance(three_countries.title.iloc[10], c_tf.predict_proba, num_features=4, top_labels=1)
# NOTE(review): `text` receives the title string itself here; presumably any truthy value
# just switches on the highlighted-text panel. Confirm against the LIME docs.
exp.show_in_notebook(text=three_countries.title.iloc[10])


In [43]:
# Same explanation for the title at row position 4, this time requesting 5 features.
# Relies on explainer_tf and c_tf created in an earlier cell.
exp = explainer_tf.explain_instance(three_countries.title.iloc[4], c_tf.predict_proba, num_features=5, top_labels=1)
exp.show_in_notebook(text=three_countries.title.iloc[4])

In [44]:
# Same explanation for the title at row position 10035 (5 features). This is a hard-coded
# position, so it raises IndexError if the table has 10035 rows or fewer.
# Relies on explainer_tf and c_tf created in an earlier cell.
exp = explainer_tf.explain_instance(three_countries.title.iloc[10035], c_tf.predict_proba, num_features=5, top_labels=1)
exp.show_in_notebook(text=three_countries.title.iloc[10035])

In [45]:
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls


# Count the videos per category once; use the names as pie labels and the counts as slice sizes
category_counts = three_countries.category_names.value_counts()
labels = list(category_counts.index.values)
values = list(category_counts.values)

trace = go.Pie(values=values, labels=labels)

iplot([trace], filename='basic_pie_chart')

In [46]:
three_countries.groupby('category_names')['views'].describe()

## Latent Dirichlet Allocation (LDA) by Category

In [47]:
# Titles of the videos in each of six categories, selected by category name
entertainment_title = three_countries.loc[three_countries['category_names'] == 'Entertainment', "title"]
news_politics_title = three_countries.loc[three_countries['category_names'] == 'News & Politics', "title"]
people_title = three_countries.loc[three_countries['category_names'] == 'People & Blogs', "title"]
music_title = three_countries.loc[three_countries['category_names'] == 'Music', "title"]
sports_title = three_countries.loc[three_countries['category_names'] == 'Sports', "title"]
comedy_title = three_countries.loc[three_countries['category_names'] == 'Comedy', "title"]

In [48]:
# Bag-of-words counts for the Entertainment titles. Tokens are runs of 3+ letters or
# hyphens, kept only if they occur in at least 5 titles and in at most 90% of them.
# The pattern is a raw string so that `\-` reaches the regex engine verbatim instead of
# being parsed as an invalid Python string escape (same regex, no warning).
vectorizer_entertainment_title = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
entertainment_title_vectorized = vectorizer_entertainment_title.fit_transform(entertainment_title)
# 7-topic LDA with the online learning method, capped at 5 iterations
lda_popular_entertainment_title_vectorized = LatentDirichletAllocation(n_components=7, max_iter=5, learning_method='online',verbose=True)
entertainment_title_vectorized_lda = lda_popular_entertainment_title_vectorized.fit_transform(entertainment_title_vectorized )

# Interactive topic browser built from the fitted model, the count matrix and the vectorizer
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_popular_entertainment_title_vectorized,entertainment_title_vectorized, vectorizer_entertainment_title, mds='tsne')
dash

In [49]:
# Bag-of-words counts for the News & Politics titles. Tokens are runs of 3+ letters or
# hyphens, kept only if they occur in at least 5 titles and in at most 90% of them.
# The pattern is a raw string so that `\-` is not parsed as an invalid Python string escape.
vectorizer_news_politics_title = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
news_politics_title_vectorized = vectorizer_news_politics_title.fit_transform(news_politics_title)
# 7-topic LDA with the online learning method, capped at 5 iterations
lda_news_politics_title_vectorized= LatentDirichletAllocation(n_components=7, max_iter=5, learning_method='online',verbose=True)
news_politics_title_vectorized_lda = lda_news_politics_title_vectorized.fit_transform(news_politics_title_vectorized )

# Interactive topic browser built from the fitted model, the count matrix and the vectorizer
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_news_politics_title_vectorized,news_politics_title_vectorized, vectorizer_news_politics_title , mds='tsne')
dash

In [50]:
# Bag-of-words counts for the People & Blogs titles. Tokens are runs of 3+ letters or
# hyphens, kept only if they occur in at least 5 titles and in at most 90% of them.
# The pattern is a raw string so that `\-` is not parsed as an invalid Python string escape.
vectorizer_people_title = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
people_title_vectorized = vectorizer_people_title.fit_transform(people_title)
# 7-topic LDA with the online learning method, capped at 5 iterations
lda_people_title_vectorized= LatentDirichletAllocation(n_components=7, max_iter=5, learning_method='online',verbose=True)
people_title_vectorized_lda = lda_people_title_vectorized.fit_transform(people_title_vectorized )

# Interactive topic browser built from the fitted model, the count matrix and the vectorizer
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_people_title_vectorized,people_title_vectorized, vectorizer_people_title , mds='tsne')
dash

In [51]:
# Bag-of-words counts for the Music titles. Tokens are runs of 3+ letters or hyphens,
# kept only if they occur in at least 5 titles and in at most 90% of them.
# The pattern is a raw string so that `\-` is not parsed as an invalid Python string escape.
vectorizer_music_title = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
music_title_vectorized = vectorizer_music_title.fit_transform(music_title)
# 7-topic LDA with the online learning method, capped at 5 iterations
lda_music_title_vectorized= LatentDirichletAllocation(n_components=7, max_iter=5, learning_method='online',verbose=True)
music_title_vectorized_lda = lda_music_title_vectorized.fit_transform(music_title_vectorized )

# Interactive topic browser built from the fitted model, the count matrix and the vectorizer
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_music_title_vectorized,music_title_vectorized, vectorizer_music_title , mds='tsne')
dash

In [52]:
# Bag-of-words counts for the Sports titles. Tokens are runs of 3+ letters or hyphens,
# kept only if they occur in at least 5 titles and in at most 90% of them.
# The pattern is a raw string so that `\-` is not parsed as an invalid Python string escape.
vectorizer_sports_title = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
sports_title_vectorized = vectorizer_sports_title.fit_transform(sports_title)
# 7-topic LDA with the online learning method, capped at 5 iterations
lda_sports_title_vectorized= LatentDirichletAllocation(n_components=7, max_iter=5, learning_method='online',verbose=True)
sports_title_vectorized_lda = lda_sports_title_vectorized.fit_transform(sports_title_vectorized )

# Interactive topic browser built from the fitted model, the count matrix and the vectorizer
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_sports_title_vectorized,sports_title_vectorized, vectorizer_sports_title , mds='tsne')
dash

In [53]:
# Bag-of-words counts for the Comedy titles. Tokens are runs of 3+ letters or hyphens,
# kept only if they occur in at least 5 titles and in at most 90% of them.
# The pattern is a raw string so that `\-` is not parsed as an invalid Python string escape.
vectorizer_comedy_title = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
comedy_title_vectorized = vectorizer_comedy_title.fit_transform(comedy_title)
# 7-topic LDA with the online learning method, capped at 5 iterations
lda_comedy_title_vectorized= LatentDirichletAllocation(n_components=7, max_iter=5, learning_method='online',verbose=True)
comedy_title_vectorized_lda = lda_comedy_title_vectorized.fit_transform(comedy_title_vectorized )

# Interactive topic browser built from the fitted model, the count matrix and the vectorizer
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda_comedy_title_vectorized,comedy_title_vectorized, vectorizer_comedy_title , mds='tsne')
dash

<br><br>

# Generating titles by lstm

> Language modelling is the core problem for a number of natural language processing tasks such as speech to text, conversational systems, and text summarization. A trained language model learns the likelihood of occurrence of a word based on the previous sequence of words used in the text. Language models can be operated at character level, n-gram level, sentence level or even paragraph level. In this notebook, I will explain how to create a language model for generating natural language text by implementing and training a state-of-the-art Recurrent Neural Network. ([source](https://medium.com/phrasee/neural-text-generation-generating-text-using-conditional-language-models-a37b69c7cd4b))
<br> <br>
Language modelling requires sequential input data: given a sequence of words/tokens, the aim is to predict the next word.  



![](http://www.shivambansal.com/blog/text-lstm/2.png)

Unlike Feed-forward neural networks in which activation outputs are propagated only in one direction, the activation outputs from neurons propagate in both directions (from inputs to outputs and from outputs to inputs) in Recurrent Neural Networks. This creates loops in the neural network architecture which acts as a ‘memory state’ of the neurons. This state allows the neurons an ability to remember what have been learned so far.

The memory state in RNNs gives an advantage over traditional neural networks but a problem called Vanishing Gradient is associated with them. In this problem, while learning with a large number of layers, it becomes really hard for the network to learn and tune the parameters of the earlier layers. To address this problem, A new type of RNNs called LSTMs (Long Short Term Memory) Models have been developed.

LSTMs have an additional state called the ‘cell state’, through which the network makes adjustments in the information flow. The advantage of this state is that the model can remember or forget what it has learned more selectively. Let's build an LSTM model in our code. The model has an input (embedding) layer, an LSTM layer with an optional dropout layer, and an output layer.

* Input Layer: takes the sequence of words as input
* LSTM Layer: computes the output using LSTM units. I have added 100 units in the layer, but this number can be fine-tuned later.
* Dropout Layer: a regularisation layer which randomly turns off the activations of some neurons in the LSTM layer. It helps in preventing overfitting. (Optional layer)
* Output Layer: computes the probability of each possible next word as output

The training cell below runs this model for 5 epochs, but this can be experimented with further.

[source](http://www.shivambansal.com/blog/text-lstm/2.png)

In [54]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Turn a corpus of texts into n-gram prefix sequences of token ids.

    Fits the module-level ``tokenizer`` on ``corpus``. For every line it then
    emits each prefix of the line's token-id list that has at least two tokens
    (the first two ids, the first three, and so on up to the whole line).

    Returns ``(input_sequences, total_words)``, where ``total_words`` is the
    size of the tokenizer's ``word_index`` plus one.
    """
    # tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # every prefix of length >= 2 becomes one training sequence
    input_sequences = []
    for line in corpus:
        encoded = tokenizer.texts_to_sequences([line])[0]
        input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(popular)
inp_sequences[:10]

In [55]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Left-pad the sequences to equal length and split them into inputs and labels.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-id sequences. The last id of each one is the label to predict.
    num_classes : int, optional
        Width of the one-hot label encoding. When omitted, the module-level
        ``total_words`` is used, which is what this function always did before
        the parameter existed.

    Returns
    -------
    predictors : array
        All but the last column of the padded sequences.
    label : array
        One-hot encoding of the last column.
    max_sequence_len : int
        Length of the longest input sequence (the padded width).

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Backward-compatible default: fall back to the notebook-level vocabulary size
        num_classes = total_words

    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    # padding='pre' puts the zeros at the front, so the label stays in the last column
    padded = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [56]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    The network is a 10-dimensional embedding over ``total_words`` ids with
    input length ``max_sequence_len - 1``, a 100-unit LSTM, 10% dropout, and a
    softmax layer over ``total_words``. It is compiled with categorical
    cross-entropy loss and the Adam optimizer.
    """
    # Inputs are one token shorter than the padded sequences: the last token is the label
    input_len = max_sequence_len - 1

    model = Sequential([
        Embedding(total_words, 10, input_length=input_len),   # input embedding layer
        LSTM(100),                                            # hidden LSTM layer
        Dropout(0.1),
        Dense(total_words, activation='softmax'),             # output layer
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy')

    return model

model = create_model(max_sequence_len, total_words)
model.summary()

In [57]:
# verbose=2 logs one line per epoch. The previous value (5) is outside the documented
# 0 / 1 / 2 settings.
model.fit(predictors, label, epochs=5, verbose=2)


In [58]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by ``next_words`` predicted words and title-case the result.

    Parameters
    ----------
    seed_text : str
        Starting text. It is tokenized with the module-level ``tokenizer``.
    next_words : int
        Number of words to append.
    model : keras model
        Model whose ``predict`` returns one score per vocabulary index.
    max_sequence_len : int
        Padded sequence length used in training; inputs are padded to
        ``max_sequence_len - 1``.

    Returns
    -------
    str
        The seed text followed by the generated words, passed through ``str.title()``.
    """
    # Invert the vocabulary once instead of scanning word_index for every generated word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # The argmax of the predicted scores is what predict_classes returned. That helper
        # no longer exists in recent Keras releases, while predict() exists in all of them.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index with no vocabulary entry contributes an empty word, as before
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [59]:
# Generate one title per seed: (seed text, number of words to append)
for seed_text, next_words in [
    ("Drake", 5),
    ("united states", 5),
    ("Bangtan", 4),
    ("Fergie", 4),
    ("korea", 4),
    ("Minnesota", 4),
]:
    print(generate_text(seed_text, next_words, model, max_sequence_len))