In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from functionized_code.data_pipeline import DEFAULT_KWARGS, get_labels_and_corpus, __make_vocab, __make_index
from run_pipeline import LABEL_TO_IDX, get_data, get_column_indices

# Reverse lookup built by inverting LABEL_TO_IDX: index -> label.
IDX_TO_LABEL = {index: label for label, index in LABEL_TO_IDX.items()}

In [2]:
# Column holding the labels, and the text columns that make up the corpus.
col_labels = 'category_id'
col_corpus = ['title', 'tags', 'description', 'caption']

# Vocabulary settings; these are handed to compare_vocab as its
# k (MAX_VOCAB), n (NGRAMS) and i (IDX_TARGET) arguments.
MAX_VOCAB = 25000
NGRAMS = 1
IDX_TARGET = LABEL_TO_IDX['News & Politics']

In [3]:
# Write a function to compare vocabularies.
def _vocab_set(documents, n, k, vocab_kwargs):
    """Return the indexed vocabulary of `documents` as a set.

    Runs the two pipeline helpers in sequence: `__make_vocab(documents, n,
    **vocab_kwargs)` followed by `__make_index(vocab, k)`. The keyword
    arguments are taken as a plain dict so they cannot collide with this
    helper's own parameter names.
    """
    vocab = __make_vocab(documents, n, **vocab_kwargs)
    return set(__make_index(vocab, k))


def _shared_fraction(n_shared, n_target):
    """Return `n_shared / n_target`, or 0.0 when the target vocabulary is empty."""
    return n_shared / n_target if n_target else 0.0


def compare_vocab(labels, corpus, i, n, k, **kwargs):
    """Print the overlap between one category's vocabulary and all the others.

    The first printed line gives the size of the vocabulary built from the
    documents labelled `i`. One line per other label found in `labels`
    follows, with that category's vocabulary size, the number of entries it
    shares with the target, and that count as a fraction of the target
    vocabulary. A final 'Corpus' line makes the same comparison against all
    documents whose label is not `i`, taken together.

    Parameters
    ----------
    labels, corpus :
        Indexed with boolean masks (`corpus[labels == i]`), so both are
        assumed to be equal-length numpy arrays -- TODO confirm against
        `get_labels_and_corpus`.
    i :
        Label of the target category; must be a key of `IDX_TO_LABEL`.
    n :
        Forwarded to `__make_vocab` (presumably the n-gram size; verify).
    k :
        Forwarded to `__make_index` (presumably the vocabulary cap; verify).
    **kwargs :
        Forwarded unchanged to `__make_vocab`.

    Returns
    -------
    None
        Results are only printed. Returning nothing also keeps Jupyter from
        echoing a value when the call is the last expression of a cell.
    """
    row_format = '%25s: \t%5d words \t%5d (%3.2f) shared'

    target_vocab = _vocab_set(corpus[labels == i], n, k, kwargs)
    target_len = len(target_vocab)
    # Report size of target vocabulary.
    print('%25s: \t%5d words' % (IDX_TO_LABEL[i], target_len))

    def report(name, documents):
        # Build the comparison vocabulary and print its size and overlap.
        other_vocab = _vocab_set(documents, n, k, kwargs)
        n_shared = len(target_vocab & other_vocab)
        # An empty target vocabulary would otherwise raise ZeroDivisionError.
        fraction = _shared_fraction(n_shared, target_len)
        print(row_format % (name, len(other_vocab), n_shared, fraction))

    # Compare with every other category individually (skip self).
    for j in np.unique(labels):
        if j == i:
            continue
        report(IDX_TO_LABEL[j], corpus[labels == j])

    # Compare with all other categories together.
    report('Corpus', corpus[labels != i])

### Vocabulary with captions

In [4]:
# Load the video data that includes captions.
data_w_captions = get_data(captions=True)

# Look up the label/corpus column indices, then extract labels and corpus.
idx_labels, idx_corpus = get_column_indices(
    data_w_captions, col_labels, col_corpus
)
labels_w_captions, corpus_w_captions = get_labels_and_corpus(
    data_w_captions,
    idx_labels,
    idx_corpus,
    silent=True
)

In [5]:
# Count the videos by category.
# The ids are mapped to names on the index itself, and the output columns are
# named explicitly. Since pandas 2.0, `value_counts().reset_index()` yields the
# columns ('category_id', 'count') rather than ('index', 'category_id'), so a
# `.replace({'index': ...})` silently matches no column and leaves raw ids.
(
    data_w_captions['category_id']
    .value_counts()
    .rename(index=IDX_TO_LABEL)
    .rename_axis('index')
    .reset_index(name='category_id')
)

Unnamed: 0,index,category_id
0,Entertainment,1149
1,Howto & Style,515
2,Comedy,444
3,People & Blogs,354
4,News & Politics,302
5,Science & Technology,300
6,Music,235
7,Education,228
8,Film & Animation,212
9,Sports,165


In [6]:
# Report how much of the target category's vocabulary (captions included)
# also appears in each of the other categories.
print('Intersection of vocabularies with captions')
compare_vocab(
    labels_w_captions,
    corpus_w_captions,
    IDX_TARGET,
    NGRAMS,
    MAX_VOCAB,
    **DEFAULT_KWARGS
)

Intersection of vocabularies with captions
          News & Politics: 	22650 words
         Film & Animation: 	18525 words 	 9028 (0.40) shared
         Autos & Vehicles: 	 5363 words 	 3652 (0.16) shared
                    Music: 	12385 words 	 6294 (0.28) shared
           Pets & Animals: 	 6781 words 	 4265 (0.19) shared
                   Sports: 	12707 words 	 6920 (0.31) shared
          Travel & Events: 	 5786 words 	 3506 (0.15) shared
                   Gaming: 	 8685 words 	 5371 (0.24) shared
           People & Blogs: 	23392 words 	10175 (0.45) shared
                   Comedy: 	20800 words 	 9219 (0.41) shared
            Entertainment: 	25002 words 	11432 (0.50) shared
            Howto & Style: 	25002 words 	 9869 (0.44) shared
                Education: 	20491 words 	 9493 (0.42) shared
     Science & Technology: 	21897 words 	 9745 (0.43) shared
    Nonprofits & Activism: 	 3837 words 	 3045 (0.13) shared
                    Shows: 	  950 words 	  744 (0.03) shared
  

### Vocabulary without captions

In [7]:
# Read the video data without captions.
data_wo_captions = get_data(captions=False)
# Get the labels and corpus. The caption column is excluded by name rather
# than with a positional slice, so the selection stays correct if `col_corpus`
# is reordered or extended.
col_corpus_wo_captions = [col for col in col_corpus if col != 'caption']
idx_labels, idx_corpus = get_column_indices(
    data_wo_captions, col_labels, col_corpus_wo_captions)
labels_wo_captions, corpus_wo_captions = get_labels_and_corpus(
    data_wo_captions, idx_labels, idx_corpus, silent=True)

In [8]:
# Count the videos by category.
# The ids are mapped to names on the index itself, and the output columns are
# named explicitly. Since pandas 2.0, `value_counts().reset_index()` yields the
# columns ('category_id', 'count') rather than ('index', 'category_id'), so a
# `.replace({'index': ...})` silently matches no column and leaves raw ids.
(
    data_wo_captions['category_id']
    .value_counts()
    .rename(index=IDX_TO_LABEL)
    .rename_axis('index')
    .reset_index(name='category_id')
)

Unnamed: 0,index,category_id
0,Entertainment,1619
1,Music,799
2,Howto & Style,595
3,Comedy,547
4,News & Politics,505
5,People & Blogs,498
6,Sports,451
7,Science & Technology,380
8,Film & Animation,318
9,Education,250


In [9]:
# Report how much of the target category's vocabulary (captions excluded)
# also appears in each of the other categories.
print('Intersection of vocabularies without captions')
compare_vocab(
    labels_wo_captions,
    corpus_wo_captions,
    IDX_TARGET,
    NGRAMS,
    MAX_VOCAB,
    **DEFAULT_KWARGS
)

Intersection of vocabularies without captions
          News & Politics: 	 8278 words
         Film & Animation: 	 8056 words 	 2636 (0.32) shared
         Autos & Vehicles: 	 2246 words 	 1108 (0.13) shared
                    Music: 	11270 words 	 2821 (0.34) shared
           Pets & Animals: 	 3512 words 	 1540 (0.19) shared
                   Sports: 	 5951 words 	 2253 (0.27) shared
          Travel & Events: 	 2230 words 	 1080 (0.13) shared
                   Gaming: 	 2528 words 	 1126 (0.14) shared
           People & Blogs: 	10187 words 	 3135 (0.38) shared
                   Comedy: 	 8517 words 	 2856 (0.35) shared
            Entertainment: 	20509 words 	 4514 (0.55) shared
            Howto & Style: 	12119 words 	 3054 (0.37) shared
                Education: 	 7853 words 	 2574 (0.31) shared
     Science & Technology: 	 8579 words 	 2856 (0.35) shared
    Nonprofits & Activism: 	  908 words 	  514 (0.06) shared
                    Shows: 	  221 words 	  129 (0.02) shared