### This notebook contains the following operations
* Cosine similarities using caption data (with the political category as the reference)
* Cosine similarities using titles/tags/descriptions (with the political category as the reference)
* Bottom-up clustering (based on cosine similarities to the political category)
* Top 100 words with the highest TF-IDF (treating all the text in a category as a single document)

In [1]:
import csv
import json
import os

import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as FE
import torch
import torch.nn as nn
from torchtext import data

import cosine_similarity as cs
import data_input as data_in
import tokenization_dim_reduction as tdr

### Cosine Similarities Using BOW and TFIDF (with captions data only)

In [2]:
# Root folder of the YouTube dataset. It defaults to the original author's
# location; set the YOUTUBE_DATA_ROOT environment variable to run the notebook
# on another machine without editing every path below.
DATA_ROOT = os.environ.get('YOUTUBE_DATA_ROOT', r'D:\Researching Data\Youtube data')

# All inputs are derived from the single root so they cannot drift apart.
path = os.path.join(DATA_ROOT, 'caption_sector')          # folder holding the captions data
data_dir = os.path.join(path, 'transcripts.txt')          # captions transcript file (a file, despite the name)
f_dir = os.path.join(DATA_ROOT, 'USvideos.csv')           # US videos CSV file
dic_dir = os.path.join(DATA_ROOT, 'US_category_id.json')  # category-id dictionary (JSON file)

# Load the captions together with the US video data.
# NOTE(review): the result is later passed to cs.similarity_bow / cs.similarity_tfidf
# as the document table; presumably a pandas DataFrame -- confirm in data_input.
new_pd = data_in.import_captions(data_dir, f_dir)

#### Keep the 5000 words with the highest average TF-IDF

In [3]:
# Reference category id handed to both helpers (25 is the id this notebook
# uses for the political category) and the number of top-ranked words to keep.
reference_id = 25
top_k = 5000

# Cosine similarity module computed along dimension 0 of its inputs.
cos = nn.CosineSimilarity(dim=0)

# Bag-of-words pass: yields the category dictionary, the category sizes and
# the BOW similarities (the first two are reused by the TF-IDF pass below).
bow_outputs = cs.similarity_bow(dic_dir, new_pd, cos, reference_id, top_k)
cat_dict, cat_size, similarities = bow_outputs

# TF-IDF pass restricted to the rank window (1, top_k).
tfidf_similarities = cs.similarity_tfidf(
    cat_dict, cat_size, new_pd, cos, reference_id, (1, top_k)
)

In [4]:
# Show (category, similarity) pairs, skipping categories whose similarity is
# exactly 0. The generator keeps filtering lazy, so pairs are printed as they
# are examined.
nonzero_pairs = (pair for pair in tfidf_similarities.items() if pair[1] != 0)
for pair in nonzero_pairs:
    print(pair)

('Film & Animation', 0.9234253018578943)
('Autos & Vehicles', 0.8596320332841434)
('Music', 0.805913052784863)
('Pets & Animals', 0.8535342113621643)
('Sports', 0.8917769266441121)
('Travel & Events', 0.8395866581830274)
('Gaming', 0.8933234372714954)
('People & Blogs', 0.9120624030160931)
('Entertainment', 0.9051104056612778)
('News & Politics', 1.0)
('Howto & Style', 0.8683970472377324)
('Education', 0.942261301087541)
('Science & Technology', 0.9155296280589654)
('Nonprofits & Activism', 0.9073148039840575)
('Shows', 0.6845312364989041)


#### Keep the words ranked 200th to 5000th by average TF-IDF

In [5]:
# Same TF-IDF comparison as before, but with the rank window starting at 200
# so the 199 highest-ranked words are left out (see the heading above).
rank_window = (200, 5000)
tfidf_similarities200 = cs.similarity_tfidf(
    cat_dict,
    cat_size,
    new_pd,
    cos,
    25,  # reference category id
    rank_window,
)

In [6]:
# Print every (category, similarity) pair whose similarity is not 0.
for category, score in filter(lambda item: item[1] != 0, tfidf_similarities200.items()):
    print((category, score))

('Film & Animation', 0.6294881350645263)
('Autos & Vehicles', 0.35779364993058804)
('Music', 0.4612295281677221)
('Pets & Animals', 0.3794832124254326)
('Sports', 0.49052620607731984)
('Travel & Events', 0.34756632241114227)
('Gaming', 0.46449314411059356)
('People & Blogs', 0.6537143412613922)
('Entertainment', 0.6764452210997878)
('News & Politics', 1.0)
('Howto & Style', 0.469008127524843)
('Education', 0.6358827292804715)
('Science & Technology', 0.5691234152760964)
('Nonprofits & Activism', 0.4850951616051855)
('Shows', 0.20272080135732284)


### Cosine Similarities Using BOW and TFIDF (with titles/tags/descriptions data)

In [7]:
# US video data file (note: this rebinds data_dir, which earlier pointed at
# the captions transcript).
data_dir = r'D:\Researching Data\Youtube data\USvideos.csv'

# The "full" feature group: element 0 goes to select_col, elements 1 and 2 go
# to combine_text.
f_set = data_in.FEATURE_GROUPS["full"]
_, dtext, dlabel = tdr.select_col(data_dir, f_set[0])
new_TEXT = tdr.combine_text(dtext, f_set[1], f_set[2])

# Turn the combined text and the labels into column vectors and place them
# side by side; new_pd (previously the captions table) now holds this data.
n_rows = len(dlabel)
text_column = new_TEXT.reshape(n_rows, 1)
label_column = dlabel.reshape(n_rows, 1)
stacked = np.concatenate([text_column, label_column], axis=1)
new_pd = pd.DataFrame(stacked, columns=["text", "category_id"])

#### Keep the 5000 words with the highest average TF-IDF

In [8]:
# Cosine similarity module computed along dimension 0.
cos = nn.CosineSimilarity(dim=0)

# Positional arguments shared by the two helper calls.
ref_category = 25       # reference category id used throughout the notebook
keep_words = 5000       # how many top-ranked words are kept

# Bag-of-words similarities; also (re)builds cat_dict and cat_size for the
# titles/tags/descriptions table now stored in new_pd.
cat_dict, cat_size, similarities = cs.similarity_bow(
    dic_dir, new_pd, cos, ref_category, keep_words
)

# TF-IDF similarities over the rank window (1, keep_words).
full_window = (1, keep_words)
tfidf_similarities = cs.similarity_tfidf(cat_dict, cat_size, new_pd, cos, ref_category, full_window)

In [9]:
# Report each category with a non-zero similarity as a (name, value) tuple.
for entry in tfidf_similarities.items():
    _name, value = entry
    if value != 0:
        print(entry)

('Film & Animation', 0.512862920625565)
('Autos & Vehicles', 0.46232427289474015)
('Music', 0.42666892963291714)
('Pets & Animals', 0.39380803210259435)
('Sports', 0.3934525414099961)
('Travel & Events', 0.4675199117742903)
('Gaming', 0.3684639278611043)
('People & Blogs', 0.5503564019532635)
('Entertainment', 0.622015614923866)
('News & Politics', 1.0)
('Howto & Style', 0.4676337916685023)
('Education', 0.512298456462439)
('Science & Technology', 0.5486932782662686)
('Nonprofits & Activism', 0.42036971767562775)
('Shows', 0.19088822157667848)


#### Keep the words ranked 200th to 5000th by average TF-IDF

In [10]:
# TF-IDF similarities again, this time with the rank window (200, 5000).
window_from_200 = (200, 5000)
tfidf_args = (cat_dict, cat_size, new_pd, cos, 25, window_from_200)
tfidf_similarities200 = cs.similarity_tfidf(*tfidf_args)

In [11]:
# Emit the (category, similarity) tuples, leaving out similarities equal to 0.
kept = ((label, sim) for label, sim in tfidf_similarities200.items() if sim != 0)
for record in kept:
    print(record)

('Film & Animation', 0.21705513150498237)
('Autos & Vehicles', 0.15813954243273223)
('Music', 0.16853987657741312)
('Pets & Animals', 0.14948760178409445)
('Sports', 0.1899150826889674)
('Travel & Events', 0.18086203773634008)
('Gaming', 0.08817142464370153)
('People & Blogs', 0.2639444399785001)
('Entertainment', 0.3503764220440238)
('News & Politics', 1.0)
('Howto & Style', 0.18807561251352187)
('Education', 0.23143236289352687)
('Science & Technology', 0.25835242483072896)
('Nonprofits & Activism', 0.18685896529477283)
('Shows', 0.07535072911685735)


### Bottom-Up Clustering (with titles/tags/descriptions data)

In [12]:
# Category id of the reference (political) category.
POL_ID = 25
# Rank window handed to the clustering helper.
TFIDF_RANGE = (1, 5000)

In [13]:
# Run the bottom-up clustering and keep the node it returns.
# NOTE(review): POL_ID is not passed here; presumably bottom_up picks the
# reference category on its own -- confirm in cosine_similarity.
pol_node = cs.bottom_up(
    cat_dict,
    cat_size,
    new_pd,
    TFIDF_RANGE,
)

In [14]:
# Follow the chain of first children starting at pol_node, printing one
# report per level until a node with an empty children list is reached.
current_node = pol_node
while current_node.childrens != []:
    # First child: the group that the walk continues into.
    group_node = current_node.childrens[0]

    print("category index in the group:")
    print(group_node.categories)

    # Second child: its name is reported as the newly joined category. It is
    # looked up only here, after the two prints above, as in the original.
    print("cosine similarity of (quasi) Politics group v.s new category (", current_node.childrens[1].name ,"):")
    print(group_node.best_similarity)
    print(" ")

    current_node = group_node

category index in the group:
[25, 24, 22, 1, 28, 26, 27, 10, 19, 2, 20, 15, 29, 17]
cosine similarity of (quasi) Politics group v.s new category ( Shows ):
0.27486730359495337
 
category index in the group:
[25, 24, 22, 1, 28, 26, 27, 10, 19, 2, 20, 15, 29]
cosine similarity of (quasi) Politics group v.s new category ( Sports ):
0.4877946815956316
 
category index in the group:
[25, 24, 22, 1, 28, 26, 27, 10, 19, 2, 20, 15]
cosine similarity of (quasi) Politics group v.s new category ( Nonprofits & Activism ):
0.49198099068944484
 
category index in the group:
[25, 24, 22, 1, 28, 26, 27, 10, 19, 2, 20]
cosine similarity of (quasi) Politics group v.s new category ( Pets & Animals ):
0.5298349633592451
 
category index in the group:
[25, 24, 22, 1, 28, 26, 27, 10, 19, 2]
cosine similarity of (quasi) Politics group v.s new category ( Gaming ):
0.5336692613005063
 
category index in the group:
[25, 24, 22, 1, 28, 26, 27, 10, 19]
cosine similarity of (quasi) Politics group v.s new category 

### Top 100 words with the highest TF-IDF (all documents in each category are merged into a single document)
#### The resulting TF-IDF matrix has shape (number of categories) * (number of unique words)

In [2]:
# NOTE(review): the execution counter restarts at In [2] here, so this section
# appears to have been run in a separate kernel session; it rebuilds the
# titles/tags/descriptions table from scratch.
data_dir = r'D:\Researching Data\Youtube data\USvideos.csv'  # US video data file

# "full" feature group: item 0 is given to select_col, items 1 and 2 to combine_text.
f_set = data_in.FEATURE_GROUPS["full"]
_, dtext, dlabel = tdr.select_col(data_dir, f_set[0])
new_TEXT = tdr.combine_text(dtext, f_set[1], f_set[2])

# Two single-column arrays (text, label) joined along axis 1 form the table.
column_shape = (len(dlabel), 1)
columns_2d = [new_TEXT.reshape(*column_shape), dlabel.reshape(*column_shape)]
new_pd = pd.DataFrame(
    np.concatenate(columns_2d, axis=1),
    columns=["text", "category_id"],
)

In [3]:
# Number of words requested per category (the section heading says top 100).
words_per_category = 100
topk_dict = cs.get_topk_words(new_pd, words_per_category)

In [4]:
# Display the result as the cell output: a dict keyed by category id whose
# values are arrays of words (see the output below).
topk_dict

{22: array(['week', 'asis', 'carlin', 'motion', 'ncredits', 'use', 'camera',
        'great', 'corden', 'nbuzzfeed', 'hair', 'best', 'provided', 'time',
        'nbuzzfeedvideo', 'nget', 'nwatch', 'beauty', 'nsnapchat', 'style',
        'ijustine', 'family', 'doctor', 'bee', 'riverdale', 'box',
        'nstills', 'awesome', 'nthe', 'nfacebook', 'greatbigstory', 'vs',
        'grace', 'christmas', 'list', 'ytbuzzfeedblue1', 'ytbuzzfeedvideo',
        'ytbuzzfeedviolet', 'foxs', 'tumblr', 'connorfranta', 'life',
        'nvia', 'frontal', 'story', 'audio', 'shareable', 'vlog', 'curid',
        '2017', 'cbs', '2018', 'world', 'madelaine', 'nmusic', 'vlogs',
        'nmy', 'nerdfighteria', 'like', 'boldly', 'nsfx', 'love', 'make',
        'buzzfeedviolet', 'daily', 'ninstagram', 'nextbeat', 'food',
        'perolike', 'helbig', 'try', 'late', 'refinery29', 'day',
        'ntwitter', 'gracehelbig', 'funny', 'ladylike', 'bfmp',
        'buzzfeedblue', 'fashion', 'makeup', 'nhttp', 'channel',