### This notebook contains the following operations
* Load the US Video data with Captions
* Create iterators for train, validation and test datasets
* Run the analysis with neural network models (including simple RNN, LSTM, CNN, and simple Linear NN)

In [1]:
import data_input as data_in
import nnmodels as nnm
import mlmodels as mlm
import bow_models as bowm
import train_eval
import visualization as vis
from torchtext import data
import torch
import numpy as np
import pandas as pd
import io
import csv
import sys
from torchtext import data

### Import the Captions Data and Create Iterators for the Train, Validation and Test Datasets

In [2]:
# --- Data locations: edit both to point at your local copy of the captions ---
# Folder that receives train.csv / valid.csv / test.csv and is later handed to TabularDataset.splits.
path = r'D:\Researching Data\Youtube data\caption_sector'
# The captions file that is opened and parsed below (a file path, despite the name).
data_dir = r'D:\Researching Data\Youtube data\caption_sector\transcripts.txt'

# --- Experiment settings ---
BATCH_SIZE = 64                        # forwarded to data_in.build_iterator
MAX_VOCAB_SIZE = 25_000                # max_size used when building the TEXT vocabulary
TRAIN_VALID_TEST_R = (0.4, 0.4, 0.2)   # split ratios handed to data_in.split_train_test

In [3]:
# A single caption line can be very long, so lift the csv module's per-field size cap first.
csv.field_size_limit(1000000000)

# Read the captions file one record per line; any fields the csv reader yields for a
# record are glued back together with ', '.
with open(data_dir, "r", encoding="utf-8") as caption_file:
    txt_list = [', '.join(fields) for fields in csv.reader(caption_file, delimiter='\n')]

# The first 11 characters of each record are used as the video id; the caption text
# starts at character 12 (character 11, the separator, is dropped).
video_id = [record[:11] for record in txt_list]
txt_content = [record[12:] for record in txt_list]

In [4]:
# Load the US video metadata table.
fdata = pd.read_csv(r'D:\Researching Data\Youtube data\USvideos.csv') # should specify the directory for US video data
# Keep one row per video_id (its first occurrence) and only the two columns needed for labelling.
# `keep` is passed by keyword: pandas 2.0+ accepts only `subset` positionally and raises
# a TypeError on drop_duplicates("video_id", "first").
new_arr = fdata.drop_duplicates(subset="video_id", keep="first")[["video_id", "category_id"]]

In [5]:
new_TEXT = txt_content
new_id = video_id

# Seed the random number generators that are in scope. Setting cudnn.deterministic alone
# only selects deterministic cuDNN kernels; without a fixed seed, weight initialisation
# (and any NumPy-based shuffling) still differs from run to run.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT tokenises the caption text with spaCy; LABEL holds the binary category label as a float.
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [6]:
# Pair every caption with its video id so it can be joined to the category table.
new_idtxt = pd.DataFrame({"video_id": new_id, "text": new_TEXT})

# Row positions for the three splits, sized by the number of caption records.
# NOTE(review): these positions are later applied with .iloc to the merged frame (new_pd),
# so this presumably relies on the merge keeping exactly one row per caption - confirm.
train_indices, valid_indices, test_indices = data_in.split_train_test(
    len(new_TEXT),
    TRAIN_VALID_TEST_R,
)

The size of train, valid and test data are 1642 1642 822


In [7]:
# The category treated as the positive class (25 = Politics in this notebook).
TARGET_CATEGORY_ID = 25 # The category label can be replaced here

# Join captions to their category, keep only the model inputs, and binarise the label:
# 1 for the target category, 0 for every other category. Using a single constant avoids
# having to edit two separate lines in lockstep when switching categories.
new_pd = (
    pd.merge(new_arr, new_idtxt, left_on="video_id", right_on="video_id")
    .loc[:, ["text", "category_id"]]
    .assign(category_id=lambda frame: (frame["category_id"] == TARGET_CATEGORY_ID).astype("int64"))
)

# The train/valid/test row positions were computed from len(new_TEXT) and are applied to
# new_pd by position, so the two lengths must agree or rows are missed / out of range.
assert len(new_pd) == len(new_TEXT), (
    "merged frame has %d rows but the split was sized for %d captions"
    % (len(new_pd), len(new_TEXT))
)

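The baseline precision (the share of videos that belong to the target category, i.e. the precision obtained by always predicting the positive class) is: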

In [8]:
# Fraction of rows labelled 1: count of positive rows divided by the total row count.
int((new_pd["category_id"] == 1).sum()) / len(new_pd)

0.07355090112031173

In [9]:
# Write each split into `path` as a CSV without a header row and without the index column.
for csv_suffix, row_positions in (
    ("\\train.csv", train_indices),
    ("\\valid.csv", valid_indices),
    ("\\test.csv", test_indices),
):
    new_pd.iloc[row_positions].to_csv(path + csv_suffix, header=None, index=None)

In [10]:
# Column order in the CSV files: caption text first, binary label second.
fields = [("text", TEXT), ("label", LABEL)]
# train.csv / valid.csv / test.csv are written with header=None, so their first line is
# already a data row. skip_header must therefore be False; with True the first example
# of every split was silently discarded.
train_data, valid_data, test_data = data.TabularDataset.splits(
                                            path = path,
                                            train = 'train.csv',
                                            validation = 'valid.csv',
                                            test = 'test.csv',
                                            format = 'csv',
                                            fields = fields,
                                            skip_header = False)

In [11]:
# Both vocabularies are built from the training split only; the text vocabulary is capped
# at MAX_VOCAB_SIZE entries.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One iterator per split, all created by the project helper with the same batch size and device.
train_iterator, valid_iterator, test_iterator = data_in.build_iterator(
    BATCH_SIZE,
    device,
    train_data,
    valid_data,
    test_data,
)

### Testing Neural Network Models for Captions Data 

In [12]:
# --- Sizes derived from the vocabulary ---
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# --- Shared hyperparameters ---
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = 1

# --- Used by the LSTM and/or CNN constructors below ---
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
N_FILTERS = 100
FILTER_SIZES = [3,4,5]

# The four candidate models, keyed by the name passed on to train_eval.compare_models.
# They are constructed in the same order as before: averaged embeddings, RNN, LSTM, CNN.
MODEL_DICT = {
    "avg_embedding": nnm.WordEmbAvg_2linear(
        INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX
    ),
    "SimpleRNN": nnm.SimpleRNN(
        INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX
    ),
    "BLSTM": nnm.LSTM(
        INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
        N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX
    ),
    "CNN": nnm.CNN(
        INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES,
        OUTPUT_DIM, DROPOUT, PAD_IDX
    ),
}

# Keep the individual model names available as well.
model_wordem = MODEL_DICT["avg_embedding"]
model_rnn = MODEL_DICT["SimpleRNN"]
model_BLSTM = MODEL_DICT["BLSTM"]
model_CNN = MODEL_DICT["CNN"]

#### The result for Politics Category (category id = 25)

In [13]:
# Hand every model in MODEL_DICT, plus the three iterators, to the project helper that
# produces the per-model training log shown below.
best_models, models_perf = train_eval.compare_models(
    MODEL_DICT,
    device,
    train_iterator,
    valid_iterator,
    test_iterator,
)

currently training the model:  avg_embedding
Epoch 0: Dev Accuracy: 0.9316076452915485 Dev Loss:0.25120626791165424
Epoch 1: Dev Accuracy: 0.9427620768547058 Dev Loss:0.18278173414560464
Epoch 2: Dev Accuracy: 0.9440372884273529 Dev Loss:0.16400804112736994
Epoch 3: Dev Accuracy: 0.9533155491718879 Dev Loss:0.18150587990665093
Epoch 4: Dev Accuracy: 0.9499736153162442 Dev Loss:0.26239300222816664
Test Loss: 0.159 | Test Acc: 95.98%
Test Prec: nan% | Test Rec: 47.436%
currently training the model:  SimpleRNN
Epoch 0: Dev Accuracy: 0.9298047606761639 Dev Loss:0.2563474467740609
Epoch 1: Dev Accuracy: 0.9298047606761639 Dev Loss:0.25429064809129787
Epoch 2: Dev Accuracy: 0.9298047606761639 Dev Loss:0.2562699747773317
Epoch 3: Dev Accuracy: 0.9298047606761639 Dev Loss:0.26208389851336295
Epoch 4: Dev Accuracy: 0.9298047606761639 Dev Loss:0.2547185905277729
Test Loss: 0.249 | Test Acc: 93.34%
Test Prec: nan% | Test Rec: 0.000%
currently training the model:  BLSTM
Epoch 0: Dev Accuracy: 0.92

#### The result for Entertainment Category (category id = 24)

In [13]:
# Same comparison as for the Politics category.
# NOTE(review): presumably re-run after switching the category label to 24 in the
# labelling cell and rebuilding the datasets - confirm.
best_models, models_perf = train_eval.compare_models(
    MODEL_DICT,
    device,
    train_iterator,
    valid_iterator,
    test_iterator,
)

currently training the model:  avg_embedding
Epoch 0: Dev Accuracy: 0.7518615149534665 Dev Loss:0.5224246829748154
Epoch 1: Dev Accuracy: 0.7672226784320978 Dev Loss:0.482451863013781
Epoch 2: Dev Accuracy: 0.7877579744045551 Dev Loss:0.4769950635158099
Epoch 3: Dev Accuracy: 0.7789634145223178 Dev Loss:0.5109034180641174
Epoch 4: Dev Accuracy: 0.7662699337189014 Dev Loss:0.6197512837556692
Test Loss: 0.514 | Test Acc: 76.87%
Test Prec: 61.598% | Test Rec: 37.170%
currently training the model:  SimpleRNN
Epoch 0: Dev Accuracy: 0.7209486399705594 Dev Loss:0.6013073577330663
Epoch 1: Dev Accuracy: 0.7209486399705594 Dev Loss:0.6202571392059326
Epoch 2: Dev Accuracy: 0.7239534476628671 Dev Loss:0.5893129373972232
Epoch 3: Dev Accuracy: 0.7239534476628671 Dev Loss:0.5895780806358044
Epoch 4: Dev Accuracy: 0.7245544092013285 Dev Loss:0.5946709009317251
Test Loss: 0.596 | Test Acc: 72.34%
Test Prec: nan% | Test Rec: 0.000%
currently training the model:  BLSTM
Epoch 0: Dev Accuracy: 0.7233524