### This notebook contains the following operations
* Load the US Video data with Captions
* Create iterators for train, validation and test datasets
* Run the analysis with neural network models (including GRU and Linear NN)
* Run the analysis with Machine Learning Models (logistic regression and random forest) with reduced TFIDF matrix

In [1]:
import data_input as data_in
import nnmodels as nnm
import mlmodels as mlm
import bow_models as bowm
import train_eval
import visualization as vis
from torchtext import data
import torch
import numpy as np
import pandas as pd
import io
import csv
import sys
from torchtext import data

Training the RNN models on captions requires at least 15 GB of memory (on either the CPU or the GPU).

### Import the Captions Data and Create Iterators for the Train, Validation and Test Datasets

In [2]:
# Location of the captions data; adjust both values to your local copy.
path = r'D:\Researching Data\Youtube data\caption_sector'  # folder that holds the captions data
data_dir = r'D:\Researching Data\Youtube data\caption_sector\transcripts.txt'  # transcripts file to read

# Fractions of the data assigned to the train / validation / test splits.
TRAIN_VALID_TEST_R = (0.4, 0.4, 0.2)

# Upper bound handed to the vocabulary builder.
MAX_VOCAB_SIZE = 25000

# A batch size of 64 works for LSTM and GRU but occasionally exhausts memory;
# because the datasets are split randomly, 32 is the more stable choice.
BATCH_SIZE = 32

In [3]:
# Transcripts can be very long, so raise the csv module's per-field size limit.
csv.field_size_limit(1000000000)

# Read the transcripts file record by record; the fields of each record are
# re-joined with ', ' into a single string.
with open(data_dir, "r", encoding="utf-8") as f:
    txt_list = [', '.join(record) for record in csv.reader(f, delimiter='\n')]

# Each record is sliced as: characters 0-10 -> video id, character 11 skipped
# (presumably a separator - TODO confirm against the file), remainder -> caption text.
video_id = [record_text[:11] for record_text in txt_list]
txt_content = [record_text[12:] for record_text in txt_list]

In [4]:
# Load the US video metadata (adjust the path to your local copy).
fdata = pd.read_csv(r'D:\Researching Data\Youtube data\USvideos.csv')

# Keep one (video_id, category_id) row per video: the first occurrence wins.
# `keep` is keyword-only in pandas >= 2.0, so both arguments are passed by name.
new_arr = fdata.drop_duplicates(subset="video_id", keep="first")[["video_id", "category_id"]]

In [5]:
# Aliases used by the rest of the notebook for the caption texts and their ids.
new_TEXT = txt_content
new_id = video_id

# The cuDNN flag and the torchtext TEXT / LABEL fields are created in the
# following cell; the byte-identical copy that used to live here only built
# the same objects a second time and has been removed.

In [6]:
# Ask cuDNN to use deterministic algorithms.
# NOTE(review): no random seed is set in this cell, so runs may still differ -
# confirm whether seeding happens elsewhere.
torch.backends.cudnn.deterministic = True

# Field definitions shared by every dataset split: labels are stored as
# floats, captions are tokenised with spaCy.
LABEL = data.LabelField(dtype=torch.float)
TEXT = data.Field(tokenize='spacy')

In [7]:
# Draw the train / validation / test index split over all caption rows.
# NOTE(review): the split is sized from len(new_TEXT) but later applied to the
# merged frame; this assumes the merge keeps exactly one row per caption - confirm.
_split_indices = data_in.split_train_test(len(new_TEXT), TRAIN_VALID_TEST_R)
train_indices, valid_indices, test_indices = _split_indices

# Pair every video id with its caption text.
new_idtxt = pd.DataFrame({"video_id": new_id, "text": new_TEXT})

The size of train, valid and test data are 1642 1642 822


In [8]:
# Category id treated as the positive class (label 1); every other category
# becomes the negative class (label 0). Change this single value to study a
# different category.
TARGET_CATEGORY_ID = 25

# Join captions with their category on video_id, keeping the text column first
# and the label second (the CSV export writes no header, so order matters).
# The label is derived with one comparison instead of two in-place
# overwrites, which would mislabel rows if the target id were 0 or 1.
new_pd = (
    pd.merge(new_arr, new_idtxt, on="video_id")[["text", "category_id"]]
    .assign(category_id=lambda merged: (merged["category_id"] == TARGET_CATEGORY_ID).astype("int64"))
)

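The baseline precision (the share of positive examples, i.e. the precision obtained by always predicting the positive class) is: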

In [9]:
# Share of rows labelled 1, i.e. the rate of the positive class.
n_positive = int((new_pd["category_id"] == 1).sum())
n_positive / new_pd.shape[0]

0.07355090112031173

In [10]:
# Write each split to its own CSV file next to the captions data.
# No header row and no index column are written: every line is "text,label".
for split_rows, file_suffix in (
    (train_indices, "\\train.csv"),
    (valid_indices, "\\valid.csv"),
    (test_indices, "\\test.csv"),
):
    new_pd.iloc[split_rows].to_csv(path + file_suffix, header=None, index=None)

In [11]:
# Column order must match the CSV files: caption text first, label second.
fields = [("text", TEXT), ("label", LABEL)]

# The split files are written without a header row (header=None in the export
# cell), so no line may be skipped here. With skip_header=True the first
# example of every split was silently discarded.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=path,
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=False,
)

In [12]:
# Runs on the CPU. To use a GPU when one is available, replace this with
# torch.device('cuda' if torch.cuda.is_available() else 'cpu').
device = torch.device('cpu')

# Vocabularies are built from the training split only; the text vocabulary is
# capped at MAX_VOCAB_SIZE entries.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# One batch iterator per split, all using the same batch size and device.
_iterators = data_in.build_iterator(BATCH_SIZE, device, train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = _iterators

### Testing Neural Network Models (GRU) for Captions only

In [13]:
# Values derived from the vocabulary built above.
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Hyper-parameters shared by the candidate models.
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Hyper-parameters used only by the CNN.
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]

# Argument tuples reused by constructors that take the same parameters.
_BASIC_ARGS = (INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
_STACKED_RNN_ARGS = (INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                     N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

# All candidate models are instantiated, in this fixed order.
model_wordem = nnm.WordEmbAvg_2linear(*_BASIC_ARGS)
model_rnn = nnm.SimpleRNN(*_BASIC_ARGS)
model_BLSTM = nnm.LSTM(*_STACKED_RNN_ARGS)
model_GRU = nnm.GRU(*_STACKED_RNN_ARGS)
model_CNN = nnm.CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES,
                    OUTPUT_DIM, DROPOUT, PAD_IDX)

# Only the averaged-embedding model and the GRU are compared below. To include
# the others, add entries such as "SimpleRNN": model_rnn, "BLSTM": model_BLSTM
# and "CNN": model_CNN.
MODEL_DICT = dict(avg_embedding=model_wordem, GRU=model_GRU)

In [14]:
# Result for label 24.
# NOTE(review): the relabelling step earlier in this notebook uses category 25
# as written, so the output recorded under this cell was presumably produced
# with that id changed to 24 - confirm before comparing the two runs.
_comparison = train_eval.compare_models(
    MODEL_DICT,
    device,
    train_iterator,
    valid_iterator,
    test_iterator,
)
best_models, models_perf = _comparison

currently training the model:  avg_embedding
Epoch 0: Dev Accuracy: 0.7538060901256708 Dev Loss:0.49483970953867984
Epoch 1: Dev Accuracy: 0.7817841882889087 Dev Loss:0.5008824453330957
Epoch 2: Dev Accuracy: 0.7788461538461539 Dev Loss:0.5635954078573447
Epoch 3: Dev Accuracy: 0.7821180556829159 Dev Loss:0.8920228398190095
Epoch 4: Dev Accuracy: 0.7749065172213775 Dev Loss:0.9194676373153925
Test Loss: 0.866 | Test Acc: 77.46%
Test Prec: nan% | Test Rec: 40.055%
currently training the model:  GRU
Epoch 0: Dev Accuracy: 0.7183493593564401 Dev Loss:0.5994919120119169
Epoch 1: Dev Accuracy: 0.7315705132025939 Dev Loss:0.5822257044223639
Epoch 2: Dev Accuracy: 0.7045272439718246 Dev Loss:0.5928595094726636
Epoch 3: Dev Accuracy: 0.6955795941444544 Dev Loss:0.6032525925682142
Epoch 4: Dev Accuracy: 0.6898370729042933 Dev Loss:0.6088535711169243
Test Loss: 0.608 | Test Acc: 71.25%
Test Prec: nan% | Test Rec: 5.980%


In [14]:
# Result for label 25 (the category id used by the relabelling step earlier in
# this notebook). Every model in MODEL_DICT is passed the same three iterators.
_comparison = train_eval.compare_models(
    MODEL_DICT,
    device,
    train_iterator,
    valid_iterator,
    test_iterator,
)
best_models, models_perf = _comparison

currently training the model:  avg_embedding
Epoch 0: Dev Accuracy: 0.9250801285872092 Dev Loss:0.19852247060491487
Epoch 1: Dev Accuracy: 0.9449786326059928 Dev Loss:0.16923019692946512
Epoch 2: Dev Accuracy: 0.9473824787598389 Dev Loss:0.19230335514293984
Epoch 3: Dev Accuracy: 0.9491853633752236 Dev Loss:0.21848319830986349
Epoch 4: Dev Accuracy: 0.942841880596601 Dev Loss:0.2495448182682874
Test Loss: 0.160 | Test Acc: 95.55%
Test Prec: nan% | Test Rec: nan%
currently training the model:  GRU
Epoch 0: Dev Accuracy: 0.9172676285872092 Dev Loss:0.2661041310773446
Epoch 1: Dev Accuracy: 0.9073183765778174 Dev Loss:0.2658712441244951
Epoch 2: Dev Accuracy: 0.9133279919624329 Dev Loss:0.27174548127760106
Epoch 3: Dev Accuracy: 0.8667868593564401 Dev Loss:0.34031229487691933
Epoch 4: Dev Accuracy: 0.9112580132025939 Dev Loss:0.3148644603162001
Test Loss: 0.250 | Test Acc: 92.36%
Test Prec: nan% | Test Rec: nan%


### Testing TFIDF & Machine Learning Models with Captions only

In [10]:
import tokenization_dim_reduction as tdr

# Turn every caption into a row of the (presumably TF-IDF) matrix returned by
# the project helper, then densify it so columns can be ranked and sliced.
new_arr = np.array(new_pd["text"])
txt_tfidf = tdr.tfidf_tokenization(new_arr)
new_TEXT = txt_tfidf.toarray()
new_label = np.array(new_pd["category_id"])

# Keep the 5000 columns with the highest mean weight. A single vectorised
# column mean replaces np.apply_along_axis(np.mean, 0, ...), which invoked
# np.mean from Python once per column.
top5k_indices = np.argsort(new_TEXT.mean(axis=0))[-5000:]
new_TEXT = new_TEXT[:, top5k_indices]

In [11]:
# Re-draw the split over the feature matrix rows, using the same ratios as the
# neural-network experiments (shared constant instead of a repeated literal).
train_indices, valid_indices, test_indices = data_in.split_train_test(new_TEXT.shape[0], TRAIN_VALID_TEST_R)

# Reduce the features twice with the training rows as the first argument:
# once paired with the validation rows and once paired with the test rows.
# NOTE(review): X_train is overwritten by the second call; if the reduction is
# not deterministic, X_valid and X_test may live in different projections - confirm.
X_train, X_valid = tdr.dimensional_reduction(new_TEXT[train_indices], 500, True, new_TEXT[valid_indices])
X_train, X_test = tdr.dimensional_reduction(new_TEXT[train_indices], 500, True, new_TEXT[test_indices])

The size of train, valid and test data are 1642 1642 822


In [12]:
# Labels for each split, reshaped into column vectors of shape (n_rows, 1).
y_train = new_label[train_indices].reshape(-1, 1)
y_valid = new_label[valid_indices].reshape(-1, 1)
y_test = new_label[test_indices].reshape(-1, 1)

In [13]:
# Evaluate the classifiers configured in the mlmodels module on the reduced
# features; the value returned by ml_evaluate is displayed as the cell output.
mlm.ml_evaluate(
    mlm.CLFS,
    mlm.PARAMETER_DICT,
    X_train,
    y_train,
    X_valid,
    y_valid,
    X_test,
    y_test,
)

operation of random_forest method begins
the best accuracy for method:  random_forest  is  0.9440389294403893
the corresponding precision is :  0.0
the corresponding recall is :  0.0
operation of logistics method begins
the best accuracy for method:  logistics  is  0.9464720194647201
the corresponding precision is :  0
the corresponding recall is :  0.0


{'random_forest': ("random_forest with parameters : {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1, 'n_jobs': 2, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",
  0.9440389294403893,
  0,
  0.0),
 'logistics': ("logistics with parameters : {'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",
  0.9464720194647201,
  0,
  0.0)}