In [1]:
import string
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *
import pandas as pd
import numpy as np
import time
import math
import json
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import KeyedVectors

from nltk.metrics.scores import accuracy
from sklearn import metrics
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_dataset(type):
  """Download one of the project's CSV datasets from Google Drive.

  Args:
    type: Name of the dataset to load, either 'movies' or 'tvshows'.
      (The name shadows the builtin ``type``; it is kept so existing
      keyword callers keep working.)

  Returns:
    A pandas DataFrame parsed from the downloaded CSV.

  Raises:
    ValueError: If ``type`` is not one of the known dataset names.
  """
  file_ids = {
    'movies': '1P1q00qlD26wggaJ8oEOFwmeY0q5aulKE',
    'tvshows': '1NZ8hDyYj6Z_fTa9R0pZebgYa_V0kRbnE',
  }
  # Fail fast on an unknown name instead of requesting a URL with an empty id.
  if not isinstance(type, str) or type not in file_ids:
    raise ValueError(
      f"Unknown dataset type {type!r}; expected one of {sorted(file_ids)}"
    )
  dataset_link = f"https://drive.google.com/uc?id={file_ids[type]}"
  return pd.read_csv(dataset_link)

In [3]:
# Load the TV-show dataset (read_dataset downloads the CSV from Google Drive).
tvshows_df = read_dataset('tvshows')

In [4]:
# (rows, columns) of the raw frame, before any cleaning.
tvshows_df.shape

(28488, 4)

In [5]:
# Preview the raw frame; the rendered output elides the middle rows.
tvshows_df

Unnamed: 0.1,Unnamed: 0,name,tags,poster
0,1,Clerks,The continuing adventures of store clerks Dant...,https://www.themoviedb.org/t/p/w600_and_h900_b...
1,3,Mister Rogers' Neighborhood,Mister Rogers' Neighborhood is an American chi...,https://www.themoviedb.org/t/p/w600_and_h900_b...
2,5,W*A*L*T*E*R,W*A*L*T*E*R is a pilot for a spin-off of M*A*S...,https://www.themoviedb.org/t/p/w600_and_h900_b...
3,6,Star Wars: Droids,An animated television series that features th...,https://www.themoviedb.org/t/p/w600_and_h900_b...
4,8,"Mary Hartman, Mary Hartman","In the fictional town of Fernwood, Ohio, subur...",https://www.themoviedb.org/t/p/w600_and_h900_b...
...,...,...,...,...
28483,152932,Real Life: Types of Among Us Players,Real Life: Types of Among Us Players Is an Eng...,https://www.themoviedb.org/t/p/w600_and_h900_b...
28484,152936,Not Just Flatmates,,https://www.themoviedb.org/t/p/w600_and_h900_b...
28485,152940,DisGraced,Five years after her big break turned out to b...,https://www.themoviedb.org/t/p/w600_and_h900_b...
28486,152944,Vermem Seni Ellere,"The story of Zelish, the daughter of mountains...",https://www.themoviedb.org/t/p/w600_and_h900_b...


# **Data Preprocessing**

* **get_tokens** function takes a text input, converts it to **lowercase**, **removes punctuation**, **tokenizes** the text into words, **filters** out common English **stopwords**, and finally **applies stemming** using the Porter Stemmer algorithm to reduce words to their base form. The resulting stemmed words are then joined into a single string and returned as the output.

In [13]:
# Drop shows whose description ('tags') is missing.
# NaN never compares equal to anything -- including itself -- so a mask built
# with `== float('nan')` matches no rows. dropna() is the reliable way to
# filter out missing values.
tvshows_df = tvshows_df.dropna(subset=['tags'])

In [14]:
# (rows, columns) after dropping rows with missing 'tags'.
tvshows_df.shape

(20856, 4)

In [15]:
# Persist the cleaned frame; index=False keeps the row index out of the CSV.
tvshows_df.to_csv('tvshows_cleaned.csv', index=False)

In [6]:
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# Pretrained uncased BERT-base tokenizer and encoder. Both are module-level
# globals that text_to_bert_embedding reads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def text_to_bert_embedding(doc_text):
    """Embed one document as the mean of BERT's last-layer token vectors.

    Args:
        doc_text: Text to embed. It is tokenized with special tokens added
            and with ``truncation=True``, so over-long input is cut rather
            than rejected.

    Returns:
        A 1-D numpy array: the model's last hidden state averaged over the
        token axis (special tokens included), then flattened.
    """
    input_ids = tokenizer.encode(
        doc_text,
        add_special_tokens=True,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    # Inference only: disable autograd so no computation graph is built for
    # each document. The resulting tensors carry no grad, so they can be
    # converted to numpy directly.
    with torch.no_grad():
        hidden_states = bert_model(input_ids).last_hidden_state
        pooled = torch.mean(hidden_states, dim=1)
    return pooled.numpy().flatten()

In [7]:
def feature_engineering(docs_text):
    """Embed a collection of documents with BERT, one row per document.

    Args:
        docs_text: Sized sequence of document strings (``len()`` is used for
            the progress-bar total).

    Returns:
        A pandas DataFrame whose i-th row is
        ``text_to_bert_embedding(docs_text[i])``, with a fresh 0..n-1 index.
    """
    # Iterate the documents directly; the positional index is not needed.
    embeddings = [
        text_to_bert_embedding(doc_text)
        for doc_text in tqdm(docs_text, total=len(docs_text))
    ]
    return pd.DataFrame(embeddings)

In [16]:
# Embed every remaining show description, one BERT forward pass per row.
# This is slow: the recorded progress log shows ~1h42m elapsed at 76%.
tvshows_bert_embeddings = feature_engineering(tvshows_df['tags'].to_list())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 76%|███████▌  | 15858/20856 [1:42:41<40:45,  2.04it/s][A
 76%|███████▌  | 15859/20856 [1:42:41<36:47,  2.26it/s][A
 76%|███████▌  | 15860/20856 [1:42:41<32:29,  2.56it/s][A
 76%|███████▌  | 15861/20856 [1:42:41<27:52,  2.99it/s][A
 76%|███████▌  | 15862/20856 [1:42:42<26:33,  3.13it/s][A
 76%|███████▌  | 15863/20856 [1:42:42<30:53,  2.69it/s][A
 76%|███████▌  | 15864/20856 [1:42:42<27:11,  3.06it/s][A
 76%|███████▌  | 15865/20856 [1:42:43<23:34,  3.53it/s][A
 76%|███████▌  | 15866/20856 [1:42:43<24:06,  3.45it/s][A
 76%|███████▌  | 15867/20856 [1:42:43<23:57,  3.47it/s][A
 76%|███████▌  | 15868/20856 [1:42:43<21:34,  3.85it/s][A
 76%|███████▌  | 15869/20856 [1:42:44<19:31,  4.26it/s][A
 76%|███████▌  | 15870/20856 [1:42:44<18:49,  4.42it/s][A
 76%|███████▌  | 15871/20856 [1:42:44<18:03,  4.60it/s][A
 76%|███████▌  | 15872/20856 [1:42:44<21:42,  3.83it/s][A
 76%|███████▌  | 15873/20856 [1:42:45<29:27,  2.82

In [17]:
# Preview the embeddings: one row per show, columns 0..767 in the rendered output.
tvshows_bert_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.087331,0.419734,0.056382,0.070040,0.648161,0.128289,0.121720,0.267226,0.070031,-0.220482,...,-0.383057,0.062693,-0.264854,-0.092413,0.009798,-0.226764,-0.016018,0.099179,0.355595,-0.252705
1,-0.328636,-0.076656,0.290349,0.020783,0.036315,-0.063607,0.144467,0.501027,-0.052084,-0.126428,...,-0.447327,-0.082828,-0.071865,-0.231180,-0.153278,-0.526609,0.151461,0.080031,0.407731,0.439653
2,-0.209836,-0.157292,0.269528,-0.025812,0.308780,-0.138422,0.372737,0.162006,0.018057,-0.140695,...,-0.309675,0.051114,-0.081580,-0.313702,-0.155146,-0.296207,-0.080201,0.154436,0.267924,-0.034441
3,-0.094208,-0.121506,0.169558,-0.037827,0.244384,-0.241304,-0.033331,0.073675,-0.155332,-0.247409,...,-0.177010,0.212754,0.130639,-0.154967,0.034417,-0.395917,0.196183,0.171646,0.364287,-0.025220
4,-0.023305,0.153108,0.231063,-0.030689,0.478269,0.181243,0.165903,0.362150,0.023144,0.000042,...,-0.359194,0.077012,-0.176956,-0.098667,-0.107270,-0.539396,-0.007140,-0.116446,0.311512,-0.126025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20851,-0.160246,0.137208,-0.016290,-0.152186,0.275413,0.312372,0.152122,0.360403,-0.144894,-0.244306,...,0.258676,-0.220688,0.179841,-0.062466,-0.071341,-0.500509,-0.015971,-0.346283,0.153187,0.104327
20852,0.042637,-0.140579,0.282006,-0.223974,0.172639,-0.187026,0.102889,0.304147,-0.052746,0.289289,...,0.004898,0.089264,-0.169524,-0.262434,-0.252975,-0.100464,-0.123116,-0.066653,0.297749,0.130462
20853,0.034371,-0.242357,0.273353,-0.062046,0.303164,0.146001,0.379784,0.318054,-0.038579,-0.042274,...,-0.125509,-0.028452,-0.010400,-0.135066,-0.260275,-0.319233,-0.191563,-0.038501,0.119025,0.095241
20854,-0.118993,0.108762,0.123584,-0.049025,0.450167,0.116127,0.008436,0.310773,-0.129460,-0.404996,...,0.295492,-0.400414,-0.065531,-0.311635,-0.343667,-0.180505,-0.144323,-0.553164,0.284836,-0.050805


In [18]:
# Save the embeddings as CSV without the row index.
# NOTE(review): the "45000" in the file name does not match the 20856 rows
# embedded in this run -- confirm the intended name before relying on it.
tvshows_bert_embeddings.to_csv('tvshows_bert_embeddings_45000.csv', index=False)


In [19]:
# Save the same embeddings to HDF5 under key 'df'; mode='w' opens the file
# for writing rather than appending.
tvshows_bert_embeddings.to_hdf('tvshows_bert_embeddings_dataframe.h5', key='df', mode='w')


In [15]:
# # Convert column names to strings
# movie_bert_embeddings.columns = movie_bert_embeddings.columns.astype(str)

# # Save to Parquet
# movie_bert_embeddings.to_parquet('movie_bert_embeddings_dataframe.parquet')