In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): with no category given this also hides deprecation notices from
# the libraries used below -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [3]:
import re
import pandas as pd
import numpy as np
import string
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import nltk
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [4]:
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
from nltk.corpus import stopwords

# English stop words held in a set; CustomTokenizer.__call__ checks lemmas
# against it to discard them. Needs the NLTK 'stopwords' corpus to be installed.
stop_words = set(stopwords.words('english'))
# Disabled experiment: also treating line-break sequences as stop words.
# stop_words.update(('\r', '\n', '\r\n', '\n\r'))

In [6]:
import os
from pathlib import Path

# Directory holding the review CSVs. It defaults to the original lab location;
# set the REVIEW_DATA_DIR environment variable to run the notebook on another
# machine without editing any path below.
DATA_DIR = Path(os.environ.get("REVIEW_DATA_DIR", "/home/lab08"))

# Raw review tables from the two sources.
imdb = pd.read_csv(DATA_DIR / "imdb_review.csv")
mydrama = pd.read_csv(DATA_DIR / "my_dramalist_crawling.csv")

In [7]:
# Remove the 'Unnamed: 0.1' column (presumably a leftover index column from a
# CSV round-trip -- confirm). errors='ignore' keeps this cell idempotent:
# because the result is assigned back to `mydrama`, re-running the cell would
# otherwise raise KeyError once the column is already gone.
mydrama = mydrama.drop(columns=['Unnamed: 0.1'], errors='ignore')
mydrama

Unnamed: 0.1,Unnamed: 0,genre,review
0,0,comedy,This drama was such a delight and surprise to ...
1,1,comedy,So this beautiful Romcom has finally come to a...
2,2,comedy,"""Encounter"" is % romance, with no filler, so i..."
3,3,comedy,This drama is a treat for romance drama lovers...
4,4,comedy,This review may contain spoilers\nIt will touc...
...,...,...,...
8786,8786,action,"Season II, YESSSSS!\nI love Lee Seung Gi and S..."
8787,8787,action,This review may contain spoilers\nOne of the b...
8788,8788,action,This review may contain spoilers\nTHIS REVIEW ...
8789,8789,action,Its my first review. There are many good kdram...


In [25]:
# Stack the two review sources into one frame. ignore_index=True assigns a
# fresh 0..n-1 index; both inputs are read with the default RangeIndex, so
# without it their row labels overlap and a label lookup such as df.loc[0]
# would return one row from each file.
df = pd.concat([imdb, mydrama], ignore_index=True)
# List the distinct genres present in the combined data.
df['genre'].unique()

array(['drama', 'crime', 'comedy', 'thriller', 'romance', 'action',
       'fantasy'], dtype=object)

In [10]:
# Boolean row masks, one per genre of interest.
drama_review = df['genre'].eq('drama')
thriller_review = df['genre'].eq('thriller')
comedy_review = df['genre'].eq('comedy')

# Review text of the rows selected by each mask.
df_drama_review = df.loc[drama_review, 'review']
df_thriller_review = df.loc[thriller_review, 'review']
df_comedy_review = df.loc[comedy_review, 'review']

In [11]:
# Plain Python list of the comedy-genre review texts. This is the corpus handed
# to both the bag-of-words vectorizer and the sentence-embedding model below.
sentence = df_comedy_review.to_list()

In [12]:
# All comedy reviews joined into one comma-separated string.
# NOTE(review): no later cell visible in this notebook reads `sent` -- confirm
# it is still needed before keeping this potentially large string around.
sent = ",".join(sentence)

In [13]:
def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the single-letter code used by WordNet.

    The decision is made on the tag's leading letter: ``V`` -> ``'v'`` (verb),
    ``N`` -> ``'n'`` (noun), ``J`` -> ``'a'`` (adjective), ``R`` -> ``'r'``
    (adverb). Any other tag, including the empty string, yields ``None``.
    """
    prefix_to_wordnet = (('V', 'v'), ('N', 'n'), ('J', 'a'), ('R', 'r'))
    for prefix, wordnet_code in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_code
    return None

In [14]:
class CustomTokenizer:
    """Callable tokenizer for ``CountVectorizer`` that returns lemmatized content words.

    Per document the steps are: split the text into words with ``tagger``,
    strip every character that is not an ASCII letter, digit or apostrophe,
    POS-tag the surviving words, keep only verbs, nouns, adjectives and
    adverbs, lemmatize them with WordNet, and finally drop lemmas of two
    characters or fewer and lemmas found in the module-level ``stop_words``.

    Parameters
    ----------
    tagger : callable
        Function mapping a string to an iterable of word strings. Despite its
        name it is used as a word splitter, not as a POS tagger.
    """

    # Raw string: the earlier non-raw literals contained invalid escape
    # sequences, which newer Python versions warn about. Compiled once rather
    # than per token.
    _NON_WORD_CHARS = re.compile(r"[^a-zA-Z0-9']")

    def __init__(self, tagger):
        self.tagger = tagger
        # One lemmatizer per tokenizer instead of a new one for every document.
        self._lemmatizer = WordNetLemmatizer()

    def __call__(self, a):
        """Tokenize one document.

        Parameters
        ----------
        a : str or iterable of str
            The document text; an iterable of fragments is concatenated first.

        Returns
        -------
        list of str
            Lemmas longer than two characters that are not stop words.
        """
        a = ''.join(a)

        words = []
        for raw_word in self.tagger(a):
            # After this substitution only letters, digits and apostrophes
            # remain, so a second punctuation pass or a strip() would have
            # nothing left to remove.
            cleaned = self._NON_WORD_CHARS.sub('', raw_word)
            if cleaned:
                words.append(cleaned)

        lemmas = []
        for token, pos_tag in nltk.pos_tag(words):
            # get_wordnet_pos returns None for every tag outside V/N/J/R, so
            # this single check is also the part-of-speech filter.
            wordnet_pos = get_wordnet_pos(pos_tag)
            if wordnet_pos is not None:
                lemmas.append(self._lemmatizer.lemmatize(token, pos=wordnet_pos))

        # Stop words are removed after lemmatization, i.e. on the lemma form.
        return [lemma for lemma in lemmas if len(lemma) > 2 and lemma not in stop_words]

In [15]:
# Keras' text_to_word_sequence is the word splitter handed to CustomTokenizer
# as its `tagger` callable.
custom_tokenizer = CustomTokenizer(text_to_word_sequence)

In [16]:
# token_pattern=None states explicitly that tokenisation is delegated entirely
# to custom_tokenizer; leaving the default pattern in place makes scikit-learn
# warn that it will not be used. max_features caps the vocabulary at the 3000
# most frequent terms.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, token_pattern=None, max_features=3000)

In [17]:
# Learn the vocabulary from the comedy reviews and build their document-term
# count matrix in a single pass.
train_bow_embeddings = vectorizer.fit_transform(sentence)

In [18]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on installations that predate the new one. list() keeps `vocab` a plain list
# as before (the new method returns a NumPy array).
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Column index in the bag-of-words matrix -> vocabulary term.
id2token = dict(enumerate(vocab))

In [19]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [20]:
# Encode every review with a pretrained sentence-transformers checkpoint.
# NOTE(review): the contextual_size given to CombinedTM below must agree with
# this model's embedding width -- confirm it is 768 for this checkpoint.
train_contextualized_embeddings = bert_embeddings_from_list(sentence, "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens")

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [21]:
qt = TopicModelDataPreparation()

# Bundle the contextual embeddings, the bag-of-words matrix and the id -> token
# map into the dataset object later passed to CombinedTM.fit.
# NOTE(review): both inputs are derived from `sentence`, so their rows are
# presumably aligned one-to-one -- confirm.
training_dataset = qt.load(train_contextualized_embeddings, train_bow_embeddings, id2token)

In [22]:
import random

import torch

# Seed the random number generators that training may draw from, so that a
# Restart & Run All yields repeatable topics instead of a different model on
# every execution.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# bow_size is the vocabulary size; contextual_size has to equal the width of
# the sentence embeddings (768 assumed for the checkpoint used -- TODO confirm).
# Five topics, trained for 20 epochs.
ctm = CombinedTM(bow_size=len(vocab), contextual_size=768, n_components=5, num_epochs=20)
ctm.fit(training_dataset)

Epoch: [20/20]	 Seen Samples: [76440/76440]	Train Loss: 513.3550911049516	Time: 0:00:00.678479: : 20it [00:14,  1.42it/s]


In [23]:
# Show the top 5 words of every learned topic.
# NOTE(review): in the recorded output topics 0, 2 and 4 list the same five
# words, which suggests the model has not separated them -- consider more
# epochs or revisiting the preprocessing.
ctm.get_topics(5)

defaultdict(list,
            {0: ['aggressive',
              'diagnosis',
              'heartbreak',
              'critique',
              'nuance'],
             1: ['make', 'character', 'show', 'well', 'love'],
             2: ['aggressive',
              'diagnosis',
              'heartbreak',
              'critique',
              'nuance'],
             3: ['read', 'review', 'watch', 'chemistry', 'storyline'],
             4: ['aggressive',
              'diagnosis',
              'heartbreak',
              'nuance',
              'critique']})

In [24]:
import pyLDAvis as vis

# Ask the fitted model for the keyword arguments that pyLDAvis.prepare takes;
# n_samples=10 sets the number of sampling rounds (the progress bar counts to 10).
lda_vis_data = ctm.get_ldavis_data_format(vocab, training_dataset, n_samples=10)

# Build the visualisation data and render the interactive view inline.
ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

Sampling: [10/10]: : 10it [00:07,  1.29it/s]
