In [1]:
import pandas as pd
import numpy as np
import re

#### import the packages
- we need spacy for nlp
- we use a matcher to match different spellings of names
- we borrow spacy's pre-trained model: en_core_web_lg
- we'll add an entity ruler to the pipeline because we need to label all the Korean names as PERSON (the model sometimes fails to predict them as such)
- we import neuralcoref for pronoun replacement (coreference resolution)

In [2]:
import spacy
from spacy.matcher import Matcher
# load the pre-trained English pipeline once; later cells add/remove components on this object
nlp = spacy.load('en_core_web_lg')
from spacy.pipeline import EntityRuler
# entity ruler bound to the same vocab; PERSON patterns are added to it in the coref cell
ruler = EntityRuler(nlp)

# coreference-resolution extension, added to the pipeline in the coref cell
import neuralcoref

In [3]:
# the first CSV column is used as the row index, leaving 'title' and 'review' as data columns
dramas = pd.read_csv('reviews.csv', index_col = 0)
dramas

Unnamed: 0,title,review
0,Goblin,"If you are hesitating to watch this at all, pl..."
1,Goblin,I feel compelled to write a review for Goblin ...
2,Goblin,It must be said that Goblin is very much a som...
3,Goblin,Every moment of it indeed shined! Goblin is a ...
4,Goblin,"Somewhere inside Goblin, buried under a mound ..."
...,...,...
21228,Queen: Love And War,I think I might have over-hyped this drama for...
21229,Queen: Love And War,"I'll just gonna say it was very wonderful, it'..."
21230,Queen: Love And War,This is one of my binge watched dramas after b...
21231,Queen: Love And War,"The best word to sum up this kdrama is ""strong..."


#### define functions to remove/convert emojis
- we use the package `emot`, which contains dictionaries mapping emoticons / emoji unicode characters to their corresponding descriptions
- the package has typos ('andry'-->'angry'). So we correct them after we import them

In [4]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS_EMO
# the package ships the misspelling "andry" in some emoticon descriptions; patch it to "angry"
for emo, des in EMOTICONS_EMO.items():
    if ' andry ' not in des:
        continue
    EMOTICONS_EMO[emo] = des.replace('andry', 'angry')
        
def remove_emo(text):
    """Return `text` with every known emoji / emoticon replaced by a single space.

    Unicode emojis (keys of UNICODE_EMO) are handled first, then text
    emoticons (keys of EMOTICONS_EMO).
    """
    for table in (UNICODE_EMO, EMOTICONS_EMO):
        for symbol in table:
            text = text.replace(symbol, ' ')
    return text

def convert_emo(text):
    """Return `text` with every known emoji / emoticon replaced by its description.

    Each description is turned into a single underscore-joined word and
    padded with spaces so that it stays a separate token.
    """
    # unicode emojis: commas and colons are stripped from the description
    for symbol, meaning in UNICODE_EMO.items():
        words = meaning.replace(",", "").replace(":", "").split()
        text = text.replace(symbol, ' ' + "_".join(words) + ' ')
    # text emoticons: only commas are stripped from the description
    for symbol, meaning in EMOTICONS_EMO.items():
        words = meaning.replace(",", "").split()
        text = text.replace(symbol, ' ' + "_".join(words) + ' ')
    return text

#### define functions to clean punctuations / remove non-ASCII characters
- we replace repeated punctuation (eg. two or more '?' become a single '?') because we want to split by punctuation (eg. a single '?') later. If we don't, we'll end up with many strings of '?' whenever a reviewer writes '??????' in the review
- we don't want to split by comma because we want to keep the full sentence. So we'll replace '...' with ',' and decimals (eg: 7.8) to 7,8


- we also define a function to remove non-ASCII characters

In [5]:
def clean_punct(text):
    """Normalise punctuation so the review can later be split into sentences.

    - runs of '?' / '!' collapse to a single mark followed by a space
    - runs of two or more '.' (ellipsis) become ', '
    - a dash preceded by whitespace is replaced by a space
    - '&' becomes ' and '
    - a decimal point between two digits becomes ',' (7.8 -> 7,8)
    - any run of whitespace collapses to one space

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = re.sub(r'\?{2,}', "? ", text) # replace two or more '?' with '? '
    text = re.sub(r'!{2,}', "! ", text)  # replace two or more '!' with '! '
    text = re.sub(r'\.{2,}', ", ", text) # replace '...' with ', '

    text = re.sub(r'\s+-\s*', ' ', text) # replace ' - ' with ' '
    text = re.sub(r'&', ' and ', text)   # replace & with ' and '

    # Only a '.' sitting between two digits is a decimal point. A number that
    # merely ends a sentence ("I give it 10. Loved it") must keep its full stop,
    # otherwise the two sentences would be merged when splitting later.
    text = re.sub(r'(?<=\d)\.(?=\d)', ',', text)  # replace 7.8 with 7,8
    text = re.sub(r'\s+', ' ', text)
    return text

def clean_ascii(text):
    """Return `text` with every non-ASCII character replaced by a space."""
    return ''.join(ch if ord(ch) <= 127 else ' ' for ch in text)

#### language detections:
- remove emojis first because they'll affect language detections
- if the review is in English, we'll then convert the emojis/emoticons

In [6]:
from langdetect import DetectorFactory, detect_langs
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised internally; fixing the seed keeps the
# English / non-English split identical across "Restart & Run All"
DetectorFactory.seed = 0

langlist = []      # row positions of reviews that are dropped (non-English or undetectable)
review_clean = []  # cleaned text of the reviews that are kept, in row order
for d in range(len(dramas)):
    raw_review = dramas.iloc[d,1]
    # for language detection, remove emojis instead of converting them first,
    # because the English descriptions may affect the detection results
    no_emo = clean_punct(remove_emo(raw_review))
    try:
        langL = detect_langs(no_emo)
    except LangDetectException:
        # raised when the text has no usable features (e.g. only digits/punctuation);
        # any other exception is a real bug and is allowed to propagate
        langlist.append(d)
        continue

    # candidates are ordered by probability; an empty list means nothing was confident enough
    if not langL or langL[0].lang != 'en':
        langlist.append(d)
    else:
        with_emo = convert_emo(raw_review)
        review = clean_ascii(with_emo)
        review = clean_punct(review)
        review_clean.append(review)
# we don't do clean_ascii first before detecting language because it may affect the language detections

In [7]:
# langlist holds row positions, so look the review text up positionally (column 1 = review)
print("number of non-English reviews:", len(langlist), '\n')
for position in langlist:
    dropped_text = dramas.iloc[position, 1]
    print(dropped_text, "\n")

number of non-English reviews: 126 

5 episodios he durado. Todo en esta serie es tan absurdo que no te motiva a seguir viéndola. Lo mejor la música y los actores, pero no consiguen sostener la lamentable trama. 

Os pontos mais altos de Goblin são: a trilha sonora, q é uma das melhores, se não a melhor trilha sonora em Doramas q já escutei na vida, a arte  visual e a fotografia, que são belíssimas, os efeitos especiais, que são incríveis e não deixam em nada a desejar, a incrível atuação da Sunny, que de longe e a melhor personagem do Dorama, ah, o enredo tbm é bem interessante. Agora os pontos negativos e que me incomodaram bastante, alguns furos de roteiro idiotas, a morte mais nada a ver da protagonista, sério eu fiquei de bobeira como aquilo foi forçado e a atuação medíocre do Dong Wook ou "Grim Reaper", um personagem com bastante potencial, mas com um crescimento quase nulo é de é personalidade rasa, sério, não sei o q as meninas pagam tanto pau pra esse cara! 

Troche sie meczye

- drop those non-English reviews

In [8]:
# langlist holds row *positions* while DataFrame.drop works on index *labels*;
# translate positions to labels so this stays correct even if the index is not 0..n-1
dramas = dramas.drop(index=dramas.index[langlist]).reset_index(drop=True)
# swap the raw review text for the cleaned text (same row order as the kept rows)
dramas = dramas.drop(columns='review')
dramas['review'] = review_clean
dramas
# now reviews should only contain English

Unnamed: 0,title,review
0,Goblin,"If you are hesitating to watch this at all, pl..."
1,Goblin,I feel compelled to write a review for Goblin ...
2,Goblin,It must be said that Goblin is very much a som...
3,Goblin,Every moment of it indeed shined! Goblin is a ...
4,Goblin,"Somewhere inside Goblin, buried under a mound ..."
...,...,...
21102,Queen: Love And War,I think I might have over-hyped this drama for...
21103,Queen: Love And War,"I'll just gonna say it was very wonderful, it'..."
21104,Queen: Love And War,This is one of my binge watched dramas after b...
21105,Queen: Love And War,"The best word to sum up this kdrama is ""strong..."


In [9]:
# read in the role information (one row per actor/character of a drama);
# the first CSV column is used as the row index
roles = pd.read_csv('roles.csv', index_col = 0)
roles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2138 entries, 0 to 2137
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       2138 non-null   object
 1   CharName    2136 non-null   object
 2   Role        2138 non-null   object
 3   MainCouple  2138 non-null   object
 4   ActorName   2138 non-null   object
 5   Gender      2138 non-null   object
 6   Image       2138 non-null   object
dtypes: object(7)
memory usage: 133.6+ KB


In [10]:
# CharName contains NaN; replace every missing value with an empty string
roles = roles.fillna('')

# titles that have reviews but no role information
dramaTitle = set(dramas['title'])
roleTitle = set(roles['Title'])
dramaTitle.difference(roleTitle)

{'The Great Seducer'}

- `The Great Seducer` doesn't have role information, so we drop it as well to be consistent (because we can't change actor/actress name to single word format and we won't be able to match them with any tuple pair in the dictionary later)

In [11]:
# keep only dramas that have cast information in `roles`; this drops 'The Great Seducer'
# (and would drop any other title missing from roles.csv) because names cannot be
# normalised or mapped to a gender without it
dramas = dramas[dramas['title'].isin(set(roles['Title']))].reset_index(drop=True)
dramas

Unnamed: 0,title,review
0,Goblin,"If you are hesitating to watch this at all, pl..."
1,Goblin,I feel compelled to write a review for Goblin ...
2,Goblin,It must be said that Goblin is very much a som...
3,Goblin,Every moment of it indeed shined! Goblin is a ...
4,Goblin,"Somewhere inside Goblin, buried under a mound ..."
...,...,...
21012,Queen: Love And War,I think I might have over-hyped this drama for...
21013,Queen: Love And War,"I'll just gonna say it was very wonderful, it'..."
21014,Queen: Love And War,This is one of my binge watched dramas after b...
21015,Queen: Love And War,"The best word to sum up this kdrama is ""strong..."


#### Create conversion dictionary for neuralcoref:
- first, the 'CharacterName' field is not well-structured. <br>
eg: `"Safari" Moon Deok Bae`: it is one string but two names, `Safari` or `Moon Deok Bae` <br>
eg: `Park Ha / Boo Yong` or `Jun Seol | Ryu Soo Hyun` : some are separated by '/', some are separated by '|' <br>
eg: `"Man Goo" Choi Jin Nyeo {Room 610}` <br>
eg: `Han Mi Mo [former "Angels" member] `<br>
so this function will split the string into individual names
- after having all the possible character names and actor/actress name, if gender is `male`, we will add `['boy','male','man']` to the key and `['girl','female','woman']` otherwise
- this function will return a dictionary for conversion, and also a cast list containing all the names (character names and real names for entity ruler later)

In [12]:
def _split_char_names(char):
    """Split one raw 'CharName' string into the individual names it contains.

    Handles alternatives separated by '|' or '/', nicknames in double quotes,
    group names in curly braces, and drops explanations in square brackets.
    Returns an empty list for a missing/blank value or a 'Himself'/'Herself' role.
    """
    if not isinstance(char, str) or not char.strip():
        return []
    # sometimes the actor is acting as 'Himself/Herself'; that is not a character name
    if 'Himself' in char or 'Herself' in char:
        return []

    if '|' in char:
        parts = char.split("|")
    elif '/' in char:
        parts = char.split("/")
    else:
        parts = [char]

    names = []
    for part in parts:
        # text inside [ ] is usually a (long) explanation, not a name,
        # eg: Han Mi Mo [former "Angels" member]
        part = part.strip().split('[')[0].strip()
        for quoted in part.split('"'):
            for piece in quoted.split('{'):
                piece = piece.replace('}', ' ').strip()
                if piece:  # never emit empty names (they would become empty patterns/keys)
                    names.append(piece)
    return names


def make_casts(roles, title):
    """Build the gender conversion dictionary and the cast list for one drama.

    Side effect: for every row of `roles` belonging to `title`, 'CharName' is
    overwritten with the comma-joined single-word character names and
    'ActorName' with the single-word actor name (eg: 'Gong Yoo' -> 'GongYoo'),
    so the next notebook can simply split by comma.

    Parameters
    ----------
    roles : pandas.DataFrame
        Role table with at least the columns 'Title', 'CharName', 'ActorName'
        and 'Gender'. It is modified in place (see above).
    title : str
        Drama title to process.

    Returns
    -------
    cast_dict : dict
        Maps each single-word name to ['girl','female','woman'] when the row's
        gender is 'Female', otherwise to ['boy','male','man']. Names containing
        'Room' are skipped because they refer to a couple.
    casts : list of str
        Actor and character names in their original spaced spelling.
    """
    cast_dict = {}
    casts = []
    df = roles[roles['Title']==title]
    # iterate over the rows themselves: looking rows up by actor name would process
    # the first row again whenever an actor has several rows in one drama
    for index, row in df.iterrows():
        actor = row['ActorName']
        gender = row['Gender']
        charNew = _split_char_names(row['CharName'])

        casts += [actor] + charNew
        charNew = [''.join(ch.split()) for ch in charNew] # change to single word format (eg: 'Gong Yoo'->'GongYoo')
        actor = ''.join(actor.split()) # change to single word format

        # persist the normalised names on the roles df for the next notebook
        roles.at[index, 'CharName'] = ','.join(charNew)
        roles.at[index, 'ActorName'] = actor

        for key in charNew + [actor]:
            if 'Room' not in key: # eg: Room 610 is a name that refers to the couple,
                                  # so no gender can be assigned to it
                if gender == 'Female':
                    cast_dict[key] = ['girl','female','woman']
                else:
                    cast_dict[key] = ['boy','male','man']

    return cast_dict, casts

#### Replace all different spellings to all same format:
- we do this for all character names and real names <br>
eg: 'Gong Yoo' -> 'GongYoo' (real name: Gong Yoo) <br>
eg: 'go-eun' -> 'KimGoEun' (real name: Kim Go Eun)

In [13]:
def replace_cast(doc):
    """Pipeline component: rewrite every spelling of a cast name to one single-word form.

    Builds a token Matcher from the module-level `casts` list (set by the cell
    that runs the pipeline), with patterns depending on how many words the name
    has (1 to 4; other lengths get no pattern). Each match is replaced by the
    cast name with its whitespace removed (eg: 'Gong Yoo' -> 'GongYoo') and the
    text is re-tokenised.

    Parameters
    ----------
    doc : spacy Doc
        The document to normalise.

    Returns
    -------
    spacy Doc
        The input doc if nothing matched, otherwise a new Doc created with
        `nlp.make_doc` (tokenizer only) from the rewritten text.
    """
    # NOTE(review): the matcher is rebuilt for every document; the
    # add(key, None, pattern) call shape looks like the spaCy 2.x API - confirm
    name_matcher = Matcher(nlp.vocab)
    for cast in casts:
        cast_pattern = []
        name = cast.split()
        if len(name)==1:
            # single-word name: case-insensitive exact token
            char1 = name[0].lower()
            pt1 = [{'LOWER': char1}]
            name_matcher.add(cast, None, pt1)
        elif len(name)==2:
            char1 = name[0].lower()
            char2 = name[1].lower()
            pt1 = [{'LOWER': char1+char2}]                  # both words written as one token
            pt2 = [{'LOWER': char1[0]+char2[0]}]            # initials of the two words
            pt3 = [{'LOWER': char1}, {'TEXT': '-', 'OP': '?'}, {'LOWER': char2}] 
            # pt3: the two words as separate tokens, optionally hyphenated
            name_matcher.add(cast, None, pt1)
            name_matcher.add(cast, None, pt2)
            name_matcher.add(cast, None, pt3)
        elif len(name)==3:
            char1 = name[0].lower()
            char2 = name[1].lower()
            char3 = name[2].lower()
            # pt1: optional first word + last two words fused into one token
            pt1 = [{'LOWER': char1, 'OP': '?'}, {'LOWER': char2+char3}] 
            # pt2: three-letter initials
            pt2 = [{'LOWER': char1[0]+char2[0]+char3[0]}]
            # pt3: optional first word, then the last two words with optional hyphens
            pt3 = [{'LOWER': char1, 'OP': '?'}, {'TEXT': '-', 'OP': '?'}, {'LOWER': char2}, {'TEXT': '-', 'OP': '?'}, {'LOWER': char3}]
            # pt4: all three words fused into one token (this also matches the replacement itself)
            pt4 = [{'LOWER': char1+char2+char3}] 
            name_matcher.add(cast, None, pt1)
            name_matcher.add(cast, None, pt2)
            name_matcher.add(cast, None, pt3)
            name_matcher.add(cast, None, pt4)
        elif len(name)==4:
            char1 = name[0].lower()
            char2 = name[1].lower()
            char3 = name[2].lower()
            char4 = name[3].lower()
            # same idea as the 3-word case, with the first two words optional;
            # note that the initials (pt2) are built from words 2-4 only
            pt1 = [{'LOWER': char1, 'OP': '?'},{'LOWER': char2, 'OP': '?'}, {'LOWER': char3+char4}] 
            pt2 = [{'LOWER': char2[0]+char3[0]+char4[0]}]
            pt3 = [{'LOWER': char1, 'OP': '?'},{'LOWER': char2, 'OP': '?'}, {'TEXT': '-', 'OP': '?'}, {'LOWER': char3}, {'TEXT': '-', 'OP': '?'}, {'LOWER': char4}]
            pt4 = [{'LOWER': char2+char3+char4}] 
            pt5 = [{'LOWER': char1+char2+char3+char4}] 
            name_matcher.add(cast, None, pt1)
            name_matcher.add(cast, None, pt2)
            name_matcher.add(cast, None, pt3)
            name_matcher.add(cast, None, pt4)
            name_matcher.add(cast, None, pt5)
    # Replace one match per iteration, then re-run the matcher on the rewritten
    # text. `count` advances past matches that were already rewritten (the
    # single-word replacement is itself matched again by the fused patterns).
    count = 0
    name_matches = name_matcher(doc)
    while count < len(name_matches):
        match_id, start, end = name_matches[count]
        castName = nlp.vocab.strings[match_id]   # the match key is the original cast name
        rep = ''.join(castName.split())          # single-word form of the name
        newDoc = doc[:start].text+' '+rep+' '+doc[end:].text
        newDoc = re.sub(r'\s+', ' ', newDoc)
        doc = nlp.make_doc(newDoc.strip()) # this will speed up things because make_doc will only run the tokenizer
        name_matches = name_matcher(doc)
        count += 1
    return doc

# we replace the first match, then we match again, this time because we change to eg 'KimGoEun',
# so it'll be matched again, so +1 each time

#### Pronoun replacement:
- we add 3 components to the pipeline: <br>
`replace_cast` (match and change different spellings) <br>
`entity_ruler` (when we change the format of the name, eg: Gong Yoo -> GongYoo, the model may predict it wrongly (we tried and it is predicted as ORG, but this is an actor so it should be PERSON), we need to change everyone in the cast list as PERSON, because neuralcoref is using named entity recognizer (NER) results from spacy model) <br>
`neuralcoref` (for pronoun replacement) <br>


- these three components are changed whenever the drama changes (when we have a new drama, we remove them and add them again). We don't want them to build up, because:
1. there are some special names that only appear in this drama as a person's name but not in others <br>
eg: Sunny in `Goblin`. sunny is not a name when it appears in other drama, we don't want to mismatch/misclassify it in other drama
2. Korean names are similar (eg: Joo Ji Won / Han Ji Won / Kim Ji Won) <br>
Sometimes people will use Ji Won to refer to the person. This is context-specific (i.e it should only refer to the person acting in the drama not someone else). If this builds up, `Ji Won` may be matched to some other person who doesn't act in this drama, which is not correct

In [14]:
reviews = list(dramas['review'])

# row positions of every review, grouped by title in order of first appearance;
# using explicit positions (instead of a running counter) keeps each review paired
# with its own drama even if the rows of a title are not contiguous
positions_by_title = {}
for pos, drama_title in enumerate(dramas['title']):
    positions_by_title.setdefault(drama_title, []).append(pos)
titles = list(positions_by_title)

for title in titles:
    # `casts` must be a module-level name: replace_cast reads it when building its matcher
    cast_dict, casts = make_casts(roles, title)

    # a fresh ruler per drama: re-using one ruler object would accumulate the PERSON
    # patterns of every previous drama, which is exactly the build-up we want to avoid
    ruler = EntityRuler(nlp)
    patterns = [''.join(cast.split()) for cast in casts]
    ruler.add_patterns([{"label": "PERSON", "pattern": pattern} for pattern in patterns if pattern])

    nlp.add_pipe(replace_cast, first=True)
    nlp.add_pipe(ruler, before="ner")
    neuralcoref.add_to_pipe(nlp,conv_dict=cast_dict)
    try:
        for pos in positions_by_title[title]:
            doc = nlp(reviews[pos])
            reviews[pos] = doc._.coref_resolved
    finally:
        # always restore the base pipeline, even if a review fails, so the
        # cell can be re-run without "component already exists" errors
        nlp.remove_pipe('replace_cast')
        nlp.remove_pipe('entity_ruler')
        nlp.remove_pipe('neuralcoref')

In [15]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [16]:
# pair each title with its coreference-resolved review text (same row order as `dramas`)
resolve = pd.DataFrame({'title': dramas.title, 'review': reviews})

- export the new csv for the next notebook

In [17]:
# resolved reviews for the next notebook
resolve.to_csv('review_resolve_emoji.csv')
# roles table as modified in place by make_casts (single-word names)
roles.to_csv('roles_mod.csv')