In [1]:
import os
import numpy as np
import pandas as pd
from relation_extractor import RelationExtractor

In [2]:
movie_data = pd.read_csv("raw_data/movie_data.csv")

In [3]:
"""
I noticed that there were duplicate entries for certain 
movies that were released in separate origins.  This 
removes those duplicates so we can use the rotten_tomatoes_link
as the primary index of the data.
"""
duplicates = movie_data.groupby("rotten_tomatoes_link").count() \
    .loc[np.any(movie_data.groupby("rotten_tomatoes_link") \
    .count() > 1, axis=1)].index

dup_indices = movie_data.loc[(movie_data["rotten_tomatoes_link"].isin(duplicates)) & \
    (movie_data["Origin/Ethnicity"] != "American")].index

movie_data = movie_data.loc[~movie_data.index.isin(dup_indices)].reset_index(drop=True)

In [4]:
def explode_column(input_df, column_name, new_column_name):
    """
    Function that will expand the string columns that have 
    values separated by commas.
    """
    df = input_df.copy(deep=True)
    exploded_df = df[column_name].astype(str) \
        .apply(lambda x: x.split(",")).explode() \
        .to_frame().rename(columns={column_name: new_column_name})
    df = df.merge(exploded_df[[new_column_name]], left_index=True, right_index=True)
    return df

In [5]:
explode_columns = {
    "genres": "genre",
    "directors": "director",
    "authors": "author",
    "actors": "actor"
}

exploded_dfs = {}
for c in explode_columns.keys():
    exploded_dfs[c] = explode_column(movie_data, c, explode_columns[c])
for c in exploded_dfs.keys():
    movie_data = movie_data.merge(
        exploded_dfs[c][[explode_columns[c]]], left_index=True, right_index=True
    )

In [6]:
relation_dict = {
    "rotten_tomatoes_link":"rtLink",
    "Title":"hasTitle",
    "director":"directedBy",
    "author":"authoredBy",
    "actor":"featuredActor",
    "genre":"hadGenre",
    "Release Year":"releasedOn",
    "production_company":"producedBy",
}
relations = []
for r in relation_dict:
    relations.append(relation_dict[r])

In [7]:
tabular_df = movie_data.rename(columns=relation_dict)[relations] \
    .drop_duplicates() \
    .reset_index(drop=True)

In [8]:
tabular_df.to_csv("data/tabular_df.csv")

In [51]:
raw_review_df = pd.read_csv("raw_data/rotten_tomatoes_critic_reviews.csv")
raw_review_df = raw_review_df.loc[raw_review_df["review_content"].astype(str) != 'nan'].reset_index(drop=True)


In [23]:
raw_review_df[raw_review_df["top_critic"] == True].iloc[500]["review_content"]

"The Intruder ... is exhilarating and exhausting, the kind of picture you don't bounce back from immediately."

In [10]:
text = raw_review_df.iloc[0]["review_content"]

In [24]:
doc = nlp("It is not a fantasy adventure that fuses Greek mythology to contemporary American places and values.")
print(text)

A fantasy adventure that fuses Greek mythology to contemporary American places and values. Anyone around 15 (give or take a couple of years) will thrill to the visual spectacle


In [18]:
class RelationExtractor():
    
    
    def __init__(self, entity_id, doc):
        self.entity_id = entity_id
        self.doc = doc 
        self.relations = self.extract()


    def extract(self):
        """
        Function that extracts a set of triples from the 
        Spacy doc object in the form [(triple), (triple), ...]
        """
        triples = []
        triples += self.extract_location_relation(self.entity_id, self.doc)
        return triples


    @staticmethod
    def extract_location_relation(entity_id, doc):
        triples = []
        for ent in doc.ents:
            if ent.label_ == "GPE":
                triple = (entity_id, "hasLocationTheme", ent.text)
                triples.append(triple)
        return triples


In [54]:
from transformers import pipeline

In [55]:
ner = pipeline("ner")

Downloading: 100%|██████████| 998/998 [00:00<00:00, 315kB/s]
Downloading: 100%|██████████| 1.33G/1.33G [01:18<00:00, 17.0MB/s]
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 821kB/s] 
Downloading: 100%|██████████| 60.0/60.0 [00:00<00:00, 20.3kB/s]


In [64]:
text = "The film was set in New York City but not in London and it had Charles de Gaul in it."
t = ner(text)

In [154]:
def process_ner_output(ner_output):
    """
    Function that processes the Hugging Face NER output into
    a list of tuples, [(entity: entity_type), ...]
    """

    def _combine_tokens(token_list):
        """
        Function that combines a list of tokens that may contain
        subwords (and ##) into a single phrase.
        """
        ent = ""
        for t in token_list:
            if "#" in t:
                ent = ent[:len(ent)-1]
                ent += f"{t.replace('#', '')} "
            else:
                ent += f"{t} "
        return ent

    i0 = 0
    entity = []
    entity_type = set()
    entity_list = []
    entity_type_list = []
    for i in ner_output:
        if i["index"] > i0 + 1:
            if len(entity) > 0:
                entity_list.append(entity)
                entity_type_list.append(entity_type)
            entity = []
            entity_type = set()
        i0 = i["index"]
        entity.append(i["word"])
        entity_type.add(i["entity"])
    if len(entity) > 0:
        entity_list.append(entity)
        entity_type_list.append(entity_type)
    results = []
    for i, raw_ent in enumerate(entity_list):
        ent = _combine_tokens(raw_ent)
        raw_ent_type = next(iter(entity_type_list[i]))
        result = (ent, raw_ent_type.replace("I-", ""))
        results.append(result)
    return results


In [150]:
ner(t)

[{'word': 'Um', 'score': 0.9985164999961853, 'entity': 'I-PER', 'index': 1},
 {'word': '##a', 'score': 0.9963938593864441, 'entity': 'I-PER', 'index': 2},
 {'word': 'T', 'score': 0.999076247215271, 'entity': 'I-PER', 'index': 3},
 {'word': '##hur', 'score': 0.9789679646492004, 'entity': 'I-PER', 'index': 4},
 {'word': '##man', 'score': 0.9777899384498596, 'entity': 'I-PER', 'index': 5},
 {'word': 'Me', 'score': 0.9844141006469727, 'entity': 'I-PER', 'index': 7},
 {'word': '##dus', 'score': 0.8264440298080444, 'entity': 'I-PER', 'index': 8},
 {'word': '##a', 'score': 0.9489302635192871, 'entity': 'I-PER', 'index': 9}]

In [160]:
for i in range(1000):
    t = raw_review_df.iloc[i]["review_content"]
    p = process_ner_output(ner(t))
    if len(p) > 0:
        print(p)

[('Greek ', 'MISC'), ('American ', 'MISC')]
[('Uma Thurman ', 'PER'), ('Medusa ', 'PER')]
[('Harry Potter ', 'MISC')]
[('The Lightning Thief ', 'MISC'), ('Potter ', 'PER')]
[('The Lightning Thief ', 'MISC'), ('Hogwarts ', 'MISC')]
[('Harry Potter ', 'MISC'), ('Chris Columbus ', 'PER')]
[('Percy Jackson ', 'MISC'), ('Greek ', 'MISC'), ('Disney Channel ', 'MISC')]
[('Columbus ', 'PER')]
[('Rick Riordan ', 'PER'), ('Percy Jackson and the Olympians ', 'MISC'), ('Harry Potter ', 'MISC')]
[('Chris Columbus ', 'PER'), ('Rick Riordan ', 'PER')]
[('Chris Columbus ', 'PER')]
[('Percy Jackson ', 'PER'), ('Harry Potter ', 'MISC')]
[('British ', 'MISC')]
[('Riordan ', 'MISC')]
[('Greek ', 'MISC')]
[('Percy Jackson ', 'PER'), ('Harry Potter ', 'MISC')]
[('Medusa ', 'MISC')]
[('Rick Riordan ', 'PER'), ('Chris Columbus ', 'PER')]
[('The Lightning Thief ', 'MISC')]
[('Harry Potter ', 'PER')]
[('Columbus ', 'PER')]
[('Harry Potter ', 'MISC'), ('Lord of the Rings ', 'MISC')]
[('Percy Jackson & the Olympi

KeyboardInterrupt: 

In [78]:
i0 = 0
entity = []
entity_list = []
for i in t:
    print(i, i["index"], i0)
    if i["index"] > i0 + 1:
        entity_list.append(entity)
        entity = []
    i0 = i["index"]
    entity.append(i["word"])
entity_list.append(entity)

{'word': 'New', 'score': 0.9996168613433838, 'entity': 'I-LOC', 'index': 6} 6 0
{'word': 'York', 'score': 0.9995555877685547, 'entity': 'I-LOC', 'index': 7} 7 6
{'word': 'City', 'score': 0.9994433522224426, 'entity': 'I-LOC', 'index': 8} 8 7
{'word': 'London', 'score': 0.9996890425682068, 'entity': 'I-LOC', 'index': 12} 12 8
{'word': 'Charles', 'score': 0.9985857605934143, 'entity': 'I-PER', 'index': 16} 16 12
{'word': 'de', 'score': 0.9957852959632874, 'entity': 'I-PER', 'index': 17} 17 16
{'word': 'G', 'score': 0.9800652265548706, 'entity': 'I-PER', 'index': 18} 18 17
{'word': '##aul', 'score': 0.9558916091918945, 'entity': 'I-PER', 'index': 19} 19 18


In [112]:
entity_list

[[], ['New', 'York', 'City'], ['London'], ['Charles', 'de', 'G', '##aul']]

In [110]:
s = entity_list[3]
ent = ""
for w in s:
    if "#" in w:
        ent = ent[:len(ent) - 1]
        ent += w.replace("#", "")
    else:
        ent += f"{w} "

In [111]:
ent

'Charles de Gaul'

In [43]:
row = raw_review_df.iloc[3]
doc = nlp(row["review_content"])

In [45]:
print(doc)
for e in doc.ents:
    print(e.text, e.label_)

Whether audiences will get behind The Lightning Thief is hard to predict. Overall, it's an entertaining introduction to a promising new world -- but will the consuming shadow of Potter be too big to break free of?
The Lightning Thief WORK_OF_ART
Potter GPE


In [53]:
for i in range(1000,2000):
    row = raw_review_df.iloc[i]
    entity_id = row["rotten_tomatoes_link"]
    doc = nlp(row["review_content"])
    t = RelationExtractor(entity_id, doc)
    if len(t.relations) > 0:
        print(i, t.relations)

1017 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Tristan')]
1018 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', "Benicio del Toro's")]
1021 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Romeo')]
1035 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Tristan'), ('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Hollywood')]
1040 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Prince Valiant')]
1045 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Kingdom')]
1057 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Western Ireland'), ('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'the Czech Republic')]
1058 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'England')]
1059 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Tristan')]
1062 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Britain')]
1065 [('m/10004209-tristan_and_isolde', 'hasLocationTheme', 'Tristan')]
1071 [('m/10004209-tristan_and_isolde',

In [26]:
t.relations

[('m/10000_bc', 'hasLocationTheme', 'B.C.')]

In [40]:
for i, token in enumerate(doc.noun_chunks):
    print(token.text, "***")
    for j in token.subtree:
        print(j.text)

It ***
It
a fantasy adventure ***
a
fantasy
adventure
that
fuses
Greek
mythology
to
contemporary
American
places
and
values
Greek mythology ***
Greek
mythology
contemporary American places ***
contemporary
American
places
and
values
values ***
values


In [59]:
def generate_theme_df(movie_data):
    theme_df = movie_data[["rotten_tomatoes_link", "Title", "genre", "Plot"]] \
        .drop_duplicates().reset_index(drop=True)
    genres = theme_df["genre"].unique()
    theme_df["indicator"] = 1
    theme_pivot = theme_df \
        .pivot(index="rotten_tomatoes_link", columns="genre", values="indicator") \
        .fillna(0).reset_index()
    theme_df = theme_df[["rotten_tomatoes_link", "Title", "Plot"]] \
        .drop_duplicates().merge(theme_pivot, on="rotten_tomatoes_link") \
        .reset_index(drop=True)
    return theme_df


vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,3))
clf = ExtraTreesClassifier()
X = vectorizer.fit_transform(theme_df["Plot"])
clf.fit(X, theme_df["Comedy"])
importances = clf.feature_importances_
names = vectorizer.get_feature_names()

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)