In [1]:
import os

import numpy as np
import pandas as pd
import spacy
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy import displacy

from relation_extractor import RelationExtractor
from nlp import nlp, ner, process_ner_output

In [2]:
# Load the raw movie table from the raw_data directory (path is relative to the notebook's working directory).
movie_data = pd.read_csv("raw_data/movie_data.csv")

In [3]:
# Some rotten_tomatoes_link values occur on several rows because the same
# movie is listed once per origin.  For every link that occurs more than once
# we drop the rows whose origin is not "American", so that the link can serve
# as the primary key of the table.
# NOTE(review): a repeated link that has no "American" row loses all of its
# rows, and a link with several "American" rows stays duplicated.
link_counts = movie_data.groupby("rotten_tomatoes_link").count()
has_repeats = (link_counts > 1).any(axis=1)
duplicates = link_counts.loc[has_repeats].index

is_repeated_link = movie_data["rotten_tomatoes_link"].isin(duplicates)
is_not_american = movie_data["Origin/Ethnicity"] != "American"
dup_indices = movie_data.loc[is_repeated_link & is_not_american].index

keep_row = ~movie_data.index.isin(dup_indices)
movie_data = movie_data.loc[keep_row].reset_index(drop=True)

In [4]:
def explode_column(input_df, column_name, new_column_name):
    """
    Expand a comma-separated string column into one row per value.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, never modified.
    column_name : str
        Column whose values are comma-separated strings.
    new_column_name : str
        Name of the column that receives the individual values.

    Returns
    -------
    pandas.DataFrame
        A copy of ``input_df`` with ``new_column_name`` added.  Each input
        row is repeated once per value, so the index contains repeats.

    Notes
    -----
    Values are cast to ``str`` before splitting, so a missing value ends up
    as the literal string ``"nan"``.
    """
    df = input_df.copy(deep=True)
    exploded = (
        df[column_name]
        .astype(str)
        .str.split(",")
        .explode()
        # "a, b" splits into ["a", " b"]; without stripping, " b" and "b"
        # would be treated as two different values downstream.
        .str.strip()
        .rename(new_column_name)
        .to_frame()
    )
    return df.merge(exploded, left_index=True, right_index=True)

In [5]:
# Source column (comma-separated values) -> name of the single-value column.
explode_columns = {
    "genres": "genre",
    "directors": "director",
    "authors": "author",
    "actors": "actor"
}

# Explode every column against the same, not-yet-expanded movie_data ...
exploded_dfs = {
    source: explode_column(movie_data, source, target)
    for source, target in explode_columns.items()
}
# ... then join the single-value columns back one at a time.  The joins are on
# the (repeating) index, so each movie ends up with every combination of
# genre x director x author x actor.
for c, single_value_name in explode_columns.items():
    single_value_column = exploded_dfs[c][[single_value_name]]
    movie_data = movie_data.merge(
        single_value_column, left_index=True, right_index=True
    )

In [6]:
# movie_data column name -> relation label used in the exported tables.
relation_dict = {
    "rotten_tomatoes_link": "entity_id",
    "Title": "has title",
    "director": "directed by",
    "author": "authored by",
    "actor": "featured actor",
    "genre": "has genre",
    "Release Year": "released on",
    "production_company": "produced by",
}
# Relation labels in the same order as the mapping above.
relations = list(relation_dict.values())

In [7]:
# Wide table: one row per distinct combination of the relation columns.
tabular_df = (
    movie_data.rename(columns=relation_dict)[relations]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Long table of (entity_id, relation, value) triples.  The melted columns are
# derived from `relations` so they cannot drift out of sync with
# relation_dict; "entity_id" is the identifier, not a relation.
value_relations = [rel for rel in relations if rel != "entity_id"]
known_df = (
    tabular_df.melt(id_vars="entity_id", value_vars=value_relations)
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"variable": "relation"})
)
known_df.to_csv("data/known_df.csv", index=False)

In [39]:
# Write the wide relation table to disk; index=False leaves the positional index out of the file.
tabular_df.to_csv("data/tabular_df.csv", index=False)

In [5]:
# Load the critic reviews and keep only rows that actually carry review text
# (a missing review_content stringifies to 'nan').
raw_review_df = pd.read_csv("raw_data/rotten_tomatoes_critic_reviews.csv")
has_review_text = raw_review_df["review_content"].astype(str) != 'nan'
raw_review_df = raw_review_df.loc[has_review_text].reset_index(drop=True)
# Written with the default index column, unlike the other exports in this notebook.
raw_review_df.to_csv("data/raw_review_df.csv")

# Slim two-column view of the reviews: (entity_id, text).
review_columns = {"rotten_tomatoes_link": "entity_id", "review_content": "text"}
review_df = raw_review_df[list(review_columns)].rename(columns=review_columns)
review_df.to_csv("data/review_df.csv", index=False)

In [2]:
# Reload the (entity_id, text) review table written by the review-preparation cell.
review_df = pd.read_csv("data/review_df.csv")

In [59]:
def generate_theme_df(movie_data):
    """
    Build a per-movie table with one 0/1 indicator column per genre.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Must contain the columns "rotten_tomatoes_link", "Title", "genre"
        (one genre per row) and "Plot".

    Returns
    -------
    pandas.DataFrame
        Columns "rotten_tomatoes_link", "Title", "Plot", followed by one
        column per distinct genre holding 1 when the movie has that genre
        and 0 otherwise.
    """
    id_column = "rotten_tomatoes_link"
    theme_df = movie_data[[id_column, "Title", "genre", "Plot"]] \
        .drop_duplicates().reset_index(drop=True)

    # pivot() raises on repeated (link, genre) pairs, which appear when one
    # link carries several Title/Plot variants, so reduce to distinct pairs
    # before pivoting.
    indicators = theme_df[[id_column, "genre"]] \
        .drop_duplicates().assign(indicator=1)
    theme_pivot = indicators \
        .pivot(index=id_column, columns="genre", values="indicator") \
        .fillna(0).reset_index()

    return theme_df[[id_column, "Title", "Plot"]] \
        .drop_duplicates().merge(theme_pivot, on=id_column) \
        .reset_index(drop=True)


# Fit an extra-trees classifier on TF-IDF n-grams (1 to 3 words) of the plot
# text to see which n-grams matter for the "Comedy" indicator.
# theme_df was never created in this notebook, so build it here.
# NOTE(review): assumes movie_data already has the exploded "genre" column - confirm.
theme_df = generate_theme_df(movie_data)

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,3))
clf = ExtraTreesClassifier()
X = vectorizer.fit_transform(theme_df["Plot"])
clf.fit(X, theme_df["Comedy"])
importances = clf.feature_importances_
# get_feature_names() was removed from newer scikit-learn releases; use its
# replacement when available and fall back to the old name otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    names = vectorizer.get_feature_names_out()
else:
    names = vectorizer.get_feature_names()

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [2]:
# Reload the (entity_id, text) review table from disk.
# NOTE(review): same statement as an earlier cell; one of the two is probably redundant.
review_df = pd.read_csv("data/review_df.csv")

In [3]:
# Pick one review at random.  randint's upper bound is exclusive, so the
# lower bound has to be 0 for the first row to be eligible (and for a
# one-row frame to work at all).
i = np.random.randint(0, len(review_df))
sampled_review = review_df.iloc[i]
text = sampled_review["text"]
entity_id = sampled_review["entity_id"]
print(text)

For those of us who love Cinema Paradiso, the news that its Italian director Giuseppe Tornatore and his musical soulmate Ennio Morricone were getting back together to do another coming-of-age film, Malena, gave us tantalizing hope. Malena fa


In [2]:
# Fixed sample review (same text as the randomly drawn review printed above) so the cells below are repeatable.
text = "For those of us who love Cinema Paradiso, the news that its Italian director Giuseppe Tornatore and his musical soulmate Ennio Morricone were getting back together to do another coming-of-age film, Malena, gave us tantalizing hope. Malena fa"

In [15]:
# Run the imported `ner` callable on the sample review and pass its result through process_ner_output.
# NOTE(review): the literal below repeats the value of `text`.
process_ner_output(ner("For those of us who love Cinema Paradiso, the news that its Italian director Giuseppe Tornatore and his musical soulmate Ennio Morricone were getting back together to do another coming-of-age film, Malena, gave us tantalizing hope. Malena fa"))

[('Cinema Paradiso', 'MISC'),
 ('Italian', 'MISC'),
 ('Giuseppe Tornatore', 'PER'),
 ('Ennio Morricone', 'PER'),
 ('Malena', 'MISC'),
 ('Malena', 'MISC')]

In [3]:
# Extract relations from the sample review; the literal "entity_id" stands in for a real movie id.
r = RelationExtractor("entity_id", text)
# Print each extracted relation on its own line.
for i in r.relations:
    print(i)

('entity_id', 'features', 'Cinema Paradiso')
('entity_id', 'features', 'Italian')
('entity_id', 'features', 'Giuseppe Tornatore')
('entity_id', 'features', 'Ennio Morricone')
('entity_id', 'features', 'Malena')
('entity_id', 'features', 'Malena')
('entity_id', 'has', 'cinema paradiso')
('entity_id', 'has', 'news')
('entity_id', 'has', 'italian director')
('entity_id', 'has', 'giuseppe tornatore')
('entity_id', 'has', 'musical soulmate')
('entity_id', 'has', 'ennio morricone')
('entity_id', 'has', 'age')
('entity_id', 'has', 'malena')
('entity_id', 'has', 'hope')
('entity_id', 'is', 'italian')
('entity_id', 'is', 'musical')
('entity_id', 'getting', 'back')
('entity_id', 'getting', 'together')


In [None]:
# Scratch cell: display the RelationExtractor instance (never executed, per its empty execution count).
r

In [10]:
"""
def negation_detector(token):
    head = token.head
    try:
        while head.pos_ != "VERB":
            head = head.head
        for token in head.children:
            if token.dep_ == "neg":
                return True
        return False
    except Exception:
        return False


def verb_extractor(token):
    head = token.head
    try:
        while head.pos_ != "VERB":
            if head == head.head:
                return None
            head = head.head
        return head.text
    except Exception:
        return None
"""
def extract_ad_relations(entity_id, text):
    """
    Extract adjective and adverb relations from ``text``.

    Tokens for which ``negation_detector`` returns True are skipped.  Every
    remaining adjective yields ``(entity_id, "is", adjective)``; every
    remaining adverb yields ``(entity_id, verb, adverb)``, where ``verb`` is
    what ``verb_extractor`` returns for the adverb.  All emitted words are
    lower-cased.

    Parameters
    ----------
    entity_id : hashable
        Identifier placed first in every relation tuple.
    text : str
        Raw text, parsed with the module-level ``nlp`` pipeline.

    Returns
    -------
    list of tuple
        ``(entity_id, relation, value)`` triples in token order.
    """
    doc = nlp(text)
    relations = []
    for t in doc:
        if negation_detector(t):
            continue
        # Compare with ==: `x in ("ADJ")` is a substring test on a string,
        # which also matches an empty part-of-speech tag.
        if t.pos_ == "ADJ":
            relations.append((entity_id, "is", t.text.lower()))
        elif t.pos_ == "ADV":
            verb = verb_extractor(t)
            # Skip adverbs with no verb to attach to, instead of crashing on
            # `.lower()` when the extractor yields None.
            if verb is not None:
                relations.append((entity_id, verb.lower(), t.text.lower()))
    return relations

# Smoke test on the sample review; the first argument looks like a throwaway placeholder entity id.
extract_ad_relations("balls", text)

[('balls', 'is', 'italian'),
 ('balls', 'is', 'musical'),
 ('balls', 'getting', 'back'),
 ('balls', 'getting', 'together')]

In [9]:
def negation_detector(token):
    """
    Report whether the verb governing ``token`` carries a negation.

    Starting at ``token.head``, the dependency tree is climbed until a token
    tagged "VERB" is found; the result is True when one of that verb's
    children has the dependency label "neg".  False is returned when the
    climb reaches the root without meeting a verb, or when anything raises
    during the climb.
    """
    governor = token.head
    try:
        while governor.pos_ != "VERB":
            parent = governor.head
            # The root is its own head: no verb above this token.
            if governor == parent:
                return False
            governor = parent
        return any(child.dep_ == "neg" for child in governor.children)
    except Exception:
        return False


def verb_extractor(token):
    """
    Return the text of the nearest "VERB" ancestor of ``token``.

    The search starts at ``token.head`` and climbs the dependency tree.
    None is returned when the root is reached without finding a verb, or
    when anything raises during the climb.
    """
    ancestor = token.head
    try:
        while True:
            if ancestor.pos_ == "VERB":
                return ancestor.text
            # The root is its own head: nothing left to climb.
            if ancestor == ancestor.head:
                return None
            ancestor = ancestor.head
    except Exception:
        return None


def extract_nounchunk_relations(entity_id, text):
    """
    Extract ``(entity_id, "has", noun chunk)`` relations from ``text``.

    Each noun chunk whose root is not flagged by ``negation_detector`` is
    reduced to its non-stop-word tokens, lower-cased and joined with single
    spaces.  Chunks that consist only of stop words are skipped, so no
    relation with an empty value is emitted.

    Parameters
    ----------
    entity_id : hashable
        Identifier placed first in every relation tuple.
    text : str
        Raw text, parsed with the module-level ``nlp`` pipeline.

    Returns
    -------
    list of tuple
        ``(entity_id, "has", value)`` triples in chunk order.
    """
    doc = nlp(text)
    relations = []
    for chunk in doc.noun_chunks:
        if negation_detector(chunk.root):
            continue
        content = " ".join(t.text.lower() for t in chunk if not t.is_stop)
        # A chunk made only of stop words leaves nothing to report.
        if content:
            relations.append((entity_id, "has", content))
    return relations


def extract_ad_relations(entity_id, text):
    """
    Extract adjective and adverb relations from ``text``.

    Tokens for which ``negation_detector`` returns True are skipped.  Every
    remaining adjective yields ``(entity_id, "is", adjective)``; every
    remaining adverb yields ``(entity_id, verb, adverb)``, where ``verb`` is
    the text returned by ``verb_extractor`` for the adverb.  Adverbs for
    which no verb is found are skipped.  All emitted words are lower-cased.

    Parameters
    ----------
    entity_id : hashable
        Identifier placed first in every relation tuple.
    text : str
        Raw text, parsed with the module-level ``nlp`` pipeline.

    Returns
    -------
    list of tuple
        ``(entity_id, relation, value)`` triples in token order.
    """
    doc = nlp(text)
    relations = []
    for t in doc:
        if negation_detector(t):
            continue
        # Compare with ==: `x in ("ADJ")` is a substring test on a string,
        # which also matches an empty part-of-speech tag.
        if t.pos_ == "ADJ":
            relations.append((entity_id, "is", t.text.lower()))
        elif t.pos_ == "ADV":
            verb = verb_extractor(t)
            # verb_extractor returns None when no verb governs the token;
            # calling .lower() on that would raise AttributeError.
            if verb is not None:
                relations.append((entity_id, verb.lower(), t.text.lower()))
    return relations

In [3]:
# Doesn't really work.
class NounChunks:
    """
    Collect the noun chunks of a parsed document and label each with a relation.

    Attributes
    ----------
    doc
        The parsed document; only its ``noun_chunks`` are read.
    details_list : list of dict
        One ``{"root", "span", "text"}`` dict per noun chunk.
    roots : list
        Root token of every noun chunk.
    values : list of str
        Text of every noun chunk.
    spans : list of int
        Token positions covered by any noun chunk, concatenated.
    """

    def __init__(self, doc):
        self.doc = doc
        self.details_list = self.generate_details_list()
        self.roots = self.generate_roots()
        self.values = self.generate_values()
        self.spans = self.generate_spans()

    @staticmethod
    def extract_noun_chunk_details(noun_chunk):
        """Return the root token, covered token positions and text of one chunk."""
        return {
            "root": noun_chunk.root,
            "span": list(range(noun_chunk.start, noun_chunk.end)),
            "text": noun_chunk.text,
        }

    def generate_details_list(self):
        """Return the detail dict of every noun chunk in ``self.doc``."""
        # Read the instance's own document, not whatever global `doc`
        # happens to exist in the notebook session.
        return [
            self.extract_noun_chunk_details(chunk)
            for chunk in self.doc.noun_chunks
        ]

    def generate_roots(self):
        """Return the root token of every noun chunk."""
        return [details["root"] for details in self.details_list]

    def generate_spans(self):
        """Return the token positions of all noun chunks as one flat list."""
        spans = []
        for details in self.details_list:
            spans += details["span"]
        return spans

    def generate_values(self):
        """Return the text of every noun chunk."""
        return [details["text"] for details in self.details_list]

    def generate_relations(self, entity_id):
        """
        Build one ``(entity_id, relation, value)`` row per noun chunk.

        The relation is the text of the chunk root's head, or of that head's
        own head when the head lies inside some noun chunk.  When the chosen
        token is a preposition ("prep") attached to a verb, the verb is
        placed in front of it (e.g. "feels like").  Relations are lower-cased.

        Parameters
        ----------
        entity_id : hashable
            Value stored in the "entity_id" column of every row.

        Returns
        -------
        pandas.DataFrame
            Columns "entity_id", "relation" and "value".
        """
        relations = []
        for chunk in self.doc.noun_chunks:
            head = chunk.root.head
            # A head that sits inside a noun chunk is part of a noun phrase,
            # so step one level further up the tree.
            base_relation = head.head if head.i in self.spans else head
            relation = [base_relation.text]
            if base_relation.dep_ == "prep" and base_relation.head.pos_ == "VERB":
                relation.append(base_relation.head.text)
            relation.reverse()
            relations.append(" ".join(relation).lower())
        return pd.DataFrame({
            "entity_id": entity_id,
            "relation": relations,
            "value": self.values,
        })




In [10]:
# Parse one randomly chosen review and show the relations derived from its
# noun chunks.  randint's upper bound is exclusive, so the lower bound has to
# be 0 for the first row to be eligible.
i = np.random.randint(0, len(review_df))
sampled_review = review_df.iloc[i]
entity_id = sampled_review["entity_id"]
doc = nlp(sampled_review["text"])
nounchunks = NounChunks(doc)
relations_df = nounchunks.generate_relations(entity_id)
print(i)
print(doc.text)
print(relations_df)
# 808526

875527
If The Boxer doesn't quite score a knockout, that's because of such flaws as the too-sketchy development of the character of Maggie's son, who turns out to be pivotal. But the movie carries the day by aiming its strongest punches straight at the heart.
      entity_id    relation                        value
0   m/the_boxer       score                    The Boxer
1   m/the_boxer       score                   a knockout
2   m/the_boxer  's because                   such flaws
3   m/the_boxer          as  the too-sketchy development
4   m/the_boxer          of                the character
5   m/the_boxer          of                 Maggie's son
6   m/the_boxer       turns                          who
7   m/the_boxer     carries                    the movie
8   m/the_boxer     carries                      the day
9   m/the_boxer      aiming        its strongest punches
10  m/the_boxer   aiming at                    the heart


In [23]:
# Print, for every token of a test sentence, what negation_detector returns for it.
for t in nlp("the movie was bad but it definitely did not suck."):
    print(t.text, negation_detector(t))

the False
movie False
was False
bad False
but False
it True
definitely True
did True
not True
suck False
. True


In [19]:
def negation_detector(token):
    """
    Report whether the verb governing ``token`` carries a negation.

    Starting at ``token.head``, the dependency tree is climbed until a token
    tagged "VERB" is found; the result is True when one of that verb's
    children has the dependency label "neg".  False is returned when the
    climb reaches the root without meeting a verb, or when anything raises
    during the climb.
    """
    head = token.head
    try:
        while head.pos_ != "VERB":
            # The root is its own head; without this check a sentence whose
            # root is not a verb would loop forever.
            if head == head.head:
                return False
            head = head.head
        return any(child.dep_ == "neg" for child in head.children)
    except Exception:
        return False

In [136]:
# Wrap the current `doc` (assigned in another cell) in a NounChunks helper.
n = NounChunks(doc)    

In [138]:
# Build the relation table for that document under a placeholder entity id.
n.generate_relations("dumb_movie")

Unnamed: 0,entity_id,relation,value
0,dumb_movie,feels like,a relic
1,dumb_movie,feels like,"a sad, weak swan song"
2,dumb_movie,of,cinema's most legendary filmmakers


In [65]:
# Debug view: for each noun chunk print its text, the head of its root, that
# head's dependency label, and the head's own head.
# The loop variable is deliberately not `n`, which is bound to the NounChunks
# instance in an earlier cell and would otherwise be overwritten here.
for chunk in doc.noun_chunks:
    chunk_head = chunk.root.head
    print(chunk.text, chunk_head.text, chunk_head.dep_, chunk_head.head.text)


a relic like prep Feels
a sad, weak swan song relic pobj like
cinema's most legendary filmmakers of prep one


In [80]:
# Debug view: print every token of `doc` with its dependency label and part-of-speech tag.
for t in doc:
    print(t.text, t.dep_, t.pos_)


Feels ROOT VERB
like prep ADP
a det DET
relic pobj NOUN
, punct PUNCT
a det DET
sad amod ADJ
, punct PUNCT
weak amod ADJ
swan compound NOUN
song appos NOUN
from prep ADP
one pobj NUM
of prep ADP
cinema poss NOUN
's case PART
most advmod ADV
legendary amod ADJ
filmmakers pobj NOUN
. punct PUNCT
