In [1]:
import os
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from relation_extractor import RelationExtractor, negation_detector, value_smoother
from nlp import nlp, ner, process_ner_output

In [11]:
# Load the raw movie table. Later cells read its "rotten_tomatoes_link",
# "Origin/Ethnicity" and "Title" columns, among others.
movie_data = pd.read_csv("raw_data/movie_data.csv")

In [12]:
"""
I noticed that there were duplicate entries for certain
movies that were listed under more than one origin.  This
removes those duplicates so that rotten_tomatoes_link can be
used as the primary index of the data.
"""
# Links that occur on more than one row (rows without a link are ignored,
# because value_counts skips missing values).
link_counts = movie_data["rotten_tomatoes_link"].value_counts()
duplicates = link_counts[link_counts > 1].index

# Keep exactly ONE row per duplicated link, preferring an "American" entry.
# Dropping every non-American row instead would delete a movie entirely when
# none of its rows is American, and would leave duplicates behind when several
# rows are American - either way the link would not be a usable primary key.
#
# A stable sort on "is not American" (False sorts first) moves the preferred
# rows to the front while preserving the original order otherwise, so
# duplicated(keep="first") marks everything except the preferred row.
# Assumes movie_data has a unique index (true for a frame fresh from read_csv).
preferred_first = movie_data.loc[
    movie_data["Origin/Ethnicity"].ne("American").sort_values(kind="mergesort").index
]
redundant = (
    preferred_first["rotten_tomatoes_link"].isin(duplicates)
    & preferred_first["rotten_tomatoes_link"].duplicated(keep="first")
)
dup_indices = redundant[redundant].index

movie_data = movie_data.loc[~movie_data.index.isin(dup_indices)].reset_index(drop=True)

In [13]:
def explode_column(input_df, column_name, new_column_name):
    """
    Expand a comma-separated string column into one row per value.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, never modified.
    column_name : str
        Column whose values are strings such as "Action, Comedy".
    new_column_name : str
        Name of the added column that holds a single value per row.

    Returns
    -------
    pandas.DataFrame
        A copy of ``input_df`` with every original column kept and
        ``new_column_name`` added. A row whose source value contains k
        comma-separated items appears k times (sharing its original index).

    Notes
    -----
    Values are cast with ``astype(str)`` first, so a missing value becomes
    the literal string "nan".
    """
    df = input_df.copy(deep=True)
    exploded = (
        df[column_name].astype(str)
        .str.split(",")
        .explode()
        # "a, b".split(",") yields " b"; without stripping, the same value
        # would show up under two different spellings.
        .str.strip()
        .rename(new_column_name)
        .to_frame()
    )
    return df.merge(exploded, left_index=True, right_index=True)

In [14]:
# Comma-separated source column -> name of its one-value-per-row counterpart.
explode_columns = {
    "genres": "genre",
    "directors": "director",
    "authors": "author",
    "actors": "actor"
}

# Phase 1: explode every column against the same, still un-merged movie_data.
exploded_dfs = {
    source: explode_column(movie_data, source, target)
    for source, target in explode_columns.items()
}
# Phase 2: attach each exploded column by index. Because the merges are
# chained, a movie ends up with one row per combination of its values.
for source, target in explode_columns.items():
    movie_data = movie_data.merge(
        exploded_dfs[source][[target]], left_index=True, right_index=True
    )

In [15]:
# Column of movie_data -> relation label used in the knowledge table.
# The first entry maps the link column to the entity identifier.
relation_dict = {
    "rotten_tomatoes_link": "entity_id",
    "Title": "has title",
    "director": "directed by",
    "author": "authored by",
    "actor": "featured actor",
    "genre": "has genre",
    "Release Year": "released on",
    "production_company": "produced by",
}
# Relation labels in the same order as the mapping above.
relations = list(relation_dict.values())

In [16]:
# Wide table: one row per distinct combination of the relation columns.
tabular_df = (
    movie_data.rename(columns=relation_dict)[relations]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Long table of (entity_id, relation, value) triples built from the wide one.
value_columns = ['has title', 'directed by', 'authored by', 'featured actor',
                 'has genre', 'released on', 'produced by']
known_df = (
    tabular_df.melt(id_vars="entity_id", value_vars=value_columns)
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"variable": "relation"})
)
# Values are stringified before being passed through value_smoother.
known_df["value"] = known_df["value"].astype(str).apply(value_smoother)
known_df.to_csv("data/known_df.csv", index=False)

In [5]:
raw_review_df = pd.read_csv("raw_data/rotten_tomatoes_critic_reviews.csv")

# Keep only reviews that have text: a missing value stringifies to 'nan'.
has_text = raw_review_df["review_content"].astype(str) != 'nan'
raw_review_df = raw_review_df.loc[has_text].reset_index(drop=True)
# Note: this file is written WITH the index column, unlike the other exports.
raw_review_df.to_csv("data/raw_review_df.csv")

# Slim two-column view used by the text-processing cells.
review_columns = raw_review_df[["rotten_tomatoes_link", "review_content"]]
review_df = review_columns.rename(
    columns={"rotten_tomatoes_link": "entity_id", "review_content": "text"}
)
review_df.to_csv("data/review_df.csv", index=False)

In [17]:
raw_plot_df = pd.read_csv("raw_data/data/wiki_movie_plots_deduped.csv")

# Distinct (entity_id, Title) pairs used to attach an entity id to each plot.
title_lookup = (
    movie_data.rename(columns={"rotten_tomatoes_link": "entity_id"})[["entity_id", "Title"]]
    .drop_duplicates()
)
# NOTE(review): the join key is "Title" alone, so different films that share a
# title would be matched to each other's plots - confirm this is acceptable.
plot_df = (
    raw_plot_df.merge(title_lookup, on="Title")
    .rename(columns={"Plot": "text"})[["entity_id", "text"]]
)
plot_df.to_csv("data/plot_df.csv", index=False)

# Reviews and plots stacked into one (entity_id, text) corpus.
review_plot_df = pd.concat([review_df, plot_df]).reset_index(drop=True)
review_plot_df.to_csv("data/review_plot_df.csv", index=False)

In [4]:
# Reload the (entity_id, text) review table from the CSV that the
# review-preparation cell writes, so the cells below can start from the file.
review_df = pd.read_csv("data/review_df.csv")

In [3]:
# Pick one review at random to experiment with in the cells below.
# randint's upper bound is exclusive, so the valid positions are
# 0 .. len(review_df) - 1; starting at 1 would make row 0 unreachable.
i = np.random.randint(0, len(review_df))
sample_row = review_df.iloc[i]
text = sample_row["text"]
entity_id = sample_row["entity_id"]
print(i)
print(text)

34372
Our familiarity with the actors, and their comfort in this period setting, lend the piece an unexpected air of naturalism.


In [14]:
# For every noun chunk: the chunk, the negated result of negation_detector on
# its root, and each of its tokens in lower case.
for chunk in nlp(text).noun_chunks:
    print(chunk, not negation_detector(chunk.root))
    for token in chunk:
        print("*", token.text.lower())
print("____________________________________________________")
# Token-level view: text, dependency label, part of speech, stop-word flag.
for token in nlp(text):
    print(token.text, token.dep_, token.pos_, token.is_stop)

Fairbrass True
* fairbrass
both the drama True
* both
* the
* drama
scenes True
* scenes
he True
* he
parts True
* parts
Jason Statham movies True
* jason
* statham
* movies
his calling True
* his
* calling
____________________________________________________
Fairbrass nsubj NOUN False
moves ROOT VERB False
stiffly acomp VERB False
through prep ADP True
both preconj CCONJ True
the det DET True
drama pobj NOUN False
and cc CCONJ True
fight conj VERB False
scenes dobj NOUN False
- punct PUNCT False
as mark ADP True
if mark ADP True
he nsubj PRON True
were advcl VERB True
underwater acomp ADJ False
. punct PUNCT False
Supporting csubj VERB False
parts dobj NOUN False
in prep ADP True
Jason compound PROPN False
Statham compound PROPN False
movies pobj NOUN False
may aux VERB True
be ROOT VERB True
his poss DET True
calling attr NOUN False
, punct PUNCT False
instead advmod ADV False
. punct PUNCT False


In [4]:
# Run the project's RelationExtractor on the sampled text and print every
# relation it produced.
# NOTE(review): the first argument is the literal string "entity_id", not the
# `entity_id` variable set when sampling - presumably a placeholder; confirm.
r = RelationExtractor("entity_id", text)
for i in r.relations:
    print(i)

Our
familiarity
the
actors
their
comfort
this
period
the
piece
an
unexpected
air
naturalism
('entity_id', 'features the theme', 'our familiarity')
('entity_id', 'features the theme', 'the actors')
('entity_id', 'features the theme', 'their comfort')
('entity_id', 'features the theme', 'this period')
('entity_id', 'features the theme', 'the piece')
('entity_id', 'features the theme', 'an unexpected air')
('entity_id', 'features the theme', 'naturalism')


In [13]:
def extract_noun_relations(entity_id, text):
    """
    Build one "features the theme" triple per accepted noun chunk of ``text``.

    A chunk is accepted when ``negation_detector`` returns a falsy value for
    its root token. The value of each triple is the chunk's tokens,
    lower-cased and joined with single spaces.

    Returns a list of ``(entity_id, "features the theme", value)`` tuples in
    the order the chunks occur.
    """
    return [
        (entity_id, "features the theme", " ".join(token.text.lower() for token in chunk))
        for chunk in nlp(text).noun_chunks
        if not negation_detector(chunk.root)
    ]

extract_noun_relations("entity_id", text)

[('entity_id', 'features the theme', 'fairbrass'),
 ('entity_id', 'features the theme', 'both the drama'),
 ('entity_id', 'features the theme', 'scenes'),
 ('entity_id', 'features the theme', 'he'),
 ('entity_id', 'features the theme', 'parts'),
 ('entity_id', 'features the theme', 'jason statham movies'),
 ('entity_id', 'features the theme', 'his calling')]

In [26]:
text = ' "A lot of bad stuff'
# NOTE(review): this definition shadows the `value_smoother` imported from
# relation_extractor in the first cell - confirm which one is meant to be used.
def value_smoother(text):
    """
    Tidy a tokenizer-joined string.

    - removes the space on either side of a hyphen ("well - known" -> "well-known")
    - rejoins a detached possessive ("it' s" -> "it's")
    - drops a double quote when it is the only one in the string (unbalanced)
    - strips leading spaces

    Empty and all-space input is returned as "" rather than raising.
    """
    text = text.replace(" -", "-")
    text = text.replace("- ", "-")
    text = text.replace("' s", "'s")
    if text.count('"') == 1:
        text = text.replace('"', "")
    # lstrip is safe on an empty string, unlike indexing text[0] in a loop.
    return text.lstrip(" ")
value_smoother(text)

'A lot of bad stuff'

In [18]:
def find_composite_noun_chunk(noun_chunk):
    """
    Return the tokens of ``noun_chunk`` together with any chunk hanging off it.

    Every noun chunk of the same document is examined:
    - the chunk itself contributes its own tokens;
    - another chunk whose root's head's head is this chunk's root contributes
      its tokens plus the connecting head token.

    The collected tokens are returned as a list in document order.
    """
    doc = noun_chunk.doc
    member_positions = set()
    for candidate in doc.noun_chunks:
        if candidate == noun_chunk:
            member_positions.update(token.i for token in candidate)
        elif candidate.root.head.head == noun_chunk.root:
            member_positions.update(token.i for token in candidate)
            # The token linking the two chunks belongs to the composite too.
            member_positions.add(candidate.root.head.i)
    return [token for token in doc if token.i in member_positions]


# Print the composite chunk built around every noun chunk of `doc`.
# NOTE(review): `doc` is not assigned in any cell visible here; presumably it
# is nlp(...) of the sample text - confirm, or this fails on a fresh kernel.
for n in doc.noun_chunks:
    print(find_composite_noun_chunk(n))

[A, lot, of, bad, stuff]
[bad, stuff]


In [15]:
# Re-read the plot CSV. The same read is repeated as the first line of the
# following cell, so this cell is redundant when both are run.
raw_plot_df = pd.read_csv("raw_data/data/wiki_movie_plots_deduped.csv")

In [16]:
raw_plot_df = pd.read_csv("raw_data/data/wiki_movie_plots_deduped.csv")

# Distinct (entity_id, Title) pairs used to attach an entity id to each plot.
title_lookup = (
    movie_data.rename(columns={"rotten_tomatoes_link": "entity_id"})[["entity_id", "Title"]]
    .drop_duplicates()
)
# Join on Title, then keep only the (entity_id, text) pair.
plot_df = (
    raw_plot_df.merge(title_lookup, on="Title")
    .rename(columns={"Plot": "text"})[["entity_id", "text"]]
)

In [3]:
# Doesn't really work.
class NounChunks:
    """
    Snapshot of the noun chunks of a parsed document, plus a rough relation
    label for each chunk.

    Attributes (all built once in ``__init__`` from ``doc.noun_chunks``):
        doc: the parsed document that was passed in.
        details_list: one dict per chunk with keys "root", "span", "text".
        roots: the root token of every chunk.
        values: the text of every chunk.
        spans: flat list of every token position covered by any chunk.
    """

    def __init__(self, doc):
        self.doc = doc
        self.details_list = self.generate_details_list()
        self.roots = self.generate_roots()
        self.values = self.generate_values()
        self.spans = self.generate_spans()

    @staticmethod
    def extract_noun_chunk_details(noun_chunk):
        """Return the root, covered token positions and text of one chunk."""
        return {
            "root": noun_chunk.root,
            "span": list(range(noun_chunk.start, noun_chunk.end)),
            "text": noun_chunk.text,
        }

    def generate_details_list(self):
        """Collect the details dict of every noun chunk of ``self.doc``."""
        # Must read self.doc: relying on a notebook-level `doc` either fails
        # with NameError or silently describes a different document than the
        # one generate_relations iterates over.
        return [
            self.extract_noun_chunk_details(chunk)
            for chunk in self.doc.noun_chunks
        ]

    def generate_roots(self):
        """Root token of every chunk, in chunk order."""
        return [details["root"] for details in self.details_list]

    def generate_spans(self):
        """Token positions covered by any chunk, flattened into one list."""
        positions = []
        for details in self.details_list:
            positions.extend(details["span"])
        return positions

    def generate_values(self):
        """Text of every chunk, in chunk order."""
        return [details["text"] for details in self.details_list]

    def generate_relations(self, entity_id):
        """
        Guess a relation label for every noun chunk.

        The label is the text of the chunk root's head, or of that head's own
        head when the head lies inside a noun chunk. When the chosen token is
        a preposition attached to a verb, the verb is put in front
        ("moves through"). Labels are lower-cased.

        Returns a DataFrame with columns "entity_id", "relation" and "value"
        (the chunk text), one row per noun chunk.
        """
        covered = set(self.spans)
        relations = []
        for chunk in self.doc.noun_chunks:
            head = chunk.root.head
            base_relation = head.head if head.i in covered else head
            words = [base_relation.text]
            if base_relation.dep_ == "prep" and base_relation.head.pos_ == "VERB":
                words.append(base_relation.head.text)
            # The verb was appended last but has to be read first.
            words.reverse()
            relations.append(" ".join(words).lower())
        return pd.DataFrame(
            {
                "entity_id": entity_id,
                "relation": relations,
                "value": self.values,
            }
        )




In [59]:
def generate_theme_df(movie_data):
    """
    Build one row per movie with a 0/1 indicator column for every genre.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Must contain "rotten_tomatoes_link", "Title", "genre" and "Plot",
        with one row per (movie, genre) combination (other columns and
        repeated rows are ignored).

    Returns
    -------
    pandas.DataFrame
        Columns "rotten_tomatoes_link", "Title", "Plot", followed by one
        column per distinct genre holding 1 where the movie has that genre
        and 0 otherwise.

    Notes
    -----
    ``DataFrame.pivot`` rejects repeated (link, genre) pairs, which would
    presumably occur if one link carried two different titles or plots.
    """
    key = "rotten_tomatoes_link"
    genre_rows = (
        movie_data[[key, "Title", "genre", "Plot"]]
        .drop_duplicates()
        .reset_index(drop=True)
        # Constant that pivot spreads into the genre columns.
        .assign(indicator=1)
    )
    theme_pivot = (
        genre_rows.pivot(index=key, columns="genre", values="indicator")
        .fillna(0)
        .reset_index()
    )
    return (
        genre_rows[[key, "Title", "Plot"]]
        .drop_duplicates()
        .merge(theme_pivot, on=key)
        .reset_index(drop=True)
    )


# NOTE(review): TfidfVectorizer, ExtraTreesClassifier and a notebook-level
# `theme_df` are not defined in any cell visible here - presumably they come
# from scikit-learn imports and a generate_theme_df(movie_data) call; confirm.
#
# Vectorise the plots as TF-IDF over 1- to 3-grams (English stop words
# removed), fit a tree ensemble on the "Comedy" column, then pull out the
# per-feature importances and the matching feature names.
# NOTE(review): no random_state is passed, so results may differ between runs.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,3))
clf = ExtraTreesClassifier()
X = vectorizer.fit_transform(theme_df["Plot"])
clf.fit(X, theme_df["Comedy"])
importances = clf.feature_importances_
names = vectorizer.get_feature_names()

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)