In [2]:
import os
import numpy as np
import pandas as pd
from ner import ner, process_ner_output
from relation_extractor import RelationExtractor

In [33]:
# Load the raw movie table.  Later cells read its rotten_tomatoes_link,
# Origin/Ethnicity, genres, directors, authors and actors columns.
movie_data = pd.read_csv("raw_data/movie_data.csv")

In [34]:
"""
I noticed that there were duplicate entries for certain 
movies that were released in separate origins.  This 
removes those duplicates so we can use the rotten_tomatoes_link
as the primary index of the data.
"""
# Per-link count of non-null values in every other column; a link is
# treated as duplicated when any of those counts exceeds one.
link_counts = movie_data.groupby("rotten_tomatoes_link").count()
has_repeats = (link_counts > 1).any(axis=1)
duplicates = link_counts[has_repeats].index

# Rows to discard: they belong to a duplicated link and are not American.
# NOTE(review): a link whose repeated rows are all non-American loses every
# row, and repeated American rows are all kept -- confirm this is intended.
in_duplicated_link = movie_data["rotten_tomatoes_link"].isin(duplicates)
not_american = movie_data["Origin/Ethnicity"] != "American"
dup_indices = movie_data[in_duplicated_link & not_american].index

rows_to_keep = ~movie_data.index.isin(dup_indices)
movie_data = movie_data.loc[rows_to_keep].reset_index(drop=True)

In [35]:
def explode_column(input_df, column_name, new_column_name):
    """
    Expand a comma-separated string column into one row per value.

    The values of ``column_name`` are cast to ``str``, split on ",",
    stripped of surrounding whitespace and attached to the original rows
    as ``new_column_name``.  A row holding N comma-separated values is
    therefore repeated N times in the result (the original index label is
    repeated as well).  ``input_df`` itself is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column whose values are comma-separated strings.
    new_column_name : str
        Name of the column that receives the individual values.

    Returns
    -------
    pandas.DataFrame
        All columns of ``input_df`` plus ``new_column_name``.

    Notes
    -----
    Because of the ``str`` cast, a missing value becomes the literal
    string "nan" in the new column.
    """
    values = (
        input_df[column_name]
        .astype(str)
        .str.split(",")
        .explode()
        # "a, b" splits into "a" and " b"; without stripping, the same
        # value would show up as two different entities downstream.
        .str.strip()
        .rename(new_column_name)
    )
    # merge() returns a new frame, so no defensive copy of input_df is needed.
    return input_df.merge(values.to_frame(), left_index=True, right_index=True)

In [36]:
# Comma-separated source column -> name of the one-value-per-row column.
explode_columns = {
    "genres": "genre",
    "directors": "director",
    "authors": "author",
    "actors": "actor",
}

# Explode each source column against the same, not-yet-expanded movie_data.
exploded_dfs = {
    source: explode_column(movie_data, source, target)
    for source, target in explode_columns.items()
}

# Attach the exploded columns one at a time, matching rows on the index.
for source, target in explode_columns.items():
    single_column = exploded_dfs[source][[target]]
    movie_data = movie_data.merge(single_column, left_index=True, right_index=True)

In [37]:
# movie_data column name -> relation name used for that column.
relation_dict = {
    "rotten_tomatoes_link": "rtLink",
    "Title": "hasTitle",
    "director": "directedBy",
    "author": "authoredBy",
    "actor": "featuredActor",
    "genre": "hadGenre",
    "Release Year": "releasedOn",
    "production_company": "producedBy",
}
# Relation names in the same order as the mapping above.
relations = list(relation_dict.values())

In [38]:
# Wide table: one column per relation, one row per distinct combination.
# The second rename turns "hadGenre"/"rtLink" into the final column names.
tabular_df = (
    movie_data
    .rename(columns=relation_dict)[relations]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"hadGenre":"hasGenre", "rtLink":"entity_id"})
)

# Long table of known triples: entity_id, relation, value.
known_relation_columns = [
    'hasTitle', 'directedBy', 'authoredBy', 'featuredActor',
    'hasGenre', 'releasedOn', 'producedBy',
]
known_long = tabular_df.melt(id_vars="entity_id", value_vars=known_relation_columns)
known_df = (
    known_long
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"variable":"relation"})
)
known_df.to_csv("data/known_df.csv", index=False)

In [39]:
# Persist the wide relation table; a later cell reloads it from this path.
tabular_df.to_csv("data/tabular_df.csv", index=False)

In [5]:
# Load the critic reviews and keep only rows that have review text
# (a missing review_content becomes the string 'nan' after the str cast).
raw_review_df = pd.read_csv("raw_data/rotten_tomatoes_critic_reviews.csv")
has_review_text = raw_review_df["review_content"].astype(str).ne('nan')
raw_review_df = raw_review_df.loc[has_review_text].reset_index(drop=True)
# NOTE(review): unlike the other to_csv calls in this notebook, this one does
# not pass index=False -- confirm the extra index column is wanted.
raw_review_df.to_csv("data/raw_review_df.csv")

# Two-column view in the entity_id/text layout.
review_columns = {"rotten_tomatoes_link": "entity_id", "review_content": "text"}
review_df = raw_review_df[list(review_columns)].rename(columns=review_columns)
review_df.to_csv("data/review_df.csv", index=False)

In [6]:
# Reload the entity_id/text review table from the CSV written by the cell above.
review_df = pd.read_csv("data/review_df.csv")

In [None]:
def extract_ner_relation(entity_id, text):
    """
    Turn the named entities found in ``text`` into relation triples.

    ``text`` is passed through ``ner`` and then ``process_ner_output``.
    Every processed item yields one tuple
    ``(entity_id, "has<item[1]>Theme", item[0])``.  The tuples are
    returned as a list in the order the items were produced.
    """
    entities = process_ner_output(ner(text))
    return [
        (entity_id, f"has{entity[1]}Theme", entity[0])
        for entity in entities
    ]

In [32]:
# Ad-hoc check of the imported ner helper on two place names.
# NOTE(review): the recorded output below only lists word pieces for the
# first string -- confirm whether ner is meant to accept a list of strings.
ner(["georgia", "florida"])

[{'word': 'g', 'score': 0.7148700952529907, 'entity': 'I-LOC', 'index': 1},
 {'word': '##eor', 'score': 0.8759456276893616, 'entity': 'I-LOC', 'index': 2},
 {'word': '##gia', 'score': 0.6108313202857971, 'entity': 'I-LOC', 'index': 3}]

In [43]:
# Reload the wide relation table and rebuild the long table of known
# triples (entity_id, relation, value) from it.
tabular_df = (
    pd.read_csv("data/tabular_df.csv")
    .rename(columns={"hadGenre":"hasGenre", "rtLink":"entity_id"})
)

known_relation_columns = [
    'hasTitle', 'directedBy', 'authoredBy', 'featuredActor',
    'hasGenre', 'releasedOn', 'producedBy',
]
known_long = tabular_df.melt(id_vars="entity_id", value_vars=known_relation_columns)
known_df = (
    known_long
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"variable":"relation"})
)
known_df.to_csv("data/known_df.csv", index=False)

In [41]:
# Build the knowledge graph from the review text plus the known triples.
# KnowledgeGraphGenerator is defined in a later cell of this notebook, so that
# cell has to be executed first.  The constructor runs generate() right away,
# i.e. relation extraction over every review row; the recorded run below was
# stopped with a KeyboardInterrupt.
t = KnowledgeGraphGenerator(input_data_list=[review_df], known_data_list=[known_df])

KeyboardInterrupt: 

In [40]:
# Show the assembled triples (columns entity_id, relation, value).
t.knowledge_graph_df

Unnamed: 0,entity_id,relation,value
0,m/0814255,hasMISCTheme,Greek
1,m/0814255,hasMISCTheme,American
0,m/0814255,hasPERTheme,Uma Thurman
1,m/0814255,hasPERTheme,Medusa
0,m/0814255,hasMISCTheme,Harry Potter
...,...,...,...
302197,m/zoolander_2,producedBy,Paramount Pictures
302198,m/zoot_suit,producedBy,MCA Universal Home Video
302199,m/zootopia,producedBy,Walt Disney Animation Studios
302200,m/zorba_the_greek,producedBy,Fox


In [38]:
class KnowledgeGraphGenerator:
    """
    Assemble a knowledge graph table of (entity_id, relation, value) triples.

    Parameters
    ----------
    input_data_list : list of pandas.DataFrame, optional
        Frames with ``entity_id`` and ``text`` columns.  Relations are
        extracted from each row's text with ``RelationExtractor``.
    known_data_list : list of pandas.DataFrame, optional
        Frames of already-known triples with ``entity_id``, ``relation``
        and ``value`` columns.  They are appended after the extracted ones.

    Attributes
    ----------
    knowledge_graph_df : pandas.DataFrame
        Result of ``generate()``, computed when the object is created.
    """

    def __init__(self, input_data_list=None, known_data_list=None):
        self.input_data_list = input_data_list
        self.known_data_list = known_data_list
        # Built eagerly: constructing the object runs the full extraction.
        self.knowledge_graph_df = self.generate()

    def generate(self):
        """
        Generate the knowledge graph DataFrame.

        Every row of every frame in ``input_data_list`` is passed to
        ``RelationExtractor(entity_id, text)`` and its ``relations`` are
        turned into rows with columns entity_id, relation and value.  The
        frames in ``known_data_list`` are then appended unchanged.

        Returns
        -------
        pandas.DataFrame
            Extracted triples followed by known triples.  The columns
            entity_id, relation, value come first; any extra columns found
            in the known frames follow.  Index labels of the individual
            pieces are kept as they are, so they may repeat.  When there is
            nothing to combine, an empty frame with the three columns is
            returned.
        """
        columns = ["entity_id", "relation", "value"]
        # Collect the pieces and concatenate once at the end.  Growing a
        # DataFrame with pd.concat inside the row loop re-copies all the
        # accumulated rows on every iteration (quadratic in the row count).
        frames = []
        if self.input_data_list is not None:
            for df in self.input_data_list:
                # Iterate the two needed columns directly instead of
                # materialising a row Series with df.iloc[i] for each row.
                for entity_id, text in zip(df["entity_id"], df["text"]):
                    extractor = RelationExtractor(entity_id, text)
                    frames.append(pd.DataFrame(extractor.relations, columns=columns))
        if self.known_data_list is not None:
            frames.extend(self.known_data_list)

        if not frames:
            return pd.DataFrame(columns=columns)

        kg_df = pd.concat(frames)
        # Keep the three standard columns first, whatever the column order
        # of the first concatenated frame was.
        extra_columns = [name for name in kg_df.columns if name not in columns]
        return kg_df.reindex(columns=columns + extra_columns)





In [59]:
def generate_theme_df(movie_data):
    """
    Build a per-movie table with one indicator column per genre.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Frame with columns rotten_tomatoes_link, Title, genre and Plot,
        holding one row per movie/genre combination (other columns and
        repeated rows are ignored).

    Returns
    -------
    pandas.DataFrame
        Columns rotten_tomatoes_link, Title and Plot, followed by one
        column per distinct genre.  A genre column holds 1 where the
        movie has that genre and 0 otherwise.  There is one row per
        distinct (rotten_tomatoes_link, Title, Plot) combination.
    """
    link_col = "rotten_tomatoes_link"
    theme_df = movie_data[[link_col, "Title", "genre", "Plot"]] \
        .drop_duplicates().reset_index(drop=True)

    # pivot() raises ValueError when a (link, genre) pair occurs twice,
    # which happens when one link carries more than one Title/Plot
    # variant.  Reduce to unique pairs before pivoting.
    genre_pairs = theme_df[[link_col, "genre"]].drop_duplicates().assign(indicator=1)
    theme_pivot = genre_pairs \
        .pivot(index=link_col, columns="genre", values="indicator") \
        .fillna(0).reset_index()

    return theme_df[[link_col, "Title", "Plot"]] \
        .drop_duplicates().merge(theme_pivot, on=link_col) \
        .reset_index(drop=True)


# Fit an extra-trees classifier that predicts the "Comedy" indicator from
# TF-IDF features (1- to 3-grams, English stop words removed) of the plot
# text, then collect the feature importances and the matching n-gram names.
# NOTE(review): TfidfVectorizer and ExtraTreesClassifier are not imported in
# the import cell at the top of this notebook -- confirm where they come from.
# NOTE(review): no random_state is set, so importances vary between runs.

# theme_df was never assigned at notebook level; build it here so the cell
# does not depend on leftover kernel state.
theme_df = generate_theme_df(movie_data)

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,3))
clf = ExtraTreesClassifier()
X = vectorizer.fit_transform(theme_df["Plot"])
clf.fit(X, theme_df["Comedy"])
importances = clf.feature_importances_
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when present and fall back
# to the old one on earlier releases.  list() keeps `names` a list.
if hasattr(vectorizer, "get_feature_names_out"):
    names = list(vectorizer.get_feature_names_out())
else:
    names = vectorizer.get_feature_names()

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)