In [21]:
import os
import numpy as np
import pandas as pd
import dask.dataframe as dd
from knowledge_graph_generator import KnowledgeGraphGenerator
from fusion import TransE, TransEFuser

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm


In [17]:
df = pd.read_csv("data/kg_count_year_df.csv")
df = df.loc[(df["year_released"] == 2017)&(df["arc_count"] >= 500)].reset_index(drop=True)
df = df[["entity_id", "relation", "value"]]
kg_obj = KnowledgeGraphGenerator(known_data_list=[df])
transe = TransE(kg_obj)

In [18]:
idx_array = transe.gen_training_array(df)

In [29]:
transe.loss_fn(transe.forward(torch.LongTensor(idx_array)))

tensor(1.2491, grad_fn=<MeanBackward0>)

In [20]:
transe.idx2embeds(idx_array)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not numpy.int64

In [91]:
def filter_arcs(input_full_df, min_arc_count=25):
    full_df = input_full_df.copy(deep=True)
    full_df["arc"] = full_df["relation"] + " " + full_df["value"]
    full_df["indicator"] = 1
    arc_df = full_df[["arc", "indicator"]].groupby("arc").count().reset_index()
    arc_df.rename(columns={"indicator":"arc_count"}, inplace=True)
    arc_df = arc_df.sort_values("arc_count", ascending=False).reset_index(drop=True)
    common_arc_df = arc_df.loc[arc_df["arc_count"] >= min_arc_count]
    common_arc_df = common_arc_df.merge(full_df, on="arc")
    common_arc_df = common_arc_df[list(input_full_df.columns) + ["arc_count"]] \
        .reset_index(drop=True)
    return common_arc_df

In [69]:
full_df = pd.read_csv("data/kg_df.csv")
known_df = pd.read_csv("data/known_df.csv")
year_df = known_df.loc[known_df["relation"] == "released on"].reset_index(drop=True)
year_df = year_df[["entity_id", "value"]].rename(columns={"value": "year_released"})
full_df = full_df.merge(year_df, on="entity_id").reset_index(drop=True)
small_kg_year_df = filter_arcs(full_df)
small_kg_year_df.to_csv("data/small_kg_year_df.csv")

In [92]:
t = filter_arcs(full_df, min_arc_count=1)

In [93]:
t.to_csv("data/kg_count_year_df.csv", index=False)

In [100]:
df = pd.read_csv("data/kg_df.csv")

In [95]:
t.loc[(t["year_released"] == '2017')&(t["arc_count"] >= 500)]

Unnamed: 0,entity_id,relation,value,year_released,arc_count
186,m/100_streets,is,good,2017,4848
764,m/222_2017,is,good,2017,4848
779,m/3_generations,is,good,2017,4848
784,m/47_meters_down,is,good,2017,4848
794,m/a_bad_moms_christmas,is,good,2017,4848
...,...,...,...,...,...
528431,m/the_glass_castle_2017,is,inevitable,2017,500
528444,m/the_mountain_between_us,is,inevitable,2017,500
528445,m/the_mummy_2017,is,inevitable,2017,500
528454,m/the_wall_2017,is,inevitable,2017,500


In [72]:
fuser = TransEFuser(embed_dim=3, max_epochs=2, n_splits=3)
toy_kg_df = fuser.fuse(toy_df[["entity_id", "relation", "value"]])

Resorted to Kfold splitter.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=273828.0), HTML(value='')))




KeyboardInterrupt: 

In [4]:
full_df.to_csv("kg_year_df.csv")

In [3]:
for year in np.sort(full_df["year_released"].unique())[::-1]:
    if year == '2017':
        df = full_df.loc[full_df["year_released"] == year].copy()
        fuser = TransEFuser(embed_dim=3, max_epochs=2, n_splits=3)
        kg_df = fuser.fuse(df[["entity_id", "relation", "value"]])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=103408.0), HTML(value='')))




KeyboardInterrupt: 

In [4]:
# Example of the TransE fusion method.
full_df = pd.read_csv("data/kg_df.csv")
entities = full_df["entity_id"].unique()
np.random.shuffle(entities)
df = kg_df.loc[kg_df["entity_id"].isin(entities[:10])]
fuser = TransEFuser(embed_dim=3, max_epochs=2, n_splits=3)
kg_df = fuser.fuse(kg_df)

Resorted to Kfold splitter.


KeyboardInterrupt: 

In [6]:
filtered_df = kg_df.loc[kg_df["probability"] > 0.75].reset_index(drop=True)
x = filtered_df[filtered_df["entity_id"] == "m/spiderman_2"]
for i in range(len(filtered_df[filtered_df["entity_id"] == "m/spiderman"])):
    r = x.iloc[i]
    print(r["relation"], r["value"], r["probability"])

96
features the theme these actors 0.8937830291416001
features the theme raging hormones who learning 0.956212739666196
features the theme the special effects 0.859557922791379
features the theme your jaw 0.9607189669955802
features the theme a good bit of harmless fun 0.7659165982224327
features the theme superhero movie 0.8005689801205209
features the theme the characters 0.8959785323063019
features the theme charisma 0.7859395262592798
features the theme an innocence 0.833366928861863
features the theme execution 0.8741530540379767
features the theme spiderman 0.9322083702765583
features the theme youth acquired a reputation as the superpower 0.9318223795897311
features the theme a whole movie about the hero 0.8751199524167534
features the theme an already announced sequel 0.9007807805174253
features the theme flicks 0.9381188582717788
features the theme the whole thing 0.9492898737282399
features the theme the spirit of its source material 0.802405313421382
features the theme top o

IndexError: single positional indexer is out-of-bounds

In [27]:
full_df = pd.read_csv("data/complete_df.csv")

In [28]:
full_df["arc"] = full_df["relation"] + " " + full_df["value"]
full_df["indicator"] = 1

In [29]:
full_df["arc"] = full_df["relation"] + " " + full_df["value"]
full_df["indicator"] = 1
full_df = full_df[["entity_id", "arc", "indicator"]].copy(deep=True)
arc_df = full_df[["arc", "indicator"]].groupby("arc").count().reset_index()
arc_df = arc_df.sort_values("indicator", ascending=False).reset_index(drop=True)
common_arc_df = arc_df.loc[arc_df["indicator"] > 25]

In [36]:
cluster_df = pd.DataFrame(columns=["entity_id"])
cluster_df["entity_id"] = full_df["entity_id"].unique()
arcs = common_arc_df["arc"].unique()
for arc in tqdm(arcs):
    entities = full_df.loc[full_df["arc"] == arc, "entity_id"].unique()
    cluster_df[arc] = 0
    cluster_df.loc[cluster_df["entity_id"].isin(entities), arc] = 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10499.0), HTML(value='')))




In [37]:
cluster_df.to_csv("data\cluster_df.csv")

In [None]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=9)
X = cluster_df.values
knn.fit(X)
distances, indices = knn.kneighbors(X)

In [34]:
entities = df["entity_id"].unique()
cluster_df = pd.DataFrame(columns=["entity_id", "arc", "indicator"])
for i in tqdm(range(0, len(entities), 100)):
    subset_df = df.loc[df["entity_id"].isin(entities[i:i+100])]
    new_df = subset_df.pivot(columns=["arc"], values=["indicator"])
    print(subset_df, new_df)
    cluster_df = pd.concat([cluster_df, new_df])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=177.0), HTML(value='')))

                   entity_id  \
0               m/+_one_2019   
1               m/+_one_2019   
2               m/+_one_2019   
3               m/+_one_2019   
4               m/+_one_2019   
...                      ...   
27211  m/10008507-deep_water   
27212  m/10008507-deep_water   
27213  m/10008507-deep_water   
27214  m/10008507-deep_water   
27215  m/10008507-deep_water   

                                                     arc  indicator  
0      features the miscellaneous theme Always Be My ...          1  
1         features the miscellaneous theme Four Weddings          1  
2      features the miscellaneous theme Four Weddings...          1  
3               features the miscellaneous theme Funeral          1  
4             features the miscellaneous theme Long Shot          1  
...                                                  ...        ...  
27211                                        is visceral          1  
27212                                       is watchabl

TypeError: Expected tuple, got str

In [38]:
full_df.pivot(columns=["relation"], )

Unnamed: 0_level_0,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator
arc,features the location 18th Century Britain,features the location 426,features the location 6,features the location A .,features the location Acheron,features the location Alabama,features the location Alamo,features the location Alpha Centauri,features the location Alt,features the location Alt-na-Shellach,...,is yellow,is young,is younger,is youthful,is yuletide,is zealous,is zemeckis,is zest,is zingy,is zippered
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27211,,,,,,,,,,,...,,,,,,,,,,
27212,,,,,,,,,,,...,,,,,,,,,,
27213,,,,,,,,,,,...,,,,,,,,,,
27214,,,,,,,,,,,...,,,,,,,,,,


In [53]:
df = pd.read_csv("data/known_df.csv")

In [54]:
df["arc"] = df["relation"] + " " + df["value"]
df["indicator"] = 1

In [55]:
df = df[["entity_id", "arc", "indicator"]].copy(deep=True)
df = df.groupby(["entity_id", "arc"]).count().reset_index()

In [56]:
df = df.drop_duplicates()

In [57]:
cluster_df = df.pivot(index=["entity_id"], columns=["arc"], values=["indicator"])
for c in cluster_df.columns:
    cluster_df[c] = cluster_df[c].fillna(0)

In [63]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=9)
X = cluster_df.values
knn.fit(X)
distances, indices = knn.kneighbors(X)

In [64]:
knn = NearestNeighbors(n_neighbors=9)
X = cluster_df.values
knn.fit(X)
distances, indices = knn.kneighbors(X)