In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Location of the IMDB movie CSV, relative to the notebook's working directory
file_path = './IMDB-Movie-Data.csv'

# Read the full CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows to check the columns and their contents
df.head(n=5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [7]:
def transform_to_kg(df):
    """Turn the movie table into knowledge-graph triples.

    Each row contributes triples for its genres, director, actors, release
    year and description. Director, actor, year and description facts are
    written twice: once starting from the movie and once starting from the
    attribute, with the two entities swapped.

    Args:
        df: DataFrame with 'Title', 'Genre', 'Director', 'Actors', 'Year' and
            'Description' columns. 'Genre' and 'Actors' hold comma-separated
            strings.

    Returns:
        DataFrame with columns 'entity_1', 'relationship', 'entity_2', one
        row per triple, emitted in the row order of ``df``.
    """
    kg_data = []

    for _, row in df.iterrows():
        title = row['Title']
        director = row['Director']
        year = str(row['Year'])  # years are stored as string entities
        description = row['Description']

        # Genres: one pair of triples per comma-separated genre
        for genre in row['Genre'].split(','):
            genre = genre.strip()
            kg_data.append([title, 'genre', genre])
            # NOTE(review): unlike the other second triples, this one keeps
            # the movie as entity_1; left as-is -- confirm whether
            # [genre, 'movie_in_genre', title] was intended.
            kg_data.append([title, 'movie_in_genre', genre])

        # Director: the inverse triple starts from the director, matching the
        # swapped layout used for actors, year and description below
        kg_data.append([title, 'directed_by', director])
        kg_data.append([director, 'directed', title])

        # Actors: one pair of triples per comma-separated actor
        for actor in row['Actors'].split(','):
            actor = actor.strip()
            kg_data.append([title, 'acted_by', actor])
            kg_data.append([actor, 'acted_in', title])

        # Release year
        kg_data.append([title, 'released_in', year])
        kg_data.append([year, 'release', title])

        # Description
        kg_data.append([title, 'described_as', description])
        kg_data.append([description, 'description_of', title])

        # Other attributes can be added similarly

    return pd.DataFrame(kg_data, columns=['entity_1', 'relationship', 'entity_2'])

# Build the triple-form knowledge graph (entity_1, relationship, entity_2) from the movie table
kg_df = transform_to_kg(df)


In [8]:
import json

In [9]:
# Fit label encoders for the graph's entities and relations, save the mappings as JSON, and preview a subset of them

def create_and_save_label_encodings(kg_df, entity_filename, relation_filename):
    """Fit label encoders for entities and relations and save both lookups as JSON.

    Requires ``LabelEncoder`` (sklearn.preprocessing) to be imported in the
    notebook's import cell.

    Args:
        kg_df: DataFrame with 'entity_1', 'relationship' and 'entity_2' columns.
        entity_filename: Path the entity -> id JSON mapping is written to.
        relation_filename: Path the relation -> id JSON mapping is written to.

    Returns:
        Tuple ``(entity_encoder, relation_encoder, entity_mapping,
        relation_mapping)``: the two fitted encoders and the two dicts that
        were written to disk. Mapping values are ids rendered as strings.
    """
    # An entity can appear as head or tail, so both columns share one vocabulary
    entity_values = pd.concat([kg_df['entity_1'], kg_df['entity_2']]).unique()
    relation_values = kg_df['relationship'].unique()

    entity_encoder = LabelEncoder()
    relation_encoder = LabelEncoder()
    entity_encoder.fit(entity_values)
    relation_encoder.fit(relation_values)

    # A label's id is its position in classes_, so enumerate() gives the same
    # ids as transform(classes_) without a second encoding pass. Ids are kept
    # as strings, as in the saved JSON files.
    entity_mapping = {label: str(idx) for idx, label in enumerate(entity_encoder.classes_)}
    relation_mapping = {label: str(idx) for idx, label in enumerate(relation_encoder.classes_)}

    # Save mappings as JSON
    with open(entity_filename, 'w') as file:
        json.dump(entity_mapping, file)
    with open(relation_filename, 'w') as file:
        json.dump(relation_mapping, file)

    return entity_encoder, relation_encoder, entity_mapping, relation_mapping

# Output locations for the two JSON lookup tables
entity_file = './entity.json'
relation_file = './relation.json'

# Fit the encoders and write both mappings to disk
(
    entity_encoder,
    relation_encoder,
    entity_mapping,
    relation_mapping,
) = create_and_save_label_encodings(kg_df, entity_file, relation_file)

# Preview: the first five entity entries next to the complete relation mapping
# (the entity mapping is too large to show in full)
(list(entity_mapping.items())[:5], list(relation_mapping.items()))


([('"21" is the fact-based story about six MIT students who were trained to become experts in card counting and subsequently took Vegas casinos for millions in winnings.',
   '0'),
  ('"Mr. Church" tells the story of a unique friendship that develops when a little girl and her dying mother retain the services of a talented cook - Henry Joseph Church. What begins as a six month arrangement instead spans into fifteen years and creates a family bond that lasts forever.',
   '1'),
  ('"The Thinning" takes place in a post-apocalyptic future where population control is dictated by a high-school aptitude test. When two students (Logan Paul and Peyton List) discover the test... See full summary »',
   '2'),
  ("#1 NASCAR driver Ricky Bobby stays atop the heap thanks to a pact with his best friend and teammate, Cal Naughton, Jr. But when a French Formula One driver, makes his way up the ladder, Ricky Bobby's talent and devotion are put to the test.",
   '3'),
  ('(500) Days of Summer', '4')],
 

In [10]:
# Transform the knowledge graph to use unique IDs with the updated dataset
# Re-express every triple through the fitted encoders so each entity and
# relation becomes an integer ID; kg_df itself is left untouched
kg_id_df = kg_df.assign(
    entity_1=entity_encoder.transform(kg_df['entity_1']),
    relationship=relation_encoder.transform(kg_df['relationship']),
    entity_2=entity_encoder.transform(kg_df['entity_2']),
)

# Destinations for the readable graph and its ID-encoded counterpart
kg_main_file = './kg_main.csv'
kg_id_file = './kg_id.csv'

# Write both graphs without the positional index column
kg_df.to_csv(kg_main_file, index=False)
kg_id_df.to_csv(kg_id_file, index=False)

# Preview the ID-based graph together with the paths of every file produced
(kg_id_df.head(), kg_main_file, kg_id_file, entity_file, relation_file)

(   entity_1  relationship  entity_2
 0      1791             6       405
 1      1791             7       405
 2      1791             6       431
 3      1791             7       431
 4      1791             6      3615,
 './kg_main.csv',
 './kg_id.csv',
 './entity.json',
 './relation.json')