#### Step 1: Load the Data Files

In [47]:
import os

import pandas as pd

def load_movielens_data(data_folder):
    """
    Load MovieLens data from a specified folder (either 'ml-100k' or 'ml-1m').

    The file layout is chosen from the *last component* of ``data_folder``:
    a folder named 'ml-100k' (e.g. 'ml-100k', 'data/ml-100k', 'ml-100k/') is
    read with the 100k layout (u.data / u.item / u.user / u.genre); any other
    folder is read with the ml-1m layout (ratings.dat / movies.dat / users.dat).

    Args:
        data_folder (str or os.PathLike): Path to the folder where the
            MovieLens data is located.

    Returns:
        tuple: ``(ratings, movies, users, genres)`` DataFrames. ``genres`` is
        ``None`` for the ml-1m layout, which has no separate genre file.
    """
    folder = str(data_folder)
    # Compare only the final path component so nested paths and trailing
    # separators still select the 100k layout.
    is_100k = os.path.basename(os.path.normpath(folder)) == 'ml-100k'

    rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']

    if is_100k:
        # Tab-separated ratings, pipe-separated metadata files.
        ratings = pd.read_csv(os.path.join(folder, 'u.data'), sep='\t', names=rating_columns)
        movies = pd.read_csv(
            os.path.join(folder, 'u.item'),
            sep='|',
            names=['item_id', 'movie_title', 'release_date', 'video_release_date',
                   'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                   'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
                   'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                   'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
            encoding='latin-1',
        )
        users = pd.read_csv(
            os.path.join(folder, 'u.user'),
            sep='|',
            names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
        )
        # Only the genre name (first column) is kept; the numeric id is dropped.
        genres = pd.read_csv(
            os.path.join(folder, 'u.genre'),
            sep='|',
            names=['genre', 'genre_id'],
            usecols=[0],
            encoding='latin-1',
        )
    else:
        # The '::' separator is multi-character, which requires the python engine.
        ratings = pd.read_csv(os.path.join(folder, 'ratings.dat'), sep='::',
                              names=rating_columns, engine='python')
        movies = pd.read_csv(os.path.join(folder, 'movies.dat'), sep='::',
                             names=['item_id', 'movie_title', 'genres'],
                             engine='python', encoding='latin-1')
        users = pd.read_csv(os.path.join(folder, 'users.dat'), sep='::',
                            names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
                            engine='python')
        genres = None

    return ratings, movies, users, genres


In [48]:
# Example usage: read the 100k variant into four frames.
ratings, movies, users, genres = load_movielens_data('ml-100k')

# Preview the top rows of each table to get a feel for its columns.
for frame in (ratings, movies, users):
    print(frame.head())

# The loader returns None for genres when the dataset has no genre file.
if genres is not None:
    print(genres.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
   item_id        movie_title release_date  video_release_date  \
0        1   Toy Story (1995)  01-Jan-1995                 NaN   
1        2   GoldenEye (1995)  01-Jan-1995                 NaN   
2        3  Four Rooms (1995)  01-Jan-1995                 NaN   
3        4  Get Shorty (1995)  01-Jan-1995                 NaN   
4        5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb_URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  h

#### Step 2: Construct the Knowledge Graph

In [49]:
import pandas as pd

def _age_group(age):
    """Map a numeric age to its age-bucket label; a missing age maps to 'unknown'."""
    # NaN fails every comparison below, so it must be handled first or it
    # would silently end up in the oldest bucket.
    if pd.isnull(age):
        return 'unknown'
    if age < 18:
        return '0-17'
    if age <= 30:
        return '18-30'
    if age <= 60:
        return '31-60'
    return '61+'


def _year_from_title(title):
    """Return the 4-digit year of a trailing '(YYYY)' in a title, or None if there is none."""
    if not isinstance(title, str):
        return None
    title = title.strip()  # tolerate stray whitespace after the closing parenthesis
    year = title[-5:-1]
    if len(year) == 4 and year.isdigit() and title[-6:-5] == '(' and title.endswith(')'):
        return year
    return None


def extract_kg_relationships_from_loaded_data(ratings, movies, users, genres=None):
    """
    Extract knowledge graph relationships from preloaded MovieLens data.

    Triples are emitted in this order: 'rated', 'belongs_to_genre',
    'belongs_to_age_group', 'has_occupation', 'has_gender', 'released_in'.

    Args:
        ratings (pd.DataFrame): User-item interactions; needs 'user_id' and 'item_id'.
        movies (pd.DataFrame): Movie metadata; needs 'item_id' and 'movie_title'.
            Genres are read from a pipe-separated 'genres' column when present,
            otherwise from one 0/1 column per name listed in ``genres``.
            The release year is read from 'release_date' when present,
            otherwise from a trailing '(YYYY)' in 'movie_title'.
        users (pd.DataFrame): User information; needs 'user_id', 'age',
            'occupation' and 'gender'.
        genres (pd.DataFrame, optional): Frame with a 'genre' column naming the
            one-hot genre columns of ``movies`` (only applicable for 'ml-100k').

    Returns:
        pd.DataFrame: The knowledge graph triples in columns (head, relation, tail).
    """
    kg_data = []

    # User-Movie Interactions (rated).
    # Iterating the two id columns directly avoids iterrows(), which is slow
    # and upcasts integer ids to float when the frame has any float column.
    kg_data.extend(
        (f'user_{user_id}', 'rated', f'movie_{item_id}')
        for user_id, item_id in zip(ratings['user_id'], ratings['item_id'])
    )

    # Movie-Genre Relationship (belongs_to_genre)
    if 'genres' in movies.columns:
        for item_id, genre_field in zip(movies['item_id'], movies['genres']):
            if pd.isnull(genre_field):
                continue  # no genre information for this movie
            for genre in genre_field.split('|'):
                kg_data.append((f'movie_{item_id}', 'belongs_to_genre', genre))
    elif genres is not None:
        genre_names = list(genres['genre'])
        # name=None yields plain tuples, so column names that are not valid
        # identifiers (e.g. "Children's") are not a problem.
        flag_rows = movies[genre_names].itertuples(index=False, name=None)
        for item_id, flags in zip(movies['item_id'], flag_rows):
            for genre, flag in zip(genre_names, flags):
                if flag == 1:
                    kg_data.append((f'movie_{item_id}', 'belongs_to_genre', genre))

    # User-Age Group Relationship (belongs_to_age_group)
    for user_id, age in zip(users['user_id'], users['age']):
        kg_data.append((f'user_{user_id}', 'belongs_to_age_group', _age_group(age)))

    # User-Occupation Relationship (has_occupation)
    for user_id, occupation in zip(users['user_id'], users['occupation']):
        kg_data.append((f'user_{user_id}', 'has_occupation', occupation))

    # User-Gender Relationship (has_gender)
    for user_id, gender in zip(users['user_id'], users['gender']):
        kg_data.append((f'user_{user_id}', 'has_gender', gender))

    # Movie-Release Year Relationship (released_in)
    if 'release_date' in movies.columns:
        for item_id, release_date in zip(movies['item_id'], movies['release_date']):
            if pd.notnull(release_date):
                # Dates look like '01-Jan-1995'; the year is the last dash-separated part.
                release_year = release_date.split('-')[-1]
                kg_data.append((f'movie_{item_id}', 'released_in', release_year))
    else:
        for item_id, title in zip(movies['item_id'], movies['movie_title']):
            release_year = _year_from_title(title)
            if release_year is not None:
                kg_data.append((f'movie_{item_id}', 'released_in', release_year))

    # Convert the list of triples into a DataFrame
    return pd.DataFrame(kg_data, columns=['head', 'relation', 'tail'])



In [50]:

# Example usage: build the (head, relation, tail) triple table from the
# ratings/movies/users/genres frames loaded earlier, then preview the first rows.
kg_df = extract_kg_relationships_from_loaded_data(ratings, movies, users, genres)
print(kg_df.head())

       head relation       tail
0  user_196    rated  movie_242
1  user_186    rated  movie_302
2   user_22    rated  movie_377
3  user_244    rated   movie_51
4  user_166    rated  movie_346


In [51]:
# (rows, columns) of the triple table as built, before any de-duplication.
kg_df.shape

(107403, 3)

In [52]:
# Drop rows that repeat an identical (head, relation, tail) triple.
# NOTE: this mutates kg_df in place; the surviving rows keep their original index labels.
kg_df.drop_duplicates(inplace = True)

In [53]:
# (rows, columns) after de-duplication; an unchanged row count means no triple was repeated.
kg_df.shape

(107403, 3)

In [55]:
# List the distinct relation labels present in the graph.
kg_df['relation'].unique()

array(['rated', 'belongs_to_genre', 'belongs_to_age_group',
       'has_occupation', 'has_gender', 'released_in'], dtype=object)

#### Step 3: Save the Knowledge Graph to CSV

In [54]:
# Export the knowledge graph DataFrame to a CSV file in the current working directory.
# The path is held in one variable so the confirmation message always names
# the file that was actually written.
kg_output_path = 'knowledge_graph_movies_100k.csv'
kg_df.to_csv(kg_output_path, index=False)

print(f"Knowledge graph saved to '{kg_output_path}'")

Knowledge graph saved to 'knowledge_graph.csv'
