# Requirements

In [4]:
%%capture
%pip install numpy -q
%pip install pandas -q
%pip install matplotlib -q
%pip install networkx -q
%pip install torch -q
%pip install torch_geometric -q
%pip install tqdm -q
%pip install scipy -q
%pip install scikit-learn -q

In [3]:
# Standard library imports
import random
import time

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
import torch_geometric
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Data exploration

### Unzip dataset

In [2]:
import zipfile

# Location of the downloaded archive and the folder it is unpacked into.
file = 'data/raw/ml-100k.zip'
target_dir = 'data/interim/'

# ZipFile opens in read mode by default; the context manager closes the
# archive handle once every member has been written below `target_dir`.
with zipfile.ZipFile(file) as archive:
    archive.extractall(path=target_dir)

print(f"Successfully extracted to {target_dir}")

Successfully extracted to data/interim/


### Load dataset

In [None]:
import os
import pandas as pd

def load_data(data_dir='data/interim/ml-100k/'):
    """Load the MovieLens 100K tables into pandas DataFrames.

    Parameters
    ----------
    data_dir : str, optional
        Folder containing the extracted ``u.user``, ``u.item``, ``u.data``,
        ``u.genre``, ``u.info`` and ``u.occupation`` files. Defaults to the
        folder this notebook has always read from.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(users, items, data, genre, info, occupation)``, in that order.
    """
    ml_100k_folder = data_dir

    user_file = 'u.user'
    item_file = 'u.item'
    data_file = 'u.data'
    genre_file = 'u.genre'
    info_file = 'u.info'
    occupation_file = 'u.occupation'

    # One 0/1 flag column per genre, in the order they appear in u.item.
    genre_flag_cols = [
        'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ]

    # column names
    user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    # u.item carries 5 metadata fields followed by the 19 genre flags. All 24
    # must be named: when fewer names than fields are given, pandas turns the
    # leading fields into an implicit index and the names end up labelling the
    # LAST columns, so 'movie_id'/'title' would point at genre flags.
    item_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + genre_flag_cols
    data_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

    # Load data into Pandas DataFrames
    users = pd.read_csv(os.path.join(ml_100k_folder, user_file), sep='|', names=user_cols)
    # Titles contain non-ASCII characters stored as latin-1.
    items = pd.read_csv(os.path.join(ml_100k_folder, item_file), sep='|', names=item_cols, encoding='latin-1')
    data = pd.read_csv(os.path.join(ml_100k_folder, data_file), sep='\t', names=data_cols)

    # u.genre rows are "<genre name>|<genre id>", so the name column comes first.
    genre = pd.read_csv(os.path.join(ml_100k_folder, genre_file), sep='|', header=None, names=['genre', 'genre_id'])
    # NOTE(review): u.info rows look like "<count> <label>"; with a single name
    # the count becomes the index and 'info' holds the label. Kept as-is so the
    # returned frame is unchanged for existing callers - confirm this is intended.
    info = pd.read_csv(os.path.join(ml_100k_folder, info_file), sep=' ', header=None, names=['info'])
    occupation = pd.read_csv(os.path.join(ml_100k_folder, occupation_file), header=None, names=['occupation'])

    return users, items, data, genre, info, occupation


users_df, items_df, ratings_df, genre_df, info_df, occupation_df = load_data()

# Sanity-check the load: show a heading followed by each table. The three
# large tables are previewed with head(); the small lookup tables are shown
# in full.
for heading, frame in (
    ("Users DataFrame:", users_df.head()),
    ("\nItems DataFrame:", items_df.head()),
    ("\nRatings DataFrame:", ratings_df.head()),
    ("\nGenre DataFrame:", genre_df),
    ("\nInfo DataFrame:", info_df),
    ("\nOccupation DataFrame:", occupation_df),
):
    print(heading)
    print(frame)

# Basic data preprocessing