In [1]:
!pwd

/home/teang1995/codes/pytorch-autorec/explore


In [2]:
import os
import pandas as pd
import numpy as np

In [8]:
# Locate the data folder relative to the notebook's working directory rather
# than through a user-specific absolute path. This assumes the notebook runs
# from the project's `explore/` folder (as the `!pwd` cell output shows), in
# which case the resolved path is the same as the previously hard-coded one.
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data'))
ml1M_dir = os.path.join(data_dir, 'movielens-1M')

movie_path = os.path.join(ml1M_dir, 'movies.dat')
rating_path = os.path.join(ml1M_dir, 'ratings.dat')
user_path = os.path.join(ml1M_dir, 'users.dat')


def _read_dat(path, columns, **kwargs):
    """Read one headerless, '::'-delimited .dat file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the .dat file.
    columns : list of str
        Column names to assign, in file order.
    **kwargs
        Extra keyword arguments forwarded to ``pd.read_table``
        (for example ``dtype``).

    Returns
    -------
    pandas.DataFrame
        The parsed table with ``columns`` as its column labels.
    """
    return pd.read_table(path,
                         sep='::',          # multi-character separator ...
                         header=None,       # files carry no header row
                         names=columns,
                         engine='python',   # ... which requires the python engine
                         encoding='ISO-8859-1',
                         **kwargs)


# read movie file
movie_cols = ['movie_id', 'title', 'genres']
movies = _read_dat(movie_path, movie_cols)

# read rating file
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = _read_dat(rating_path, rating_cols)

# read user file
user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
# zip codes are labels, not quantities: read them as strings so they can never
# be coerced to integers (which would drop any leading zeros).
users = _read_dat(user_path, user_cols, dtype={'zip': str})



In [9]:
# Preview the first five rows of the movies table (head() defaults to 5 rows).
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# Preview the first five rows of the ratings table (head() defaults to 5 rows).
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
# Preview the first five rows of the users table (head() defaults to 5 rows).
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [12]:
# Row count of the movies table.
len_movies = movies.shape[0]
len_movies

3883

In [13]:
# Row count of the users table.
len_users = users.shape[0]
len_users

6040

In [16]:
# Orientation switch for the zero-filled interaction matrix:
# 'item' -> (movies, users); any other value -> (users, movies).
model_type = 'user'

# NOTE(review): the axis sized by len_movies has 3883 slots, while the pivot
# outputs recorded in this notebook show movie_id values running up to 3952 and
# only 3706 distinct rated movies -- raw movie_id values presumably cannot be
# used as positions on that axis without a remapping. Confirm before filling.
interactions = np.zeros(
    (len_movies, len_users) if model_type == 'item' else (len_users, len_movies)
)

interactions.shape

(6040, 3883)

### Rating matrix for I-AutoRec (movies as rows, users as columns)

In [25]:
# Item-by-user rating table: one row per movie_id, one column per user_id,
# with 0 written wherever a (movie, user) pair has no rating.
# pivot_table aggregates duplicates (mean by default); this presumably has no
# effect because each (user, movie) pair is rated once -- TODO confirm.
df = ratings.pivot_table(
    index=['movie_id'],
    columns='user_id',
    values=['rating'],
    fill_value=0,
)

In [26]:
# Display the pivoted table; pandas abbreviates the rendering of a frame this large.
df

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5,0,0,0,0,4,0,4,5,5,...,0,4,0,0,4,0,0,0,0,3
2,0,0,0,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,2,2,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0,0,0,0,0,0,0,0,3,4,...,0,0,0,0,0,0,0,0,0,0
3949,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3951,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Summarise index range, column count, dtypes and memory footprint of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3706 entries, 1 to 3952
Columns: 6040 entries, ('rating', 1) to ('rating', 6040)
dtypes: int64(6040)
memory usage: 170.8 MB


### Rating matrix for U-AutoRec (users as rows, movies as columns)

In [28]:
# User-by-item rating table: one row per user_id, one column per movie_id,
# with 0 written wherever a (user, movie) pair has no rating.
# NOTE(review): this rebinds `df`, replacing whatever it previously referred to.
# pivot_table aggregates duplicates (mean by default); this presumably has no
# effect because each (user, movie) pair is rated once -- TODO confirm.
df = ratings.pivot_table(
    index=['user_id'],
    columns='movie_id',
    values=['rating'],
    fill_value=0,
)

In [31]:
# Display the pivoted table; pandas abbreviates the rendering of a frame this large.
df

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,0,2,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Summarise index range, column count, dtypes and memory footprint of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6040 entries, 1 to 6040
Columns: 3706 entries, ('rating', 1) to ('rating', 3952)
dtypes: int64(3706)
memory usage: 170.8 MB
