In [None]:
# Build a lean version of the MovieLens-latest dataset

In [1]:
import os
import zipfile
import wget

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Fetch the MovieLens "ml-latest" archive and unpack it into the working directory.
# Each step is skipped when its result is already on disk, so re-running is cheap.
if not os.path.exists('./ml-latest.zip'):
    _ = wget.download('http://files.grouplens.org/datasets/movielens/ml-latest.zip')
if not os.path.isdir('./ml-latest'):
    # The context manager closes the archive even if extraction fails part-way.
    with zipfile.ZipFile('ml-latest.zip', 'r') as zip_ref:
        zip_ref.extractall('./')

100% [......................................................................] 277113433 / 277113433

In [4]:
# Folder that holds the extracted archive; the raw CSV files are read from here.
fpath = "./ml-latest/"

## Movies

In [5]:
# Load the raw movie table (movieId, title, genres) from the extracted folder.
movies_csv = os.path.join(fpath, 'movies.csv')
df_movie = pd.read_csv(movies_csv)

In [6]:
# Preview the first rows of the raw movie table.
df_movie.head(n=20)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


## Ratings

In [7]:
# Activity thresholds used by the rating filter below.
#   min_movie_rating: a movie is kept when it has MORE THAN this many ratings.
#   min_user_rating:  a user is kept when they rated AT LEAST this many movies.
min_movie_rating, min_user_rating = 50, 800

In [8]:
# Build the lean rating table in two passes:
#   1. keep only users who rated at least `min_user_rating` movies;
#   2. among the remaining ratings, keep only movies rated more than
#      `min_movie_rating` times.
# Movie popularity is counted AFTER inactive users are removed, so the order matters.
print('loading raw rating data..',end='')
rating_df = pd.read_csv(os.path.join(fpath,'ratings.csv'))
print('Done\n {} total rating record loaded\n'.format(rating_df.shape[0]))

#aggregated by user
# Named aggregation produces the flat columns ('userId', 'num_movie_rated') directly.
# The previous approach flattened a MultiIndex with set_levels(..., inplace=True);
# that `inplace` argument was deprecated in pandas 1.2 and removed in 2.0.
rating_by_user_df = rating_df.groupby('userId', as_index=False).agg(num_movie_rated=('movieId', 'count'))

print('{} users found'.format(rating_by_user_df.shape[0]))
print('Raw Rating information aggregated by user\n {}\n'.format(rating_by_user_df.describe()))

# Users are kept with an inclusive threshold (>=).
rating_by_user_df = rating_by_user_df[rating_by_user_df['num_movie_rated']>=min_user_rating].reset_index(drop=True)

print('{} users kept'.format(rating_by_user_df.shape[0]))
print('rating information\n {}\n'.format(rating_by_user_df.describe()))

rated_userIds = rating_by_user_df['userId'].unique()

#remove rating record of the deleted userId
rating_df = rating_df[rating_df['userId'].isin(rated_userIds)].reset_index(drop=True)
print('{} rating information kept after inactive userIds removed\n'.format(rating_df.shape[0]))

#aggregated by movie
# Same named-aggregation pattern: flat columns ('movieId', 'num_rating').
rating_by_movie_df = rating_df.groupby('movieId', as_index=False).agg(num_rating=('userId', 'count'))

# Number of distinct movies before the popularity filter is applied.
nMovie_raw = rating_by_movie_df.shape[0]

print('{} movies found'.format(rating_by_movie_df.shape[0]))
print('Raw rating information aggregated by movie\n {}\n'.format(rating_by_movie_df.describe()))

# Movies are kept with a strict threshold (>), unlike the inclusive user threshold above.
rating_by_movie_df = rating_by_movie_df[rating_by_movie_df['num_rating']>min_movie_rating].reset_index(drop=True)

rated_movieIds = rating_by_movie_df['movieId'].unique()

#remove rating record of the deleted movieId
rating_df = rating_df[rating_df['movieId'].isin(rated_movieIds)].reset_index(drop=True)
print('{} movies kept'.format(rating_by_movie_df.shape[0]))
print('{} rating information kept after rarely rated movies are removed\n'.format(rating_df.shape[0]))


loading raw rating data..Done
 27753444 total rating record loaded

283228 users found
Raw Rating information aggregated by user
               userId  num_movie_rated
count  283228.000000    283228.000000
mean   141614.500000        97.989761
std     81761.025358       212.760722
min         1.000000         1.000000
25%     70807.750000        15.000000
50%    141614.500000        30.000000
75%    212421.250000        95.000000
max    283228.000000     23715.000000

4380 users kept
rating information
               userId  num_movie_rated
count    4380.000000      4380.000000
mean   141220.451142      1303.782192
std     81310.354156       728.820895
min        81.000000       800.000000
25%     71052.750000       918.000000
50%    139743.000000      1102.500000
75%    210135.250000      1412.000000
max    283195.000000     23715.000000

5710566 rating information kept after inactive userIds removed

50407 movies found
Raw rating information aggregated by movie
              movieId 

In [12]:
# Dimensions of the lean rating matrix: one row per kept user / kept movie.
n_user_kept = len(rating_by_user_df)
n_movie_kept = len(rating_by_movie_df)

In [13]:
# Percentage of (user, movie) pairs that actually carry a rating.
n_rating_kept = len(rating_df)
print('rating matrix density = %8.2f%%' % (100*n_rating_kept/n_movie_kept/n_user_kept))

rating matrix density =    11.52%


In [14]:
# Re-number the kept ids as consecutive integers starting at 0,
# following the order of rated_userIds / rated_movieIds.
userId_old2new = {old_id: new_id for new_id, old_id in enumerate(rated_userIds)}
movieId_old2new = {old_id: new_id for new_id, old_id in enumerate(rated_movieIds)}

## Export Lean version of MovieLens latest dataset

In [15]:
# Output folder for the lean dataset. The path is defined once and reused;
# exist_ok=True makes the cell safe to re-run without a separate isdir() check.
flean_path = './ml-latest-lean'
os.makedirs(flean_path, exist_ok=True)

### export movies.csv

In [16]:
# Re-read the raw movie table, keep only the movies that passed the rating
# filter, then replace each original movieId with its new compact id.
movies_raw = pd.read_csv(os.path.join(fpath, 'movies.csv'))
is_kept_movie = movies_raw['movieId'].isin(rated_movieIds)
df_movie = movies_raw[is_kept_movie].reset_index(drop=True)
df_movie['movieId'] = df_movie['movieId'].apply(lambda old_id: movieId_old2new[old_id])

In [17]:
# Remove the trailing release year from each title, e.g. 'Toy Story (1995)' -> 'Toy Story'.
# The year is matched explicitly rather than cutting the last 7 characters of any title
# ending in ')': that cut also mangled titles whose final parenthetical is not a year,
# raised IndexError on an empty title, and removed more text each time the cell was re-run.
df_movie['title'] = (
    df_movie['title']
    .str.strip()
    .str.replace(r'\s*\(\d{4}\)$', '', regex=True)
)

In [21]:
# Write the lean movie table; the DataFrame index is not written as a column.
movies_out = os.path.join(flean_path, 'movies.csv')
df_movie.to_csv(movies_out, index=False)

### export links.csv

In [22]:
# Load the raw links table, keep only rows for kept movies,
# then replace each original movieId with its new compact id.
links_raw = pd.read_csv(os.path.join(fpath, 'links.csv'))
is_kept_link = links_raw['movieId'].isin(rated_movieIds)
df_links = links_raw[is_kept_link].reset_index(drop=True)
df_links['movieId'] = df_links['movieId'].apply(lambda old_id: movieId_old2new[old_id])

In [23]:
# Write the lean links table; the DataFrame index is not written as a column.
links_out = os.path.join(flean_path, 'links.csv')
df_links.to_csv(links_out, index=False)

### export tags.csv

In [25]:
# Load the raw tags table and keep only the tags whose movie AND user both passed
# the rating filter, then translate both id columns to the new compact ids.
tags_raw = pd.read_csv(os.path.join(fpath, 'tags.csv'))
has_kept_movie = tags_raw['movieId'].isin(rated_movieIds)
has_kept_user = tags_raw['userId'].isin(rated_userIds)
df_tags = tags_raw[has_kept_movie & has_kept_user].reset_index(drop=True)

df_tags['movieId'] = df_tags['movieId'].apply(lambda old_id: movieId_old2new[old_id])
df_tags['userId'] = df_tags['userId'].apply(lambda old_id: userId_old2new[old_id])

In [26]:
# Write the lean tags table as plain CSV; the DataFrame index is not written.
tags_csv_out = os.path.join(flean_path, 'tags.csv')
df_tags.to_csv(tags_csv_out, index=False)

In [27]:
# Also store the same tags table as a pickle next to the CSV.
tags_pickle_out = os.path.join(flean_path, 'tags.pickle')
df_tags.to_pickle(tags_pickle_out)

### export ratings.csv

In [28]:
# Translate the original ids in the rating table into the compact 0-based ids.
# NOTE(review): rating_df is overwritten in place, so running this cell a second time
# would look up already-translated ids in the old->new maps. Re-run from the cell that
# loads and filters the ratings instead.
new_movie_ids = rating_df['movieId'].apply(lambda old_id: movieId_old2new[old_id])
new_user_ids = rating_df['userId'].apply(lambda old_id: userId_old2new[old_id])
rating_df['movieId'] = new_movie_ids
rating_df['userId'] = new_user_ids

In [38]:
# Write the lean rating table as a gzip-compressed CSV without the index column.
# NOTE(review): the file name ends in '.zip' but the payload is gzip, so readers must
# pass compression='gzip' explicitly instead of relying on the file extension.
ratings_out = os.path.join(flean_path, 'ratings.zip')
rating_df.to_csv(ratings_out, index=False, compression='gzip')

### export readme.md

In [44]:
# Write a short readme describing how the lean dataset was built and how to load it.
# The wording mirrors the filter that was applied: users are kept with an inclusive
# threshold (>=) and movies with a strict one (>).
density_pct = 100*rating_df.shape[0]/n_movie_kept/n_user_kept
with open(os.path.join(flean_path,'readme.md'),'w', encoding='utf-8') as f:
    f.write('# Lean Version of MovieLens-latest dataset\n')
    # Adjacent string literals keep the paragraph free of the source-code indentation
    # that a triple-quoted string would copy into the file.
    f.write(
        'This movielens dataset is created based on the ml-latest dataset.\n'
        f'Only the users that gave >={min_user_rating} ratings and movies that received >{min_movie_rating} ratings are kept.\n'
        f'After cleaning, {rating_df.shape[0]} ratings for {n_movie_kept} movies by {n_user_kept} users are kept.\n'
        f'The rating matrix density is {density_pct:.2f}%.\n'
    )
    f.write('\n\n\n')
    f.write('## Load data\n')
    f.write('movies.csv, tags.csv and links.csv are saved in plain csv format and can be read directly.\n')
    f.write('ratings.zip is saved as a gzip csv file. It can be read by two methods:\n')
    f.write('* unzip the file and obtain a plain csv file, and read the csv file.\n')
    f.write('* use pandas.read_csv with the keyword compression set to be "gzip"\n')
    