In [1]:
# MovieLens negative sampling

# https://petamind.com/create-bipartite-graph-from-a-rating-matrix/

from sklearn.datasets import dump_svmlight_file
import numpy as np
import pandas as pd
import os
import urllib
import zipfile
from sklearn.model_selection import train_test_split
import shutil
import matplotlib.pyplot as plt

RANDOM = 2021

In [2]:
datasets = {
    'ml-100k': '/home/weiss/rs_data/ml-100k/',
    'ml-latest-small': '/home/weiss/rs_data/ml-latest-small/',
    'ml-latest': '/home/weiss/rs_data/ml-latest/'
}

print('Available datasets: ', [key for key in datasets])
#dt = input('Dataset name = ')
#dt='ml-latest-small'
dt='ml-100k'

print('You selected {}'.format(dt))

Available datasets:  ['ml-100k', 'ml-latest-small', 'ml-latest']
You selected ml-100k


In [3]:
# Check structure
def list_files(startpath):
    print(startpath)
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
dirs = [x[0] for x in os.walk(datasets[dt])]
ml = filter(lambda dirName: dirName if ('ml' in dirName) else '', list(dirs))
dt_dir_name= list(ml)[0]
print(list_files(dt_dir_name))

/home/weiss/rs_data/ml-100k/
/
    README
    u2.test
    u4.test
    u2.base
    allbut.pl
    u3.base
    u.data
    u.occupation
    ua.test
    u5.base
    u.genre
    u3.test
    ub.test
    mku.sh
    u.user
    u1.test
    ua.base
    u1.base
    u.item
    u5.test
    ub.base
    u4.base
    ml-100k.pkl
    u.info
processed/
    rX_val.csv
    rX_test_ns.csv
    rX_test.csv
    rX_train.csv
    rX_val_ns.csv
    rX_train_ns.csv
None


In [4]:
# Select ratings and tags
if dt=='ml-100k':
  ratings_data= pd.read_csv(dt_dir_name +'/'+ 'u.data', delimiter='\t', names=['userId', 'movieId', 'rating', 'timestamp'])
  user_data= pd.read_csv(dt_dir_name +'/'+ 'u.user', delimiter='|', names=['user id', 'age' ,'gender' ,'occupation' , 'zip code'])
  ratings_data.shape
elif dt=='ml-latest-small':
  ratings_data=pd.read_csv(dt_dir_name +'/'+ 'ratings.csv')
  tag_data=pd.read_csv(dt_dir_name +'/'+ 'tags.csv')
  ratings_data.shape
elif dt=='ml-latest':
  rating_data=pd.read_csv(dt_dir_name +'/'+ 'ratings.csv')
  tag_data=pd.read_csv(dt_dir_name +'/'+ 'tags.csv')
  rating_data.shape

In [5]:
# Check data
print(ratings_data.head())
print(ratings_data.info())
print('Ratings null?\n', ratings_data.isnull().any())
if dt == 'ml-100k':
    print(user_data.head())
    print('Users null?\n', user_data.isnull().any())
if dt == 'ml-latest-small' or dt == 'ml-latest':
    print(tag_data.head())
    print('Tags null?\n', tag_data.isnull().any())

   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       2  880606923
4     166      346       1  886397596
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   userId     100000 non-null  int64
 1   movieId    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
None
Ratings null?
 userId       False
movieId      False
rating       False
timestamp    False
dtype: bool
   user id  age gender  occupation zip code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
Users null?
 user i

In [6]:
# Create naive train, test, validation Split (80/10/10)
# for original ratings

train, test = train_test_split(ratings_data[['userId', 'movieId', 'rating']], test_size=0.2, random_state = RANDOM)
test, val = train_test_split(test, test_size=0.5, random_state = RANDOM)
print(train.shape, test.shape, val.shape)

(80000, 3) (10000, 3) (10000, 3)


In [7]:
def save_to_csv(data_frames, out_file_names, outdir=dt_dir_name+'processed/'):
    """
    Save Train, Test, Validation Split to csv files
    """
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    assert len(data_frames)==len(out_file_names), "number of dataframes must equal number of file names"
    for i in range(len(out_file_names)):
        data_frames[i].to_csv(outdir+out_file_names[i], header=None, index=False)

In [8]:
# Check negative sampling train dataset
print(train.head())

# Save train, test, validation to CSV
save_to_csv([train, test, val], ['train.csv', 'test.csv', 'val.csv'] )

       userId  movieId  rating
10700     269      401       3
38152      90      497       5
87439     925      447       4
52613     450      832       2
93091     901       88       5


In [None]:
# Negative Sampling
# Create implicit data (interaction or not)
import random
import time

def negative_sampling(r):
    """
    Create negative samples, i.e. user-movie combinations with no ratings
    
    parameters:
    input rating data -- pd dataframe: userId, movieId, rating
    
    returns:
    negative sampled set -- pd dataframe: userId, movieId, interact (implicit)
    """
    users = r['userId'].drop_duplicates()  # get unique userIds
    movies = r['movieId'].drop_duplicates()  # get unique movieIds
    print('# users', len(users))
    print('# movies', len(movies))
    negative_samples = r[['userId', 'movieId', 'timestamp']]  # starting point
    negative_samples['interact'] = negative_samples.apply(lambda row: 1, axis=1)
    negative_tmp_data = []  # list to craft interaction [userId, movieId, timestamp, interaction]
    start_time = time.time()
    stop_time = time.time()
    for i, row in r.iterrows():
        user = row['userId']  # current user
        if i%5000==0:
            stop_time = time.time()
            print('user', user, 'processed ... {0:0.2f}% after {1:0.2f} sec'.format(float(i)*100.0 / len(r), stop_time - start_time))
            start_time = stop_time
        j = 2  # amount of negative samples
        while j > 0:
            movie = movies.sample(n=1).values[0]  # get random movieId
            # insert negative samples if u-m relation does not exist
            if (not ((negative_samples['userId'] == user) & (negative_samples['movieId'] == movie)).any()):
                j -= 1
                negative_tmp_data.append([user, movie, int(time.time()), -1])
    negative_temp_df = pd.DataFrame(data=negative_tmp_data, columns=['userId', 'movieId', 'timestamp', 'interact'])
    negative_samples = pd.concat([negative_samples, negative_temp_df], ignore_index=True)
    return negative_samples

ns = negative_sampling(ratings_data)
print(ns.head())

# users 943
# movies 1682
user 196 processed ... 0.00% after 0.00 sec


In [None]:
# Create Train, Test, Validation Split (80/10/10)
# for negative samples
train_ns, test_ns = train_test_split(ns, test_size=0.2, random_state = RANDOM)
test_ns, val_ns = train_test_split(test_ns, test_size=0.5, random_state= RANDOM)
print(train_ns.shape, test_ns.shape, val_ns.shape)

# Check negative sampling train dataset
train_ns.head()

# Save negative sampling datasets
save_to_csv([train_ns, test_ns, val_ns], ['train_ns.csv', 'test_ns.csv', 'val_ns.csv'] )