In [1]:
import pandas as pd

In [8]:
df_ratings = pd.read_csv('data/original/ratings.csv')
df_personality = pd.read_csv('data/original/personality-data.csv')

Preprocess Ratings dataset

In [9]:
df_ratings.head()

Unnamed: 0,useri,movie_id,rating,tstamp
0,8e7cebf9a234c064b75016249f2ac65e,1,5.0,2001-09-10 17:19:56
1,8e7cebf9a234c064b75016249f2ac65e,2,4.0,2001-09-28 11:34:55
2,8e7cebf9a234c064b75016249f2ac65e,3,4.0,2001-09-28 11:42:50
3,8e7cebf9a234c064b75016249f2ac65e,5,5.0,2001-09-28 11:27:30
4,8e7cebf9a234c064b75016249f2ac65e,6,4.0,2002-01-07 18:12:02


In [10]:
# Remove unnecessary columns
df_ratings = df_ratings.drop(columns=[' tstamp '])

In [11]:
# Clean up column names
df_ratings.rename(columns={"useri": "raw_user_id", " movie_id": "movie_id", " rating": "rating"}, inplace=True)

In [12]:
# Map raw_user_id to integer values that is suitable for PyTorch
user_mapping = { user_id: i for i, user_id in enumerate(df_ratings['raw_user_id'].unique())}
movie_mapping = { movie_id: i for i, movie_id in enumerate(df_ratings['movie_id'].unique())}
df_ratings['user_id'] = df_ratings['raw_user_id'].map(user_mapping)
df_ratings['movie_id'] = df_ratings['movie_id'].map(movie_mapping)

In [7]:
# Drop raw_user_id column
df_ratings = df_ratings.drop(columns=['raw_user_id'])

In [8]:
# Divide the dataset into train and test sets
from sklearn.model_selection import train_test_split

df_ratings_train, df_ratings_test = train_test_split(df_ratings, test_size=0.2, random_state=42)

In [9]:
# Remove rows in test set where user_id is not in train set
df_ratings_test = df_ratings_test[df_ratings_test['user_id'].isin(df_ratings_train['user_id'])]

In [10]:
# Save the modified ratings and train and test sets as CSV files in data/preprocessed
df_ratings.to_csv('data/preprocessed/ratings.csv', index=False)
df_ratings_train.to_csv('data/preprocessed/ratings-train.csv', index=False)
df_ratings_test.to_csv('data/preprocessed/ratings-test.csv', index=False)

Preprocess Personality dataset

In [13]:
# Extract columns userid, openness, agreeableness, emotional_stability, conscientiousness, extraversion from df_personality
df_personality = df_personality[['userid', ' openness', ' agreeableness', ' emotional_stability', ' conscientiousness', ' extraversion']]
df_personality.head()

Unnamed: 0,userid,openness,agreeableness,emotional_stability,conscientiousness,extraversion
0,8e7cebf9a234c064b75016249f2ac65e,5.0,2.0,3.0,2.5,6.5
1,77c7d756a093150d4377720abeaeef76,7.0,4.0,6.0,5.5,4.0
2,b7e8a92987a530cc368719a0e60e26a3,4.0,3.0,4.5,2.0,2.5
3,92561f21446e017dd6b68b94b23ad5b7,5.5,5.5,4.0,4.5,4.0
4,030001ac2145a938b07e686a35a2d638,5.5,5.5,3.5,4.5,2.5


In [14]:
df_personality.columns

Index(['userid', ' openness', ' agreeableness', ' emotional_stability',
       ' conscientiousness', ' extraversion'],
      dtype='object')

In [15]:
# Rename columns to match the OCEAN model names
df_personality.rename(columns={
	"userid": "raw_user_id",
	" openness": "openness", 
	" conscientiousness": "conscientiousness",
	" extraversion": "extraversion",
	" emotional_stability": "neuroticism", 
	" agreeableness": "agreeableness",
	}, 
	inplace=True)

In [16]:
df_personality.head()

Unnamed: 0,raw_user_id,openness,agreeableness,neuroticism,conscientiousness,extraversion
0,8e7cebf9a234c064b75016249f2ac65e,5.0,2.0,3.0,2.5,6.5
1,77c7d756a093150d4377720abeaeef76,7.0,4.0,6.0,5.5,4.0
2,b7e8a92987a530cc368719a0e60e26a3,4.0,3.0,4.5,2.0,2.5
3,92561f21446e017dd6b68b94b23ad5b7,5.5,5.5,4.0,4.5,4.0
4,030001ac2145a938b07e686a35a2d638,5.5,5.5,3.5,4.5,2.5


In [17]:
# Map raw_user_id to integer values used in df_ratings
df_personality['user_id'] = df_personality['raw_user_id'].map(user_mapping)
df_personality.head()

Unnamed: 0,raw_user_id,openness,agreeableness,neuroticism,conscientiousness,extraversion,user_id
0,8e7cebf9a234c064b75016249f2ac65e,5.0,2.0,3.0,2.5,6.5,0
1,77c7d756a093150d4377720abeaeef76,7.0,4.0,6.0,5.5,4.0,1
2,b7e8a92987a530cc368719a0e60e26a3,4.0,3.0,4.5,2.0,2.5,2
3,92561f21446e017dd6b68b94b23ad5b7,5.5,5.5,4.0,4.5,4.0,3
4,030001ac2145a938b07e686a35a2d638,5.5,5.5,3.5,4.5,2.5,4


In [18]:
# Drop raw_user_id and save the dataset as CSV file in data/preprocessed
df_personality = df_personality.drop(columns=['raw_user_id'])
df_personality.to_csv('data/preprocessed/personality.csv', index=False)

In [20]:
# Merge personality into train and test dataset
df_ratings_train = pd.merge(df_ratings_train, df_personality, on='user_id')
df_ratings_test = pd.merge(df_ratings_test, df_personality, on='user_id')

# Save them
df_ratings_train.to_csv('data/preprocessed/ratings-train.csv', index=False)
df_ratings_test.to_csv('data/preprocessed/ratings-test.csv', index=False)

Personality test with user_id in rating-test

In [2]:
df_personality = pd.read_csv('data/preprocessed/personality.csv')
df_ratings_test = pd.read_csv('data/preprocessed/ratings-test.csv')

# Remove rows from df_personality which have user_id which are not present in df_rating_test
df_personality = df_personality[df_personality['user_id'].isin(df_ratings_test['user_id'])]
df_personality.to_csv('data/preprocessed/personality-test.csv', index=False)
