In [23]:
import pandas as pd
from pathlib import Path

def group_unique_tags(tags):
    """Collect each movie's distinct tags into a sorted list.

    Args:
        tags: DataFrame with at least ``movieId`` and ``tag`` columns.

    Returns:
        DataFrame with columns ``movieId`` and ``review_text``, one row per
        ``movieId`` present in ``tags``. Each ``review_text`` cell is a list
        of that movie's distinct tags (empty if every tag was missing).
    """
    # Missing tags are dropped so they never reach the output as the text
    # 'nan'; sorting makes the order stable, since the iteration order of a
    # set of strings can change from one interpreter run to the next.
    grouped = (
        tags.groupby('movieId')['tag']
        .apply(lambda group: sorted(set(group.dropna().astype(str))))
        .reset_index()
    )
    grouped.columns = ['movieId', 'review_text']
    return grouped


def average_rating_per_movie(ratings):
    """Return the mean rating of every movie.

    Args:
        ratings: DataFrame with at least ``movieId`` and ``rating`` columns.

    Returns:
        DataFrame with columns ``movieId`` and ``average_rating``.
    """
    averages = ratings.groupby('movieId')['rating'].mean().reset_index()
    averages.columns = ['movieId', 'average_rating']
    return averages


def join_tags(value):
    """Render a list of tags as one '|'-separated string (the genres layout).

    Anything that is not a list (e.g. the NaN left behind by a merge that
    found no tags) becomes an empty string.

    NOTE: tags are not escaped, so a tag that itself contains '|' cannot be
    told apart from two tags once joined.
    """
    if isinstance(value, list):
        return '|'.join(str(tag) for tag in value)
    return ''


# __name__ is "__main__" both when this is run as a script and inside a
# Jupyter kernel, so the pipeline still executes there; the guard only keeps
# the (slow, file-dependent) pipeline from running on a plain import.
if __name__ == "__main__":
    # Read Movie Lens 32M dataset from the current working directory
    path = Path.cwd()
    movies = pd.read_csv(path / "ml-32m/movies.csv", low_memory=False)
    links = pd.read_csv(path / "ml-32m/links.csv", low_memory=False)
    tags = pd.read_csv(path / "ml-32m/tags.csv", low_memory=False)
    ratings = pd.read_csv(path / "ml-32m/ratings.csv", low_memory=False)

    # Keep only movies (and their ratings) that appear in the tags table
    movies_withtags = tags['movieId'].unique()
    filtered_movies = movies[movies['movieId'].isin(movies_withtags)]
    filtered_ratings = ratings[ratings['movieId'].isin(movies_withtags)]

    # Group tags for each movie, removing repeated and missing ones
    tags_grouped = group_unique_tags(tags)

    # Calculate the average rating for each movie
    avg_ratings = average_rating_per_movie(filtered_ratings)

    # Merge grouped tags and average ratings with the movie metadata
    result = filtered_movies.merge(tags_grouped, on='movieId', how='left')
    result = result.merge(avg_ratings, on='movieId', how='left')
    result = result[['movieId', 'title', 'genres', 'review_text', 'average_rating']]

    # Drop movies without ratings
    result = result.dropna(subset=['average_rating'])

    # Convert review_text to the same '|'-separated format as genres.
    # assign() returns a new frame instead of writing into the dropna() result.
    result = result.assign(review_text=result['review_text'].apply(join_tags))

    # Output cleaned reviews to .csv
    result.to_csv('cleaned_reviews.csv', index=False)
