In [4]:
import polars as pl

movies_pl = pl.scan_csv('movies.csv')
ratings_pl = pl.scan_csv('ratings.csv')
tags_pl = pl.scan_csv('tags.csv')


In [5]:
ratings_pl = ratings_pl.with_columns(
    pl.from_epoch(pl.col('timestamp'), time_unit='s').alias('datetime_timestamp')
).with_columns(
    pl.col('datetime_timestamp').dt.year().alias('year'),
    pl.col('datetime_timestamp').dt.month().alias('month')
).lazy()


In [7]:
# a. Show the aggregated number of ratings per year

ratings_by_year = ratings_pl.group_by('year').agg(
    pl.count('rating').alias('# ratings')
).sort('year', descending=True)

ratings_by_year.collect().head(20)

year,# ratings
i32,u32
2018,6418
2017,8198
2016,6703
2015,6616
2014,1439
…,…
2003,4014
2002,3478
2001,3922
2000,10061


In [8]:
# b. Show the average monthly number of ratings

avg_monthly = ratings_pl.group_by('month', 'year').agg(
    pl.mean('rating').alias('# ratings')
).sort(['year', 'month'], descending=True)

avg_monthly.collect().head(20)


month,year,# ratings
i8,i32,f64
9,2018,3.568709
8,2018,3.557762
7,2018,4.010239
6,2018,3.979714
5,2018,2.95163
…,…,…
6,2017,2.959424
5,2017,3.480184
4,2017,3.626219
3,2017,3.051002


In [None]:
# c. Show the rating levels distribution

In [9]:
# d. Show the 18 movies that are tagged but not rated

tags_pl_joined = tags_pl.join(ratings_pl, on='movieId', how='left').filter(pl.col('rating').is_null())
tags_pl_joined.join(movies_pl, on='movieId', how='inner').select(pl.col('title')).unique().sort('title').collect()

title
str
"""Browning Version, The (1951)"""
"""Call Northside 777 (1948)"""
"""Chalet Girl (2011)"""
"""Chosen, The (1981)"""
"""Color of Paradise, The (Rang-e…"
…
"""Road Home, The (Wo de fu qin m…"
"""Roaring Twenties, The (1939)"""
"""Scrooge (1970)"""
"""This Gun for Hire (1942)"""


In [10]:
# e. Show the movies that have rating but no tag

#.select('movieId').unique()
ratings_pl_joined = ratings_pl.join(tags_pl, on='movieId', how='left').filter(pl.col('tag').is_null())
ratings_pl_joined.join(movies_pl, on='movieId', how='inner').select(pl.col('title')).unique().sort('title').collect().head(20)

title
str
"""'71 (2014)"""
"""'Hellboy': The Seeds of Creati…"
"""'Round Midnight (1986)"""
"""'Salem's Lot (2004)"""
"""'Til There Was You (1997)"""
…
"""10 Years (2011)"""
"""10,000 BC (2008)"""
"""100 Girls (2000)"""
"""100 Streets (2016)"""


In [11]:
# f. Focusing on the rated untagged movies with more than 30 user ratings,show the top 10 movies in terms of average rating and number of ratings

rated_untagged = ratings_pl_joined.group_by('movieId').agg(
    pl.count('rating').alias('# ratings'),
    pl.mean('rating').alias('avg_ratings')
).filter(
    pl.col('# ratings') > 30
)

rated_untagged = rated_untagged.join(movies_pl, on='movieId', how='inner').select(pl.col('title'), pl.col('avg_ratings'), pl.col('# ratings'))
rated_untagged.sort('avg_ratings', descending=True).collect().head(10)
rated_untagged.sort('# ratings', descending=True).collect().head(10)


title,avg_ratings,# ratings
str,f64,u32
"""American Beauty (1999)""",4.056373,204
"""Ace Ventura: Pet Detective (19…",3.040373,161
"""Mask, The (1994)""",3.184713,157
"""Die Hard (1988)""",3.862069,145
"""Die Hard: With a Vengeance (19…",3.555556,144
"""Groundhog Day (1993)""",3.944056,143
"""Dumb & Dumber (Dumb and Dumber…",3.06015,133
"""GoldenEye (1995)""",3.496212,132
"""Monsters, Inc. (2001)""",3.871212,132
"""Austin Powers: The Spy Who Sha…",3.198347,121


In [12]:
# g. What is the average number of tags per movie in tagsDF? And the average number of tags per user? How does it compare with the average number of tags a user assigns to a movie?

In [13]:
# h. Identify the users that tagged movies without rating them

tags_pl_joined = tags_pl.join(ratings_pl, on='movieId', how='left').filter(pl.col('rating').is_null())
tags_pl_joined.select(pl.col('userId')).unique().collect()

userId
i64
318
474
543
288


In [14]:
# i. What is the average number of ratings per user in ratings DF? And the average number of ratings per movie?

ratings_pl = ratings_pl.filter(pl.col('rating').is_not_null())

ratings_pl.select(
    'userId',
    pl.col('rating').count().alias('# ratings'),
    pl.col('userId').unique().count().alias('# users')
).select(
    (pl.col('# ratings') / pl.col('# users')).round(3).alias('avg_no_of_ratings_per_user')
).collect()

ratings_pl.select(
    'userId',
    pl.col('rating').count().alias('# ratings'),
    pl.col('movieId').unique().count().alias('# users')
).select(
    (pl.col('# ratings') / pl.col('# users')).round(3).alias('avg_no_of_ratings_per_user')
).collect()


avg_no_of_ratings_per_user
f64
10.37


In [15]:
# j. What is the predominant (frequency based) genre per rating level?

'''
ratings_pl.join(movies_pl, on='movieId', how='left').select(
    'movieId',
    'rating',
    'genres'
).group_by(["rating", "genres"]).agg(
    pl.count("genres").alias("genre_count")
).sort(by=["rating", "genre_count"]).unique(subset="rating").collect()
'''

merged_df = ratings_pl.join(movies_pl, on='movieId', how='left').select(
    'movieId',
    'rating',
    'genres'
)

# Step 2: Split the 'genres' column by '|' into a list
merged_df = merged_df.with_columns(
    pl.col("genres").str.split("|")
)

# Step 3: Explode the genres column to have one genre per row
exploded_df = merged_df.explode("genres")

# Step 4: Group by 'rating' and 'genre' to count occurrences
genre_counts = exploded_df.group_by(["rating", "genres"]).agg(
    pl.count("genres").alias("genre_count")
)

# Step 5: Find the predominant genre for each rating level
# - Sort by 'rating' and 'genre_count' in descending order
# - Use 'unique' to ensure we get the top genre for each rating

predominant_genres = genre_counts.sort(by=["rating", "genre_count"]).unique(subset="rating")

# Display the result
print(predominant_genres.collect())


shape: (10, 3)
┌────────┬────────────────────┬─────────────┐
│ rating ┆ genres             ┆ genre_count │
│ ---    ┆ ---                ┆ ---         │
│ f64    ┆ str                ┆ u32         │
╞════════╪════════════════════╪═════════════╡
│ 0.5    ┆ (no genres listed) ┆ 2           │
│ 1.0    ┆ (no genres listed) ┆ 2           │
│ 1.5    ┆ Documentary        ┆ 2           │
│ 2.0    ┆ (no genres listed) ┆ 2           │
│ 2.5    ┆ (no genres listed) ┆ 6           │
│ 3.0    ┆ (no genres listed) ┆ 6           │
│ 3.5    ┆ (no genres listed) ┆ 6           │
│ 4.0    ┆ (no genres listed) ┆ 8           │
│ 4.5    ┆ (no genres listed) ┆ 8           │
│ 5.0    ┆ (no genres listed) ┆ 7           │
└────────┴────────────────────┴─────────────┘
