## Let's begin in this end instead.

In [4]:
import polars as pl

# Column name -> dtype for the two movie columns this notebook needs.
movie_dtypes = {
    'movieId': pl.Int32,
    'genres': pl.Utf8,
}

# Read only those columns from the movies CSV, parsed with the dtypes above.
polars = pl.read_csv(
    '../Data/ml-latest/movies.csv',
    columns=list(movie_dtypes),
    dtypes=movie_dtypes,
)

In [5]:
%%timeit
speed = pl.scan_csv("../Data/ml-latest/ratings.csv")
speed.fetch(5)

468 µs ± 2.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


The pl.scan_csv method is lazy: it won't actually read all the data into memory. If we then call .fetch(5), polars only reads the first 5 rows of the file (not a random sample).<br> Because only a handful of rows are parsed instead of the whole file, it is ridiculously fast.

In [8]:
# Build a lazy query over the ratings CSV.
ratings = pl.scan_csv("../Data/ml-latest/ratings.csv")

# Print a five-row preview of the ratings, then the distinct genre strings
# found in the movies frame.
print(
    ratings.fetch(5),
    polars.select("genres").unique(),
    sep="\n",
)

shape: (5, 4)
┌────────┬─────────┬────────┬────────────┐
│ userId ┆ movieId ┆ rating ┆ timestamp  │
│ ---    ┆ ---     ┆ ---    ┆ ---        │
│ i64    ┆ i64     ┆ f64    ┆ i64        │
╞════════╪═════════╪════════╪════════════╡
│ 1      ┆ 307     ┆ 3.5    ┆ 1256677221 │
│ 1      ┆ 481     ┆ 3.5    ┆ 1256677456 │
│ 1      ┆ 1091    ┆ 1.5    ┆ 1256677471 │
│ 1      ┆ 1257    ┆ 4.5    ┆ 1256677460 │
│ 1      ┆ 1449    ┆ 4.5    ┆ 1256677264 │
└────────┴─────────┴────────┴────────────┘
shape: (1642, 1)
┌─────────────────────────────────────┐
│ genres                              │
│ ---                                 │
│ str                                 │
╞═════════════════════════════════════╡
│ Adventure|Animation|Children|Com... │
│ Adventure|Children|Fantasy          │
│ Comedy|Romance                      │
│ Comedy|Drama|Romance                │
│ ...                                 │
│ Action|Comedy|Drama|Romance|Thri... │
│ Action|Adventure|Comedy|Horror|S... │
│ Children|Fanta

However, we won't be using ratings yet. Let's create some features.

Most movies are tagged with multiple genres separated by |. There is probably a good way to get the values but until I find that out...

In [7]:
# Drop movies that carry the "(no genres listed)" placeholder instead of real genres.
polars = polars.filter(pl.col("genres") != "(no genres listed)")

# Genres to turn into boolean indicator columns. Each output column is named
# after the genre, lower-cased with hyphens removed (e.g. "Sci-Fi" -> "scifi").
GENRE_NAMES = [
    'Action',
    'Horror',
    'Drama',
    'Comedy',
    'Documentary',
    'Adventure',
    'Fantasy',
    'Children',
    'Sci-Fi',
    'Romance',
    'Mystery',
    'Animation',
    'Thriller',
]

# One boolean column per genre: True when the genre name occurs in the
# pipe-separated `genres` string. The columns are generated from GENRE_NAMES
# instead of one hand-written line per genre, so adding a genre is a one-word
# change. The result is bound to a name so the indicator columns remain
# available to later cells; `polars` itself stays the filtered frame.
movies_genre_flags = polars.with_columns([
    pl.col('genres').str.contains(genre).alias(genre.lower().replace('-', ''))
    for genre in GENRE_NAMES
])

# TODO: only a subset of the genres is covered so far; extend GENRE_NAMES
# (or derive it from the data) once this approach is validated.
movies_genre_flags

movieId,genres,action,horror,drama,comedy,documentary,adventure,fantasy,children,scifi,romance,mystery,animation,thriller
i32,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
1,"""Adventure|Anim...",false,false,false,true,false,true,true,true,false,false,false,true,false
2,"""Adventure|Chil...",false,false,false,false,false,true,true,true,false,false,false,false,false
3,"""Comedy|Romance...",false,false,false,true,false,false,false,false,false,true,false,false,false
4,"""Comedy|Drama|R...",false,false,true,true,false,false,false,false,false,true,false,false,false
5,"""Comedy""",false,false,false,true,false,false,false,false,false,false,false,false,false
6,"""Action|Crime|T...",true,false,false,false,false,false,false,false,false,false,false,false,true
7,"""Comedy|Romance...",false,false,false,true,false,false,false,false,false,true,false,false,false
8,"""Adventure|Chil...",false,false,false,false,false,true,false,true,false,false,false,false,false
9,"""Action""",true,false,false,false,false,false,false,false,false,false,false,false,false
10,"""Action|Adventu...",true,false,false,false,false,true,false,false,false,false,false,false,true


Now we have genres as boolean values, which should make it easier to find patterns. Probably?

In [35]:
# Column name -> dtype for the rating columns loaded here.
rating_dtypes = {
    'movieId': pl.Int32,
    'userId': pl.Int32,
    'rating': pl.Float32,
}

# Eagerly read just those columns from the ratings CSV with the dtypes above.
ratings = pl.read_csv(
    '../Data/ml-latest/ratings.csv',
    columns=list(rating_dtypes),
    dtypes=rating_dtypes,
)

# Preview the first ten rows.
ratings.head(10)

userId,movieId,rating
i32,i32,f32
1,307,3.5
1,481,3.5
1,1091,1.5
1,1257,4.5
1,1449,4.5
1,1590,2.5
1,1591,1.5
1,2134,4.5
1,2478,4.0
1,2840,3.0


In [38]:
# Keep only the rows whose rating is 4.5 or higher.
is_top_rating = pl.col("rating") >= 4.5
best_movies = ratings.filter(is_top_rating)
best_movies

userId,movieId,rating
i32,i32,f32
1,1257,4.5
1,1449,4.5
1,2134,4.5
1,3424,4.5
2,1296,4.5
2,2243,4.5
3,2028,5.0
4,6,4.5
4,25,4.5
4,32,4.5
