# ML-1M

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell is executed, so edits to the modules imported below are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

In [None]:
from preprocessor import load_data, preprocess_column, filter_data

In [None]:
# Dataset identifier; it also names the sub-directory of ../raw_data to read.
data_name = "ml-1m"
path = f"../raw_data/{data_name}"
# load_data returns three objects, bound here as the interaction, user and item
# frames (presumably pandas DataFrames -- their schema is set by the
# preprocessor module, which is not shown here).
df_inter, df_user, df_item = load_data(path, data_name)

In [None]:
# Apply the shared column preprocessing to each frame, in the order
# interactions -> users -> items, and rebind the three names to the results.
df_inter, df_user, df_item = map(
    preprocess_column, (df_inter, df_user, df_item)
)

# Clean data and split

In [None]:
# filter_data returns five objects: what are bound here as the train /
# validation / test interaction splits, plus the user and item metadata after
# filtering. The filtering and splitting rules themselves live in the
# preprocessor module (not visible here).
df_train, df_val, df_test, filtered_user, filtered_item = filter_data(data_name, df_inter, df_user, df_item)

# User grouping

In [None]:
# Age classification 3c
# Mnemonic: resident_age_3c

# Total number of categories: 3
# https://www.ons.gov.uk/census/census2021dictionary/variablesbytopic/demographyvariablescensus2021/age/classifications
# Code	Name
# 1	Aged 24 years and under
# 2	Aged 25 to 49 years
# 3	Aged 50 years and over

# Collapse the dataset's age codes into the three bands listed above. The
# labels are LaTeX markup ("--" and "$\geq$") because the grouping table built
# from them is printed with to_latex(). Raw strings keep the backslash verbatim
# instead of relying on Python's deprecated handling of the invalid "\g" escape.
# NOTE(review): no age code below 18 is mapped, so Series.map() would leave such
# users as NaN -- presumably they are removed upstream; confirm.
map_age = {
    18: "18--24 years",
    25: "25--49 years",
    35: "25--49 years",
    45: "25--49 years",
    50: r"$\geq$50 years",
    56: r"$\geq$50 years",
}

# Occupation codes labelled "non-working"; every other code from 1 to 20 is
# labelled "working".
# NOTE(review): presumably these are the student / homemaker / retired /
# unemployed codes of the MovieLens-1M README -- confirm. Code 0 is not mapped
# at all, so users carrying it would get NaN.
NON_WORKING_OCCUPATIONS = {4, 9, 10, 13, 19}

map_occupation_2 = {
    code: "non-working" if code in NON_WORKING_OCCUPATIONS else "working"
    for code in range(1, 21)
}


In [None]:
# Add the coarse age and occupation labels as new columns on the user frame.
# Codes that have no entry in a mapping come out as NaN.
filtered_user["map_age"] = filtered_user["age"].map(map_age)
filtered_user["map_occupation"] = filtered_user["occupation"].map(map_occupation_2)

In [None]:
# Mark every user whose user_id occurs in the test split, then show the frame.
filtered_user["in_test"] = filtered_user.user_id.isin(df_test["user_id"])
filtered_user

In [None]:
# Keep only the users flagged as present in the test split.
filtered_test_user = filtered_user[filtered_user["in_test"]]

In [None]:
from stats import print_stats

# Number of test users per gender, per age band and per occupation status.
val_count_gender = filtered_test_user.value_counts(subset="gender")
val_count_age = filtered_test_user.value_counts(subset="map_age")
val_count_occup = filtered_test_user.value_counts(subset="map_occupation")

In [None]:
# Pass each of the three count series to the project's print_stats helper
# (what exactly it reports is defined in the stats module, not shown here).
print_stats(val_count_gender)
print_stats(val_count_age)
print_stats(val_count_occup)

In [None]:
# Count the test users in every (gender, age band, occupation status) group.
# The column header and the age labels are LaTeX markup because this table is
# printed with to_latex(); raw strings keep their backslashes verbatim instead
# of relying on Python's deprecated handling of the invalid "\#" and "\g"
# escapes.
filtered_user_grouping = (
    filtered_test_user
    .groupby(["gender", "map_age", "map_occupation"])[["user_id"]]
    .count()
    .rename(columns={"user_id": r"\#user"})
    # Impose an explicit display order on the first two index levels.
    .reindex(["M", "F"], level=0)
    .reindex(["18--24 years", "25--49 years", r"$\geq$50 years"], level=1)
)
filtered_user_grouping

In [None]:
# Print the grouping table as LaTeX source.
print(filtered_user_grouping.to_latex())

In [None]:
# Summarise the group sizes: number of groups, smallest, median ("50%") and
# largest, shown as integers.
(
    filtered_user_grouping
    .describe()
    .loc[["count", "min", "50%", "max"]]
    .astype(int)
)

# Check duplicate item names

In [None]:
# How often each exact title occurs; display the titles occurring more than once.
val_count_movie_title = filtered_item["movie_title"].value_counts()
val_count_movie_title[val_count_movie_title.gt(1)]

In [None]:
# Same check ignoring case: show (up to 50) items whose lower-cased title is
# shared with at least one other item, ordered by title.
(
    filtered_item[filtered_item["movie_title"].str.lower().duplicated(keep=False)]
    .sort_values("movie_title")
    .head(50)
)

# Clean item names to fix the ordering of The/A/An/La/Les

In [None]:
from preprocessor import clean_movie_name

In [None]:
# Keep only the identifier and title columns. Take an explicit copy: a new
# column is assigned to this frame further down, and writing to a bare column
# selection of another frame makes pandas emit SettingWithCopyWarning.
filtered_item = filtered_item[["item_id", "movie_title"]].copy()
filtered_item

In [None]:
# get an idea of what to clean: the 50 most frequent pieces of text found
# right after the first comma of a title (titles without a comma are skipped)
(
    filtered_item["movie_title"]
    .str.split(",")
    .apply(lambda parts: None if len(parts) <= 1 else parts[1])
    .dropna()
    .value_counts()
    .head(50)
)

In [None]:
# clean item names from the "name, article" form; the result goes into a new
# column so the original title stays available for comparison
filtered_item["cleaned_title"] = filtered_item["movie_title"].apply(clean_movie_name)

In [None]:
from pathlib import Path

# check difference: write every item whose title was changed by the cleaning
# step to a CSV file for manual inspection.
# to_csv() does not create missing directories, so make sure the target folder
# exists before writing.
annotate_dir = Path("to_annotate")
annotate_dir.mkdir(parents=True, exist_ok=True)
filtered_item[filtered_item.movie_title != filtered_item.cleaned_title].to_csv(
    annotate_dir / "check_cleaned_movie_name.csv"
)

In [None]:
# Drop the original titles and expose the cleaned ones under the original
# column name.
filtered_item = (
    filtered_item
    .drop(columns=["movie_title"])
    .rename(columns={"cleaned_title": "movie_title"})
)

# Stats & save

In [None]:
from stats import get_df_stats
from preprocessor import prepare_to_save, general_save

In [None]:
# prepare_to_save takes the three splits plus the user and item metadata and
# returns new versions of the splits and of the item frame (the user frame is
# not rebound). What it changes is defined in the preprocessor module.
df_train, df_val, df_test, filtered_item = prepare_to_save(df_train, df_val, df_test, filtered_user, filtered_item)

In [None]:
# stats for the splits and filtered metadata, then save them
get_df_stats(df_train, df_val, df_test, filtered_user, filtered_item)

# general save
# Use a dedicated name for the output location. data_name is the dataset
# identifier that earlier cells pass to load_data / filter_data; overwriting it
# with a path would break those cells when they are re-run in the same kernel.
save_path = "../cleaned_data/ml-1m"

general_save(df_train, df_val, df_test, filtered_user, filtered_item, save_path)