## LFM-1B

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to the local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from preprocessor import load_data, preprocess_column, filter_data

# Dataset identifier; also used as the folder name under ../raw_data.
data_name = "lfm1b-artists"
# Relative path to the raw files of this dataset.
path = f"../raw_data/{data_name}"

In [None]:
# load_data returns three frames, bound here as the interaction, user and item
# tables (NOTE(review): exact schema is defined in preprocessor.load_data).
df_inter, df_user, df_item = load_data(path, data_name)

In [None]:
# Apply the shared column preprocessing to each table, in the order
# interactions -> users -> items.
df_inter, df_user, df_item = [
    preprocess_column(table) for table in (df_inter, df_user, df_item)
]

In [None]:
# Display the preprocessed interaction table for a visual sanity check.
df_inter

# Clean and split data

In [None]:
# Filter the raw tables and produce the train/validation/test splits together
# with the user and item metadata that survive filtering.
# NOTE(review): core_filter=50 is presumably a k-core threshold — confirm in
# preprocessor.filter_data.
df_train, df_val, df_test, filtered_user, filtered_item = filter_data(
    data_name,
    df_inter,
    df_user,
    df_item,
    core_filter=50,
)

# Group users

## Gender & age

In [None]:
# Upper-case the gender labels so differently-cased values form one group.
filtered_user["gender"] = filtered_user["gender"].str.upper()

In [None]:
# Smallest and largest age among the filtered users.
min_age, max_age = filtered_user["age"].min(), filtered_user["age"].max()

In [None]:
# Age-group labels, youngest first; one label per bin below.
age_label = ["young", "adult", "older-adult"]

# Right-closed bins: (min_age - 1, 24], (24, 49], (49, max_age].
# The first edge is min_age - 1 so that users of exactly min_age fall inside
# the first bin instead of being left unlabelled.
# NOTE(review): pd.cut requires increasing bin edges, so this assumes
# min_age - 1 < 24 and max_age > 49 — confirm for this dataset.
filtered_user["map_age"] = pd.cut(
    filtered_user["age"],
    bins=[min_age - 1, 24, 49, max_age],
    labels=age_label,
)


## Map country to continent

In [None]:
# Country lookup table, downloaded from a gist URL pinned to a fixed revision.
# NOTE(review): read_csv's default NA handling parses the literal string "NA"
# as missing; if the table contains a country code spelled "NA" it will be
# read as NaN — confirm, or pass keep_default_na=False.
country_code_table = pd.read_csv(
    "https://gist.githubusercontent.com/achuhunkin/6cb1cbceb23395300aa209aad09e6e5d/raw/f5bc73e0de4f0d7d441795531cec1b4e950e5b61/countries.csv"
)
# List every row whose two-letter code occurs more than once (all occurrences
# are shown because keep=False).
country_code_table.loc[
    country_code_table.duplicated(subset="Two_Letter_Country_Code", keep=False)
]

In [None]:
# Some countries are listed under two continents and therefore appear twice.
# The row labels below were selected by hand so that each country code keeps a
# single continent; they are specific to the CSV revision loaded above.
index_to_drop = [
    8, 16, 58, 82, 114, 169, 189, 190,
    192, 233, 235, 244, 245, 247, 253, 263,
]
country_code_table = country_code_table.drop(labels=index_to_drop, axis="index")


In [None]:
# Columns of the lookup table that feed the two dictionaries below.
list_country_code = country_code_table["Two_Letter_Country_Code"]
list_continent = country_code_table["Continent_Name"]
list_country_name = country_code_table["Country_Name"]

# Two-letter code -> country name.
map_code_country = dict(zip(list_country_code, list_country_name))
# Two-letter code -> continent name.
map_code_continent = dict(zip(list_country_code, list_continent))

In [None]:
# Distinct user country codes that have no entry in the lookup table.
filtered_user.loc[
    ~filtered_user["country"].isin(country_code_table["Two_Letter_Country_Code"]),
    "country",
].unique()

In [None]:
# Rewrite the code "UK" to "GB" (the ISO 3166-1 alpha-2 code for the United
# Kingdom) so that these users can be matched against the lookup table.
filtered_user.loc[filtered_user["country"] == "UK", "country"] = "GB"

In [None]:
# Attach the country name and continent for each user's two-letter code;
# codes absent from the dictionaries become NaN.
filtered_user["country_name"] = filtered_user["country"].map(map_code_country)
filtered_user["continent"] = filtered_user["country"].map(map_code_continent)

In [None]:
# Users with country code "AQ" get both their country name and their
# continent set to "Antarctica".
filtered_user.loc[
    filtered_user["country"] == "AQ", ["country_name", "continent"]
] = "Antarctica"

In [None]:
# Sanity check: every user must have a country code and a resolved continent.
assert not filtered_user["country"].isna().any()
assert not filtered_user["continent"].isna().any()

In [None]:
# Continent -> analysis group. Four continents keep their own name.
map_continent = {
    "Africa": "Africa",
    "Asia": "Asia",
    "Europe": "Europe",
    "Oceania": "Oceania",
    # North America, South America and Antarctica are pooled into one group.
    "North America": "America & Antarctica",
    "South America": "America & Antarctica",
    "Antarctica": "America & Antarctica",
}
filtered_user["map_continent"] = filtered_user["continent"].map(map_continent)

In [None]:
# Number of users per original continent.
filtered_user["continent"].value_counts()

In [None]:
# Number of users per pooled continent group.
filtered_user["map_continent"].value_counts()

# Filter test users

In [None]:
# Flag users whose id occurs in the test split, then keep only those rows.
filtered_user["in_test"] = filtered_user["user_id"].isin(df_test["user_id"])
filtered_test_user = filtered_user.loc[filtered_user["in_test"]]
filtered_test_user

In [None]:
# Gender distribution among the test users.
filtered_test_user["gender"].value_counts()

In [None]:
from stats import print_stats

In [None]:
# Test users per age group, passed to the project's print_stats helper.
val_count_age = filtered_test_user["map_age"].value_counts()
print_stats(val_count_age)

In [None]:
# Test users per pooled continent group, passed to the project's print_stats
# helper.
val_count_cont = filtered_test_user["map_continent"].value_counts()
print_stats(val_count_cont)

In [None]:
# Continent groups ordered by how many test users they contain (value_counts
# sorts in descending order by default).
sort_continent = filtered_test_user["map_continent"].value_counts().index

In [None]:
# Column label for the per-group user count. The label is the literal text
# \#user; the backslash is written as "\\" because "\#" is not a valid
# Python string escape and triggers a warning on recent interpreters.
user_count_col = "\\#user"

# Count test users for every gender x age group x continent group combination.
# observed=False is spelled out so that the handling of the categorical age
# column does not depend on the pandas version's default; combinations that
# never occur get a count of 0 and are removed further down.
filtered_user_grouping = (
    filtered_test_user
    .groupby(["gender", "map_age", "map_continent"], observed=False)[["user_id"]]
    .count()
    .rename(columns={"user_id": user_count_col})
    # Order the age level as young -> adult -> older-adult.
    .reindex(age_label, level=1)
    # Order the continent level by descending number of test users.
    .reindex(sort_continent, level=2)
)
# Keep only the combinations that actually contain users.
filtered_user_grouping = filtered_user_grouping[
    filtered_user_grouping[user_count_col] > 0
]
filtered_user_grouping

In [None]:
# Render the grouping table as LaTeX source and print it.
latex_table = filtered_user_grouping.to_latex()
print(latex_table)

In [None]:
# Summary of the group sizes: number of groups, smallest, median and largest,
# shown as integers.
(
    filtered_user_grouping
    .describe()
    .loc[["count", "min", "50%", "max"]]
    .astype(int)
)

# Stats and save

In [None]:
from stats import get_df_stats
from preprocessor import prepare_to_save, general_save

# Final preparation of the splits and item metadata before writing them out.
# filtered_user is passed in but is not among the returned values.
df_train, df_val, df_test, filtered_item = prepare_to_save(
    df_train,
    df_val,
    df_test,
    filtered_user,
    filtered_item,
)

In [None]:
# Statistics over the three splits and the filtered user/item metadata.
# NOTE(review): get_df_stats comes from the stats module; presumably it only
# reports numbers and does not write files — confirm.
get_df_stats(
    df_train,
    df_val,
    df_test,
    filtered_user,
    filtered_item,
)

In [None]:
# Write the splits and filtered metadata via the project's general_save helper.
# data_name is rebound here: from now on it holds the output location passed
# to general_save rather than the raw dataset identifier.
data_name = "../cleaned_data/lfm-1b"
general_save(
    df_train,
    df_val,
    df_test,
    filtered_user,
    filtered_item,
    data_name,
)