In [1]:
# Enable IPython's autoreload extension in mode 2: all modules are re-imported
# before each cell executes, so edits to the project's `src` helpers take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Move the working directory one level up so that `src` is importable and the
# relative "data/..." paths used below resolve.
# NOTE(review): not idempotent - re-running this cell moves up one more level
# each time; run it exactly once per kernel session.
%cd ..

/home/iovcharenko/Documents/NotWork/UCU/liner-algebra/ucu-linear-algebra-final-project


In [3]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from pathlib import Path
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

from src.data import get_netflix_dataframe
from src.data import combine_dataframes
from src.data import generate_sparce_matrix

In [3]:
# Root of the data directory, relative to the current working directory;
# the generated subsets are written beneath it.
data_folder = Path("data")

In [4]:
# Parse the raw ratings file into one DataFrame; later cells rely on its
# `customer_id`, `movie_id`, `rating` and `date` columns.
# The path is built from `data_folder` so the data location is defined in one
# place; it is passed as `str` because the loader was previously given a plain
# string and its handling of Path objects is not visible from here.
df = get_netflix_dataframe(str(data_folder / "combined_data_1.txt"))

100%|██████████| 24058263/24058263 [00:14<00:00, 1673642.25it/s]


In [343]:
# Build three customers x movies subsets of the ratings with different
# sparsity levels and write each one to its own folder as CSV files.
subsets_folder = data_folder / "subsets"

# (output folder, (lower, upper) quantiles of ratings-per-customer to sample from).
# Customers who rated more movies give a denser (less sparse) subset.
subsets = [
    (subsets_folder / "high-sparsity", (0.50, 0.80)),  # 0.957 sparsity
    (subsets_folder / "mid-sparsity", (0.80, 0.90)),   # 0.905 sparsity
    (subsets_folder / "low-sparsity", (0.90, 0.95)),   # 0.855 sparsity
]

customer_amount = 1000
movie_amount = 1000

# The number of ratings per customer does not depend on the subset,
# so it is counted once instead of once per loop iteration.
ratings_per_customer = df.groupby("customer_id").size()

for folder_name, q in subsets:
    print(f"process folder: {folder_name}")

    # Re-seed for every subset so each one is reproducible on its own.
    np.random.seed(42)
    folder_name.mkdir(parents=True, exist_ok=True)

    # Keep only customers whose activity lies inside the quantile band (inclusive).
    lower_q, upper_q = np.quantile(ratings_per_customer.values, q)
    user_rates = ratings_per_customer[ratings_per_customer.between(lower_q, upper_q)]

    selected_customer_ids = np.random.choice(
        user_rates.index.values,
        customer_amount, replace=False,
    )

    # Ratings of the sampled customers, restricted to the `movie_amount`
    # movies those customers rated most often.
    small_df = df[df.customer_id.isin(selected_customer_ids)]
    selected_movie_ids = (
        small_df.groupby("movie_id").size()
        .sort_values(ascending=False)[:movie_amount]
        .index.values
    )

    # .copy() detaches the subset from `df` *before* any column is assigned, so
    # the writes below hit a real frame rather than a slice
    # (no SettingWithCopyWarning, no risk of writing to a temporary).
    small_df = small_df[small_df.movie_id.isin(selected_movie_ids)].copy()

    # Replace the original ids by dense 0..n-1 subset ids (position in the
    # sorted id list). A dict lookup replaces the former list.index() call,
    # which scanned the whole id list for every single row.
    customer_ids = small_df.customer_id.sort_values().unique().tolist()
    customer_subset_id = {netflix_id: pos for pos, netflix_id in enumerate(customer_ids)}
    small_df["customer_id"] = small_df.customer_id.map(customer_subset_id)

    movie_ids = small_df.movie_id.sort_values().unique().tolist()
    movie_subset_id = {netflix_id: pos for pos, netflix_id in enumerate(movie_ids)}
    small_df["movie_id"] = small_df.movie_id.map(movie_subset_id)

    small_df["date"] = pd.to_datetime(small_df.date)
    small_df = small_df.sort_values("date").reset_index(drop=True)

    # 80/20 split stratified by customer, so every customer's ratings are
    # divided between train and val in roughly that proportion.
    tr_idx, val_idx = train_test_split(
        small_df.index,
        test_size=0.2,
        shuffle=True,
        random_state=42,
        stratify=small_df.customer_id
    )
    small_df.loc[tr_idx, "split"] = "train"
    small_df.loc[val_idx, "split"] = "val"

    print(
        "shape: ",
        small_df.customer_id.unique().shape,
        small_df.movie_id.unique().shape
    )

    sparse_m = csr_matrix((
        small_df.rating.values,
        (small_df.customer_id.values, small_df.movie_id.values)
    ))

    # Share of zero cells, counted directly on the sparse matrix
    # (no dense copy of the whole matrix is needed).
    n_cells = sparse_m.shape[0] * sparse_m.shape[1]
    sparsity = (n_cells - sparse_m.count_nonzero()) / n_cells
    print(f"sparsity: {sparsity}")

    small_df.to_csv(folder_name / "records.csv", index=False)

    # Lookup tables from subset ids back to the original ids.
    pd.DataFrame({
        "subset_id": range(0, len(movie_ids)),
        "netflix_id": movie_ids,
    }).to_csv(folder_name / "movies.csv", index=False)

    # NOTE(review): the file name is misspelled ("custormers"); it is kept
    # unchanged because other code may load it by this name - confirm before renaming.
    pd.DataFrame({
        "subset_id": range(0, len(customer_ids)),
        "netflix_id": customer_ids,
    }).to_csv(folder_name / "custormers.csv", index=False)

process folder: data/subsets/hight-sparsity
shape:  (1000,) (1000,)
sparsity: 0.957375
process folder: data/subsets/mid-sparsity
shape:  (1000,) (1000,)
sparsity: 0.905062
process folder: data/subsets/low-sparsity
shape:  (1000,) (1000,)
sparsity: 0.855243


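## Custom train/test split

A greedy alternative to the stratified split above: validation rows are picked one at a time so that movies and customers are represented as evenly as possible.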

In [None]:
def _least_used_key(usage, rng):
    """Return a uniformly random key among those with the smallest usage count."""
    lowest = min(usage.values())
    return rng.choice([key for key, used in usage.items() if used == lowest])


def _pick_balanced_row(pool, by, other, by_usage, other_usage, rng):
    """Pick the label of one row of `pool` for the least-used value of column `by`.

    Among the rows of the chosen `by` value, the row whose `other` value has
    been used least so far is taken (first one on ties). Both usage counters
    are updated in place. Values of `by` with no rows left in `pool` are
    removed from `by_usage` and the next candidate is tried.

    Returns None when no value of `by` has any row left.
    """
    while by_usage:
        key = _least_used_key(by_usage, rng)
        partners = pool.loc[pool[by] == key, other]
        if partners.empty:
            # Exhausted: drop it, otherwise it would stay at the minimum
            # count and be selected (and skipped) on every later iteration.
            del by_usage[key]
            continue
        label = partners.map(other_usage).idxmin()
        by_usage[key] += 1
        other_usage[partners.loc[label]] += 1
        return label
    return None


def custom_train_test_split(df, test_size=0.2, seed=42, verbose=True):
    """Split rating rows into train/test so the test part is balanced.

    Test rows are chosen greedily. Every iteration first takes one row for the
    movie that has the fewest test rows so far, then one row for the customer
    that has the fewest test rows so far (ties broken at random). Within the
    chosen movie/customer, the row whose counterpart is least represented in
    the test set is preferred. The loop stops once the test share reaches
    `test_size` (it is checked once per iteration, so the result can overshoot
    by one row) or when no rows are left.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `movie_id` and `customer_id` columns. The index labels
        are assumed to be unique. The frame is not modified.
    test_size : float
        Target fraction of rows to move to the test set.
    seed : int
        Seed for the row shuffle and for random tie-breaking. The global
        `random` / `numpy.random` states are left untouched.
    verbose : bool
        Print progress after every iteration.

    Returns
    -------
    (train_index, test_index, fraction)
        Index labels of the remaining (train) rows, index labels of the test
        rows in the order they were picked, and the achieved test fraction.
    """
    total = len(df)
    if total == 0:
        # Nothing to split; also avoids dividing by zero below.
        return df.index[:0], df.index[:0], 0.0

    rng = random.Random(seed)
    # Shuffled copy of the rows: decides which row wins when usage counts tie.
    pool = df.sample(frac=1, random_state=seed)

    movie_usage = dict.fromkeys(pool.movie_id.unique(), 0)
    customer_usage = dict.fromkeys(pool.customer_id.unique(), 0)

    # One pass = one movie-driven pick followed by one customer-driven pick.
    steps = (
        ("movie_id", "customer_id", movie_usage, customer_usage, "skip movie select"),
        ("customer_id", "movie_id", customer_usage, movie_usage, "skip customer select"),
    )

    # Labels are collected in a list; growing a DataFrame row by row is
    # quadratic and DataFrame.append no longer exists in pandas >= 2.0.
    held_out = []

    while len(held_out) / total < test_size:
        progressed = False
        for by, other, by_usage, other_usage, skip_message in steps:
            label = _pick_balanced_row(pool, by, other, by_usage, other_usage, rng)
            if label is None:
                if verbose:
                    print(skip_message)
                continue
            held_out.append(label)
            pool = pool.drop(index=label)
            progressed = True

        # Evaluated per iteration: stop only if this very pass added nothing.
        if not progressed:
            if verbose:
                print("break, all skipped")
            break

        if verbose:
            print("fraction: ", len(held_out) / total)

    test_index = pd.Index(held_out, dtype=pool.index.dtype)
    return pool.index, test_index, len(held_out) / total

# Apply the balanced split to the subset built last, holding out 20% of its rows.
tr_idx, val_idx, frac = custom_train_test_split(small_df, test_size=0.2)

In [None]:
# Record the custom split in the frame (overwrites any earlier "split" labels).
small_df.loc[tr_idx, "split"] = "train"
small_df.loc[val_idx, "split"] = "val"


# Coverage check: number of distinct customers, then distinct movies,
# present in the train and in the val part (printed in that order).
for id_column in ("customer_id", "movie_id"):
    for split_name in ("train", "val"):
        in_split = small_df.split == split_name
        print(len(set(small_df.loc[in_split, id_column])))