In [None]:
import random
from pathlib import Path

import numpy as np
import pandas as pd

random.seed(102)

from datasketching.minhash import SimpleMinhash
from datasketching.minhash import murmurmaker

In [None]:
# Load the raw listening data.
df = pd.read_parquet("data/music.parquet")

# Remove rows whose band name (column "2") is longer than 60 characters.
name_too_long = df["2"].str.len() > 60
df = df.drop(index=df[name_too_long].index)

# Show a reproducible ten-row sample of what is left.
df.sample(10, random_state=1)

In [None]:
def generate_minhash_sig(user_dat, nhash):
    """Build a ``SimpleMinhash`` from every element of ``user_dat``.

    Parameters
    ----------
    user_dat : iterable
        Items to feed into the sketch, one ``add`` call per item, in
        iteration order.
    nhash
        Passed unchanged to the ``SimpleMinhash`` constructor
        (presumably the number of hash functions -- confirm against the
        ``datasketching`` package).

    Returns
    -------
    SimpleMinhash
        The populated sketch object.
    """
    signature = SimpleMinhash(nhash)
    for item in user_dat:
        signature.add(item)
    return signature

def unique_artists(df):
    """Return the distinct values of column ``'2'`` of ``df``.

    The values are returned in order of first appearance, as produced by
    ``Series.unique``.
    """
    artist_column = df['2']
    return artist_column.unique()

In [None]:
# Group the listening rows by user (column '0').
# A scalar key is used instead of the one-element list ['0']: with a
# length-1 list key, newer pandas versions expect `get_group` to be called
# with a 1-tuple, while later cells look groups up by the plain user id.
grouped_df = df.groupby('0')

# One array of distinct artists per user, then one MinHash signature per user
# built from that array with nhash=128. Both Series share the same index, so
# position i refers to the same user in each of them.
un_artists = grouped_df.apply(unique_artists)
mh_sigs = un_artists.apply(generate_minhash_sig, nhash=128)

# 1-based lookup: position in sorted user order -> user id.
# groupby sorts its keys by default, so entry i + 1 here corresponds to
# position i of un_artists / mh_sigs.
users = df['0'].unique()
dusers = dict(enumerate(sorted(set(users)), start=1))

In [None]:
# Build synthetic users from the real ones.
#
# For every real user we rank all other users by MinHash similarity, keep the
# closest ones as candidate "donors", and then create several synthetic users.
# Each synthetic user gets artists borrowed from a random subset of the donors
# (only artists the real user has not listened to) plus a sample of the real
# user's own artists. Play counts are drawn from the real user's per-artist
# play counts. Results are written to parquet in batches of real users.
N_NEIGHBOURS = 10     # closest real users kept as candidate donors
N_DONORS = 6          # donors drawn (without replacement) per synthetic user
N_SYNTHETIC = 50      # synthetic users created from every real user
DONOR_SHARE = 0.05    # artists taken per donor, as a share of the real user's artist count
OWN_SHARE = 0.7       # share of the real user's own artists that is kept
USERS_PER_FILE = 20   # real users processed between two parquet writes
OUTPUT_COLUMNS = ['user', 'artist', 'plays']

# random.seed() does not seed NumPy, so use a dedicated seeded generator to
# make every draw in this cell reproducible.
rng = np.random.default_rng(102)

# Plain lists give explicit positional access; integer indexing on a Series
# with a non-integer index relies on a deprecated positional fallback.
signatures = list(mh_sigs)
artist_lists = list(un_artists)

pending_frames = []   # synthetic users generated since the last write
users_in_batch = 0

for u in range(len(signatures)):
    own_artists = artist_lists[u]
    artists_listened = len(own_artists)
    to_sample = int(artists_listened * DONOR_SHARE)
    n_own = int(artists_listened * OWN_SHARE)

    # Similarity of every user to the CURRENT user (assumes `similarity`
    # returns a number -- confirm against the datasketching package).
    sim = np.array(
        [other.similarity(signatures[u]) for other in signatures], dtype=float
    )
    # Rank by position rather than by similarity value so that ties cannot
    # add or remove neighbours, and never let a user be its own neighbour.
    sim[u] = -np.inf
    ranked = np.argsort(-sim, kind="stable")
    similar_users = [int(i) for i in ranked[:N_NEIGHBOURS] if i != u]

    # Number of rows per artist for this user; used as the pool of play counts.
    user_play_fr = grouped_df.get_group(dusers[(u + 1)]).groupby(['2']).count()['1'].values

    for j in range(N_SYNTHETIC):
        username = 'user' + f"{u:03}" + f"{j:02}"

        donors = rng.choice(
            similar_users,
            size=min(N_DONORS, len(similar_users)),
            replace=False,
        )

        listened = []
        taken = set(own_artists)  # artists a donor must not contribute again
        for k in donors:
            possible = [a for a in artist_lists[k] if a not in taken]
            n_take = min(to_sample, len(possible))
            if n_take:
                # Sample positions, not values, so the artist objects keep
                # their original Python type.
                chosen = rng.choice(len(possible), size=n_take, replace=False)
                picked = [possible[i] for i in chosen]
                listened.extend(picked)
                taken.update(picked)

        # Keep a share of the user's own artists, each at most once.
        own_idx = rng.choice(artists_listened, size=n_own, replace=False)
        listened.extend(own_artists[i] for i in own_idx)

        ### now simulate user plays.
        # Draw without replacement while the pool is large enough; fall back
        # to replacement instead of failing when it is not.
        n_rows = len(listened)
        user_plays = rng.choice(
            user_play_fr, size=n_rows, replace=n_rows > len(user_play_fr)
        )

        pending_frames.append(
            pd.DataFrame(
                {'user': [username] * n_rows, 'artist': listened, 'plays': user_plays},
                columns=OUTPUT_COLUMNS,
            )
        )

    users_in_batch += 1
    if users_in_batch == USERS_PER_FILE:
        ### write a parquet file every USERS_PER_FILE real users, then start a new batch
        filename = 'data/userdat' + str(u) + '.parquet'
        print(filename)
        # Concatenate once per batch (not once per synthetic user) and reset
        # the index so row labels are unique within the file.
        pd.concat(pending_frames, ignore_index=True).to_parquet(filename)
        pending_frames = []
        users_in_batch = 0

# Expose whatever has not been written yet as `new_users`; the following cell
# writes it out. It is an empty frame when the last batch was complete.
if pending_frames:
    new_users = pd.concat(pending_frames, ignore_index=True)
else:
    new_users = pd.DataFrame(columns=OUTPUT_COLUMNS)

In [None]:
# Write the synthetic users left over from the last, partially filled batch.
# When the generation loop flushed on its final iteration, `new_users` is
# empty and `u` still names the file that flush produced; writing then would
# replace that file with an empty table, so the write is skipped.
if len(new_users) > 0:
    filename = 'data/userdat' + str(u) + '.parquet'
    print(filename)
    new_users.to_parquet(filename)

# Loading data
Load the generated user data and write it to parquet files grouped by artist; these files are smaller than files grouped by user.




In [None]:
# Names of the synthetic-user files written by the generation cells
# ('data/userdat<N>.parquet'). Listing them here keeps the cell runnable on a
# fresh kernel instead of depending on an `f` left over from an earlier session.
f = sorted(path.name for path in Path('data').glob('userdat*.parquet'))

# Read every file first and concatenate once; growing a frame inside the loop
# copies all previously loaded rows on every iteration.
frames = []
for j in f:
    frames.append(pd.read_parquet('data/' + j))

if frames:
    # ignore_index gives every row a unique label; the individual files can
    # carry overlapping index values.
    pseudo_data = pd.concat(frames, ignore_index=True)
else:
    pseudo_data = pd.DataFrame(columns=['user', 'artist', 'plays'])
print(pseudo_data.shape)

# NOTE: from here on `df` refers to the synthetic data, not the raw music data.
df = pseudo_data

In [None]:
## remove data that causes error
# Artist names containing these mis-encoded fragments are dropped
# (presumably they break the partitioned write further down -- confirm).
corrupt_artist_fragments = [
    'G\ufff6L￤',
    'ﾔﾵﾁﾸ\uffd0\uffd0\uffd0\uffd0\uffd0',
    '䐀攀愀琀栀\u2000昀爀漀洀\u2000䄀戀漀瘀攀\u2000\u3100㤀㜀㤀',
]

for fragment in corrupt_artist_fragments:
    # Filter with a boolean mask instead of dropping by index label: when the
    # index contains repeated labels, drop() would also delete unrelated rows
    # that merely share a label with a matching row.
    # regex=False matches the fragment literally; na=False keeps rows whose
    # artist is missing instead of raising on a non-boolean mask.
    is_corrupt = df['artist'].str.contains(fragment, regex=False, na=False)
    df = df[~is_corrupt]

In [None]:
import pyarrow
import pyarrow.parquet as pq

# Convert the cleaned play data to an Arrow table and write it as a parquet
# dataset rooted at 'partitioned_output.parquet', partitioned by the
# 'artist' column.
table = pyarrow.Table.from_pandas(df)

write_options = dict(
    root_path='partitioned_output.parquet',
    partition_cols=['artist'],
)
pq.write_to_dataset(table, **write_options)