# Data processing

In [8]:
import json
import pandas as pd

## 1. Reformatting JSON data to csv
In the previous steps we collected user-streamer follower data in JSON format. We'll need to convert this to csv in order to work with scipy sparse matrices and fit our models.

In [2]:
# Load the raw follower data collected in the previous step.
# The encoding is given explicitly because streamer names contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open("data/user_follows.json", "r", encoding="utf-8") as f:
    user_data = json.load(f)

In [3]:
# user_data has one top-level entry per user, so its length is the user count.
print("Number of users:", len(user_data))

Number of users: 15997


In [4]:
# user_data maps each user ID to a record shaped like
# {"total": 85, "following": [[streamer_name1, time_followed1], ...]}.
# Show the first user ID as an example key.
list(user_data)[0]

'500301825'

In [5]:
# Field names stored in one example user's record.
list(user_data['500301825'])

['total', 'following']

In [6]:
# First five [streamer_name, followed_at] pairs for the example user.
user_data['500301825']['following'][:5]

[['hakumai', '2020-11-05T09:16:30Z'],
 ['DesertHeartsRecords', '2020-11-05T09:13:39Z'],
 ['진진스', '2020-11-01T17:38:39Z'],
 ['DanTheLionTV', '2020-10-21T18:02:03Z'],
 ['HAchubby', '2020-10-06T20:01:48Z']]

In [9]:
# Subset each user's follows to streamers that appear in the top English
# streamers file, producing one [user_ID, streamer_name, followed_at] row per
# retained follow.
top_eng = pd.read_csv("data/streamer_info_eng.csv")["name"].values

# Membership tests against a NumPy array scan the whole array every time;
# a set gives constant-time lookups with the same result.
top_eng_set = set(top_eng)

all_data = []
for user_ID, record in user_data.items():
    # extend() grows the list in place; `all_data = all_data + ...` would copy
    # the entire accumulated list on every iteration (quadratic overall).
    all_data.extend(
        [user_ID, *follow]
        for follow in record["following"]
        if follow[0] in top_eng_set
    )

df = pd.DataFrame(data=all_data, columns=["user_ID", "streamer_name", "followed_at"])

In [10]:
# Preview the first rows of the per-follow table.
df.head()

Unnamed: 0,user_ID,streamer_name,followed_at
0,500301825,DesertHeartsRecords,2020-11-05T09:13:39Z
1,500301825,HAchubby,2020-10-06T20:01:48Z
2,500301825,NoHaileeNo,2020-07-14T08:58:40Z
3,500301825,melina,2020-06-21T15:22:03Z
4,500301825,EsfandTV,2020-03-19T01:36:38Z


In [11]:
# (number of follow rows, number of columns)
df.shape

(288713, 3)

In [12]:
# Persist the follow table; index=False keeps the row index out of the file.
df.to_csv("data/user_follows.csv", index=False)

## 2. Removing users and items with insufficient data
We remove users with insufficient items (<= 4) and then items with insufficient users (<= 4) because we assume we will not be able to make accurate predictions for these.

Note: removing items with <= 4 users may cause some users to again have <= 4 items. We could have repeated this process until no "bad" users or items remained; however, we checked after one iteration and only 3 users were left as "bad", so we decided to stop here.

In [13]:
# Re-read the saved follow table so this cell always starts from the same
# input, no matter how many times it is re-run.
df = pd.read_csv("data/user_follows.csv")

# Users/streamers with <= 4 follows are dropped, i.e. at least 5 are required.
# The same constant drives both the filters and the printed labels so the
# messages cannot drift from what the code actually does.
MIN_FOLLOWS = 5

# 1. Removing users with insufficient items
df_group = df.groupby("user_ID")
df_num_follows = df_group.size()  # rows (follows) per user
print("Number of unique users:", len(df_num_follows))
print(
    f"Number of users that are following at least {MIN_FOLLOWS} streamers",
    (df_num_follows >= MIN_FOLLOWS).sum(),
)
print("\n")
good_users = df_num_follows[df_num_follows >= MIN_FOLLOWS].index
df = df[df["user_ID"].isin(good_users)]

# 2. Removing items with insufficient users
# Counts are taken after the user filter above, so they only include follows
# from retained users.
df_group = df.groupby("streamer_name")
df_num_follows = df_group.size()  # rows (follows) per streamer
print("Number of unique streamers:", len(df_num_follows))
print(
    f"Number of streamers that have at least {MIN_FOLLOWS} follows",
    (df_num_follows >= MIN_FOLLOWS).sum(),
)
good_streamers = df_num_follows[df_num_follows >= MIN_FOLLOWS].index
df = df[df["streamer_name"].isin(good_streamers)]

df.shape

Number of unique users: 15994
Number of users that are following at least 4 streamers 10985


Number of unique streamers: 1974
Number of streamers that have at least 4 follows 1904


(277298, 3)

In [14]:
# Save the filtered follow table to a separate file, without the row index.
df.to_csv("data/user_follows-2.csv", index=False)