# MS&E 234 Project

### Data cleaning

In [1]:
import pandas as pd
import numpy as np
import re
from random import sample
import os
import itertools
from collections import defaultdict
from tqdm.notebook import tqdm
import json

### Load data

In [2]:
df_raw = pd.DataFrame()
path = os.getcwd()
for i in range(1, 4+1):
    df_raw = pd.concat([df_raw, pd.read_csv(f'{path}/netflix-prize-kaggle-data/combined_data_{i}.txt',
        header=None,
        names=['CustomerID', 'Rating', 'Date'])])

df_raw

Unnamed: 0,CustomerID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
26851921,1790158,4.0,2005-11-01
26851922,1608708,3.0,2005-07-19
26851923,234275,1.0,2004-08-07
26851924,255278,4.0,2004-05-28


### Subset data as in paper (Calandrino 2011)

In [3]:
df = df_raw
print(f'Entire dataset: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

# Sample 10000 users
df = df[df['CustomerID'].isin(sample(df['CustomerID'].unique().tolist(), 10000))]
print(f"Subset of 10000 users: {df.shape[0]} ratings; {df['CustomerID'].nunique()} users")

# Assume each user makes a random 50% of transactions public
idxs = df[['CustomerID']].reset_index().groupby('CustomerID').agg({'index':lambda x: list(x)}).to_numpy().tolist()
drop_idxs = []
for i in range(len(idxs)):
    drop_idxs.append(sample(idxs[i][0], len(idxs[i][0]) // 2))
drop_idxs = list(itertools.chain(*drop_idxs))
df = df.drop(drop_idxs)
print(f'After making 50% private: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

# Only consider users with at least 100 public transactions
df = df[df['CustomerID'].isin(df['CustomerID'].value_counts()[df['CustomerID'].value_counts() > 100].index)]
print(f'Filter for users with >= 100 public transactions: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

# Subset the data to analyze only ratings from July 2005
df = df[df['Date'].apply(lambda x: type(x) == str and bool(re.match(r'2005-07.*', x)))]
print(f'Only July 2005: {df.shape[0]} ratings; {df["CustomerID"].nunique()} users')

Entire dataset: 100498277 ratings; 497959 users
Subset of 10000 users: 2036327 ratings; 10000 users
After making 50% private: 991486 ratings; 9984 users
Filter for users with >= 100 public transactions: 759010 ratings; 2971 users
Only July 2005: 29960 ratings; 1858 users


In [4]:
df

Unnamed: 0,CustomerID,Rating,Date
1364,2369855,4.0,2005-07-13
3194,501823,5.0,2005-07-10
4052,1504575,2.0,2005-07-22
4856,1333911,4.0,2005-07-19
6009,1987812,3.0,2005-07-11
...,...,...,...
26830900,403590,4.0,2005-07-15
26838254,891357,5.0,2005-07-30
26838922,2002963,3.0,2005-07-16
26846406,2260860,3.0,2005-07-14


### TODO - Mac Comments

In [None]:
# Sanity check: there are ~1570 such users
# Sanity check: there are around 1510 transactions during the period in question 

# TODO there are way more transactions - how do we get only ~1 transaction per user per month, 
# when we are filtering for users that recommend many movies? Seems plausible that these users are 
# making ~20 transactions/month...

# Restrict the attack to (customer, date) pairs in which the customer made 5 or fewer transactions: 
# TODO I think this means only compute prediction accuracy on these pairs, rather than removing them before training

In [5]:
# na's represent the number of movies in the dataset
df_raw.isna().sum()

CustomerID        0
Rating        17770
Date          17770
dtype: int64

In [6]:
# extract movie ID's from raw data, combine these with df
# The data files follow the following format:
# Movie ID:
# CustomerID, Rating, Date
# ...

movieDF = df_raw[df_raw['Rating'].isnull()]

In [7]:
# simulate recommendation system with full knowledge of July's activity
july_data = df_raw[df_raw['Date'].apply(lambda x: type(x) == str and bool(re.match(r'2005-07.*', x)))]

In [8]:
movieRows = np.array(movieDF.index)
movieIDs = []
currIdx = 0
for row in july_data.itertuples(index = True, name = 'Pandas'):
    currRow = row.Index
    while currIdx < len(movieRows) - 1:
        nextMovieRow = movieRows[currIdx + 1]
        if currRow > nextMovieRow:
            currIdx += 1
        else:
            break
    movieIDs.append(currIdx + 1) # since index of movie IDs starts at 1

In [9]:
july_data['MovieIDs'] = movieIDs
july_data['Day'] = pd.DatetimeIndex(july_data['Date']).day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  july_data['MovieIDs'] = movieIDs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  july_data['Day'] = pd.DatetimeIndex(july_data['Date']).day


In [11]:
# input: 2 sets x, y
# output: for binary vectors x and y: cosine similarity = |x and y| / sqrt(|x||y|)
def getCosSim(x, y):
    return len(x.intersection(y)) / np.sqrt(len(x) * len(y))

In [12]:
# input: sparse binary adjList
# output: dict of the 50 most similar items and scores, in format: dict[movieID] => ([(items, scores)])
def getSimListDaily(adjList):
    simList = {}
    for key1 in adjList.keys():
        cosSims = []
        for key2 in adjList.keys():
            if key1 == key2:
                continue
            cosSim = getCosSim(adjList[key1], adjList[key2])
            cosSims.append((key2, cosSim))
        simList[key1] = sorted(cosSims, key = lambda x: (-x[1], x[0]))[:50]
    return simList

In [13]:
# input: dataframe of customerIDs, Ratings, Day of Month, and MovieIDs
# input dataframe should contain both public and private data, as the recc system uses all available information
# output: list of 50 related movies and similarity ratings per day
# on each consecutive day, more data is used by the recc system
# uses cosine similarity on a binary matrix
def getSimListMonthly(df):
    # store binary matrix in sparse adjacency list format
    # adjList[movieID] => set of customerIDs
    adjList = defaultdict(set)
    
    simLists = []
    
    for day in sorted(df["Day"].unique()):
        # add new movies from today to adjList
        currDF = df[df["Day"] == day]
        for row in currDF.itertuples(index = True, name = 'Pandas'):
            adjList[row.MovieID].add(row.CustomerID)
        
        # compute similarity scores
        currSimList = getSimListDaily(adjList)
        simLists.append(currSimList)
    return simLists

In [None]:
# generates the similarity lists for all movies in July
# saves list to a JSON file for easy use

'''simLists = getSimListMonthly(july_data)
simListsFile = 'july_simLists.json'
with open(f'{path}/{simListsFile}', 'w+') as rec_sys:
    json.dump(simLists, rec_sys)'''

In [38]:
# reads JSON file with similarity lists for all movies in July
sim_list_file = 'july_simLists.json'
july_sim_list = []
with open(f'{path}/{sim_list_file}', 'r') as rec_sys:
    july_sim_list = json.load(rec_sys, parse_int=int)

# converts string JSON keys to ints (JSON format saves all keys as strings)
for day in range(len(july_sim_list)):
    july_sim_list[day] = {int(k):[tuple(i) for i in v] for k,v in july_sim_list[day].items()}

In [39]:
# shows the top 5 movies related to movie id (second value) on the last day of July 2005
july_sim_list[30][2][0:5]

[(2721, 0.10314212462587934),
 (2625, 0.0842151921066519),
 (4082, 0.0842151921066519),
 (355, 0.07293249574894728),
 (3907, 0.07293249574894728)]

## Part 3: Inference Algorithm

### Questions, TODOs, Comments

In [None]:
user_inferences = {}
observation_window = 1

In [None]:
# generate list of unique user ids
users = df['CustomerID'].unique()
for user in users:
    auxiliary_info = df[df['CustomerID'] == user]['MovieID'].unique()

### TODO - Viet Comments 2/27

added code to add movieIDs to df

wrote code to generate daily top 50 cosine similarities + scores

regarding Mac's comment: 
"Sanity check: there are around 1510 transactions during the period in question"
"TODO there are way more transactions - how do we get only ~1 transaction per user per month, 
when we are filtering for users that recommend many movies? Seems plausible that these users are making ~20 transactions/month..."

I think the paper made a mistake - there are ~ 35K transactions per month, maybe the 1.5K is referring to daily transactions? I am getting 6.4K (customer, date) pairs.

# for whoever is generating the plots: make sure the train and test sets are properly built.

currently the df generated removes 50% of entries.

When generating the similarity lists, we want to keep 100% of entries.

When making predictions, we get to look at 50% of entries.

In [None]:
df.groupby(["CustomerID", "Day"]).sum()