# MS&E 234 Project

### Data cleaning

In [38]:
import pandas as pd
import numpy as np
import re
from random import sample
import os
import itertools
from collections import defaultdict,Counter
from tqdm.notebook import tqdm
import pickle

### Step 1: Load data

In [14]:
df_raw = pd.DataFrame()
path = os.getcwd()
for i in range(1, 4+1):
    df_raw = pd.concat([df_raw, pd.read_csv(f'{path}/netflix-prize-kaggle-data/combined_data_{i}.txt',
        header=None,
        names=['CustomerID', 'Rating', 'Date'])])

df_raw

Unnamed: 0,CustomerID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
26851921,1790158,4.0,2005-11-01
26851922,1608708,3.0,2005-07-19
26851923,234275,1.0,2004-08-07
26851924,255278,4.0,2004-05-28


### Step 2: Generate Training & Testing Datasets

In [26]:
def generate_dataset(df, users, train):
    df = df[df['CustomerID'].isin(users)]
    print(f"Subset of 10000 users: {df.shape[0]} ratings; {df['CustomerID'].nunique()} users")

    if train:
        # assume 50% of the transactions are public
        idxs = df[['CustomerID']].reset_index().groupby('CustomerID').agg({'index':lambda x: list(x)}).to_numpy().tolist()
        drop_idxs = []
        for i in range(len(idxs)):
            drop_idxs.append(sample(idxs[i][0], len(idxs[i][0]) // 2))
        drop_idxs = list(itertools.chain(*drop_idxs))
        print(f'After making 50% private: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')
    
        # only consider users with at least 100 public transactions
        df = df[df['CustomerID'].isin(df['CustomerID'].value_counts()[df['CustomerID'].value_counts() >= 100].index)]
        print(f'Filter for users with >= 100 public transactions: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

    # Subset the data to analyze only ratings from July 2005
    df = df[df['Date'].apply(lambda x: type(x) == str and bool(re.match(r'2005-07.*', x)))]
    print(f'Only July 2005: {df.shape[0]} ratings; {df["CustomerID"].nunique()} users')

    return df

In [18]:
# extract movie ID's from raw data, combine these with df
# The data files follow the following format:
# Movie ID:
# CustomerID, Rating, Date
# ...
def add_movie_info(df, movie_df):
    # get movie id boundaries
    movie_rows = np.array(movie_df.index)

    # create a column with movie ids to add to df
    movie_ids = []
    curr_idx = 0
    for row in df.itertuples(index = True, name = 'Pandas'):
        curr_row = row.Index
        while curr_idx < len(movie_rows) - 1:
            next_movie_row = movie_rows[curr_idx + 1]
            if curr_row > next_movie_row:
                curr_idx += 1
            else:
                break
        movie_ids.append(curr_idx + 1) # since index of movie IDs starts at 1
    
    # add movie ids and days
    df['MovieID'] = movie_ids
    df['Day'] = pd.DatetimeIndex(df['Date']).day

    return df

In [20]:
# get movie ids
movie_df = df_raw[df_raw['Rating'].isnull()]

In [27]:
# generate a random sample of users
users = sample(df_raw.dropna()['CustomerID'].unique().tolist(), 10000)
train = generate_dataset(df_raw, users, train=True)
train = add_movie_info(train, movie_df)

Subset of 10000 users: 2071856 ratings; 10000 users
After making 50% private: 2071856 ratings; 10000 users
Filter for users with >= 100 public transactions: 1855510 ratings; 4917 users
Only July 2005: 84211 ratings; 2984 users


In [30]:
test = generate_dataset(df_raw, users, train=False)
test = add_movie_info(test, movie_df)

Subset of 10000 users: 2071856 ratings; 10000 users
Only July 2005: 99164 ratings; 4384 users


In [39]:
# save files for easy loading
train.to_csv(path + '/train.csv')
test.to_csv(path + '/test.csv')

### Step 3: Generate Related Movies' Lists

In [31]:
# input: 2 sets x, y
# output: for binary vectors x and y: cosine similarity = |x and y| / sqrt(|x||y|)
def getCosSim(x, y):
    return len(x.intersection(y)) / np.sqrt(len(x) * len(y))

In [32]:
# input: sparse binary adjList
# output: dict of the 50 most similar items and scores, in format: dict[movieID] => ([(items, scores)])
def getSimListDaily(adjList):
    simList = {}
    for key1 in adjList.keys():
        cosSims = []
        for key2 in adjList.keys():
            if key1 == key2:
                continue
            cosSim = getCosSim(adjList[key1], adjList[key2])
            cosSims.append((key2, cosSim))
        simList[key1] = sorted(cosSims, key = lambda x: (-x[1], x[0]))[:50]
    return simList

In [34]:
# input: dataframe of customerIDs, Ratings, Day of Month, and MovieIDs
# input dataframe should contain both public and private data, as the recc system uses all available information
# output: list of 50 related movies and similarity ratings per day
# on each consecutive day, more data is used by the recc system
# uses cosine similarity on a binary matrix
def getSimListMonthly(df):
    adjList = defaultdict(set)
    
    simLists = []
    
    for day in sorted(df["Day"].unique()):
        # add new movies from today to adjList
        currDF = df[df["Day"] == day]
        for row in currDF.itertuples(index = True, name = 'Pandas'):
            adjList[row.MovieID].add(row.CustomerID)
        
        # compute similarity scores
        currSimList = getSimListDaily(adjList)
        simLists.append(currSimList)
    return simLists

In [35]:
sim_list = getSimListMonthly(test)

In [37]:
# shows the top 5 movies related to movie id (second value) on the last day of July 2005
sim_list[30][8][0:5]

[(3797, 0.18257418583505536),
 (534, 0.12909944487358055),
 (774, 0.12909944487358055),
 (1114, 0.12909944487358055),
 (1355, 0.12909944487358055)]

In [40]:

# save sim_list for easy usage
sim_list_file = path + '/sim_list.pkl'
with open(sim_list_file, 'wb') as f:
    pickle.dump(sim_list, f)

## Part 3: Inference Algorithm

### Questions, TODOs, Comments
- For the first observation window, should we be adding all 50 related movies to the window? Maybe we should set a similarity threshold for the value?
- Should this be per transaction or per film (some users watch the same film multiple times)?
- How should we handle auxiliary items that end up back in the inferred items?

In [42]:
# read in files
path = os.getcwd()
train = pd.read_csv(path + '/train.csv')
test = pd.read_csv(path + '/test.csv')

sim_list_file = path + '/sim_list.pkl'
sim_list = None
with open(sim_list_file, 'rb') as f:
    sim_list = pickle.load(f)

In [222]:
user_inferences = {}
observation_window = 1
threshold = 0.50 # a movie is considered an inference if it shows up in majority of the similarity lists for the auxiliary information

In [223]:
# generate list of unique user ids
users = df['CustomerID'].unique()
sample_users = users[0:5]

In [227]:
for user in users:
    user_inferences[user] = []
    auxiliary_info = df.loc[df['CustomerID'] == user, 'MovieID'].unique()
    known_movie_count = 0
    for curr_window in range(0, 31, observation_window):
        window_deltas = []
        # print(f'Current window: {curr_window}')
        for known_movie in auxiliary_info:
            movie_deltas = []
            prev_window = curr_window - observation_window
            # print(f'Prev window: {prev_window}')
            # option 1: the movie has been seen by the rec system before in the daily rankings
            if curr_window > 0 and known_movie in july_sim_list[prev_window].keys():
                # print(f'Movie {known_movie} is in July {curr_window + 1}\'s rec system')
                known_movie_count += 1
                # get movie rankings
                prev_rankings = [movie_sim[0] for movie_sim in july_sim_list[prev_window][known_movie]]
                curr_rankings = [movie_sim[0] for movie_sim in july_sim_list[curr_window][known_movie]]
                # print(f'Movie #{known_movie} previous rankings: {prev_rankings}')
                # print(f'Movie #{known_movie} current rankings: {curr_rankings}')

                # get movies in both simlarity list for the previous window and current window
                target_movies = set(curr_rankings).intersection(set(prev_rankings))
                # print(f'Movie #{known_movie} target movies: {target_movies}')
                
                # add any movies that were added to rankings for the current window
                movie_deltas = [movie for movie in curr_rankings if movie not in target_movies]
                # print(f'Movie #{known_movie} new movies added to ranking: {movie_deltas}')

                # get movies that increased their similarity from the previous window
                movie_deltas += [movie for movie in target_movies if movie in curr_rankings and (curr_rankings.index(movie) - prev_rankings.index(movie)) < 0] # note: higher index means moving down the ranking
                # print(f'Movie #{known_movie} had the following increases: {movie_deltas}')

                window_deltas += movie_deltas

            # option 2: we are looking the first day of the observation period
            elif curr_window >= 0 and known_movie in july_sim_list[curr_window].keys():
                # print(f'Movie {known_movie} was added on July {curr_window + 1}')
                known_movie_count += 1
                # add all of the target items because the rec sys is populating this list for the first time
                movie_deltas += [movie_sim[0] for movie_sim in july_sim_list[curr_window][known_movie]]
                # print(f'Movie {known_movie} generated these related movies: {movie_deltas}')

                window_deltas += movie_deltas
            else:
                # print(f'Movie {known_movie} is not in July {curr_window + 1}\'s rec system')
                continue

        
        target_freq = Counter(window_deltas)
        # print(f'For a total of {known_movie_count} movies, Here are the frequencies for July {observation_window}: {target_freq}')
        updated_inferences = user_inferences[user]
        
        updated_inferences += ([movie for movie,count in target_freq.items() if count/known_movie_count >= threshold])
        # print(f'The inference list now looks like: {updated_inferences}')

        # removes any duplicates before saving
        user_inferences[user] = list(set(updated_inferences))


In [230]:
user_inferences

{'2369855': [3712,
  516,
  3209,
  4239,
  8975,
  3857,
  917,
  3349,
  8727,
  9109,
  2589,
  2594,
  4388,
  1702,
  1703,
  8742,
  4391,
  4011,
  2347,
  47,
  688,
  2867,
  3892,
  4148,
  3132,
  9149,
  62,
  832,
  2112,
  4419,
  9027,
  3397,
  1860,
  1095,
  9159,
  75,
  2251,
  1742,
  8783,
  2390,
  2268,
  8927,
  17769,
  620,
  4206,
  3311,
  251,
  4340,
  1782,
  3959,
  635,
  3196],
 '501823': [2, 4, 7, 11, 8651, 12, 15, 16, 17, 18, 24, 25, 27, 28],
 '1504575': [0,
  1,
  2,
  3,
  4,
  5,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  3092,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  1699,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  43,
  2861,
  44,
  321,
  8651,
  605,
  17769],
 '1333911': [0,
  1,
  2,
  3,
  4,
  8834,
  5,
  7,
  11,
  12,
  13,
  15,
  16,
  17,
  18,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  31,
  32,
  34,
  35,
  36,
  37,
  39,
  43,
  44,
  