# MS&E 234 Project

### Data cleaning

In [1]:
import pandas as pd
import numpy as np
import re
from random import sample
import os
import itertools
from collections import defaultdict,Counter
from tqdm.notebook import tqdm
import pickle

### Step 1: Load data

In [6]:
df_raw = pd.DataFrame()
path = os.getcwd()
for i in range(1, 4+1):
    df_raw = pd.concat([df_raw, pd.read_csv(f'{path}/netflix-prize-kaggle-data/combined_data_{i}.txt',
        header=None,
        names=['CustomerID', 'Rating', 'Date'])])
df_raw = df_raw.reset_index(drop = True)
df_raw

Unnamed: 0,CustomerID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
100498272,1790158,4.0,2005-11-01
100498273,1608708,3.0,2005-07-19
100498274,234275,1.0,2004-08-07
100498275,255278,4.0,2004-05-28


### Step 2: Generate Training & Testing Datasets

In [21]:
def generate_dataset(df, users, train):
    df = df[df['CustomerID'].isin(users)]
    print(f"Subset of 10000 users: {df.shape[0]} ratings; {df['CustomerID'].nunique()} users")

    if (train == True):
        # assume 50% of the transactions are public
        idxs = df[['CustomerID']].reset_index().groupby('CustomerID').agg({'index':lambda x: list(x)}).to_numpy().tolist()
        drop_idxs = []
        for i in range(len(idxs)):
            drop_idxs.append(sample(idxs[i][0], len(idxs[i][0]) // 2))
        drop_idxs = list(itertools.chain(*drop_idxs))
        df = df.drop(drop_idxs)
        print(f'After making 50% private: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')
    
        # only consider users with at least 100 public transactions
        df = df[df['CustomerID'].isin(df['CustomerID'].value_counts()[df['CustomerID'].value_counts() >= 100].index)]
        print(f'Filter for users with >= 100 public transactions: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

    # Subset the data to analyze only ratings from July 2005
    df = df[df['Date'].apply(lambda x: type(x) == str and bool(re.match(r'2005-07.*', x)))]
    print(f'Only July 2005: {df.shape[0]} ratings; {df["CustomerID"].nunique()} users')

    return df

In [4]:
# extract movie ID's from raw data, combine these with df
# The data files follow the following format:
# Movie ID:
# CustomerID, Rating, Date
# ...
def add_movie_info(df, movie_df):
    # get movie id boundaries
    movie_rows = np.array(movie_df.index)

    # create a column with movie ids to add to df
    movie_ids = []
    curr_idx = 0
    for row in df.itertuples(index = True, name = 'Pandas'):
        curr_row = row.Index
        while curr_idx < len(movie_rows) - 1:
            next_movie_row = movie_rows[curr_idx + 1]
            if curr_row > next_movie_row:
                curr_idx += 1
            else:
                break
        movie_ids.append(curr_idx + 1) # since index of movie IDs starts at 1
    
    # add movie ids and days
    df['MovieID'] = movie_ids
    df['Day'] = pd.DatetimeIndex(df['Date']).day

    return df

In [5]:
# get movie ids
movie_df = df_raw[df_raw['Rating'].isnull()]

In [30]:
# generate a random sample of users
users = sample(df_raw.dropna()['CustomerID'].unique().tolist(), 10000)
train = generate_dataset(df_raw, users, train=True)
train = add_movie_info(train, movie_df)

# get test data with users that have over 100 public transactions
valid_users = set(train['CustomerID'].unique())
test = generate_dataset(df_raw, users, train=False)
test = test[test['CustomerID'].isin(valid_users)]
test = add_movie_info(test, movie_df)

Subset of 10000 users: 2090975 ratings; 10000 users
After making 50% private: 1047993 ratings; 10000 users
Filter for users with >= 100 public transactions: 812618 ratings; 3197 users
Only July 2005: 35224 ratings; 2027 users
Subset of 10000 users: 2090975 ratings; 10000 users
Only July 2005: 101093 ratings; 4493 users


In [32]:
# testing to see double the number of transactions but the same users
print(len(train))
print(len(test))
print(train['CustomerID'].nunique())
print(test['CustomerID'].nunique())

35224
70004
2027
2027


In [33]:
# save files for easy loading
train.to_csv(path + '/train.csv')
test.to_csv(path + '/test.csv')

### Step 3: Generate Related Movies' Lists

In [2]:
# input: 2 sets x, y
# output: for binary vectors x and y: cosine similarity = |x and y| / sqrt(|x||y|)
def getCosSim(x, y):
    return len(x.intersection(y)) / np.sqrt(len(x) * len(y))

In [3]:
# input: sparse binary adjList
# output: dict of the 50 most similar items and scores, in format: dict[movieID] => ([(items, scores)])
def getSimListDaily(adjList):
    simList = {}
    for key1 in adjList.keys():
        cosSims = []
        for key2 in adjList.keys():
            if key1 == key2:
                continue
            cosSim = getCosSim(adjList[key1], adjList[key2])
            cosSims.append((key2, cosSim))
        simList[key1] = sorted(cosSims, key = lambda x: (-x[1], x[0]))[:50]
    return simList

In [36]:
# input: dataframe of customerIDs, Ratings, Day of Month, and MovieIDs
# input dataframe should contain both public and private data, as the recc system uses all available information
# output: list of 50 related movies and similarity ratings per day
# on each consecutive day, more data is used by the recc system
# uses cosine similarity on a binary matrix
def getSimListMonthly(df):
    adjList = defaultdict(set)
    
    simLists = []
    
    for day in sorted(df["Day"].unique()):
        # add new movies from today to adjList
        currDF = df[df["Day"] == day]
        for row in currDF.itertuples(index = True, name = 'Pandas'):
            adjList[row.MovieID].add(row.CustomerID)
        
        # compute similarity scores
        currSimList = getSimListDaily(adjList)
        simLists.append(currSimList)
    return simLists

In [37]:
sim_list = getSimListMonthly(test)

In [39]:
# shows the top 5 movies related to movie id (second value) on the last day of July 2005
sim_list[30][30][0:5]

[(2580, 0.3698977214370539),
 (1542, 0.3501746458678807),
 (1180, 0.3206947914353012),
 (758, 0.3125),
 (2391, 0.3125)]

In [40]:

# save sim_list for easy usage
sim_list_file = path + '/sim_list.pkl'
with open(sim_list_file, 'wb') as f:
    pickle.dump(sim_list, f)

## Part 3: Inference Algorithm

### Questions, TODOs, Comments
- For the first observation window, should we be adding all 50 related movies to the window? Maybe we should set a similarity threshold for the value?
- Should this be per transaction or per film (some users watch the same film multiple times)?
- How should we handle auxiliary items that end up back in the inferred items?

In [2]:
# read in files
path = os.getcwd()
train = pd.read_csv(path + '/train.csv', index_col=0)
test = pd.read_csv(path + '/test.csv', index_col=0)

sim_list_file = path + '/sim_list.pkl'
sim_list = None
with open(sim_list_file, 'rb') as f:
    sim_list = pickle.load(f)

In [3]:
print(f'Train set: {len(train)} transactions')
print(f'Test set: {len(test)} transactions')

Train set: 35224 transactions
Test set: 70004 transactions


In [4]:

# group data into transactions -> (user, day)
transaction_counts = test.groupby(['CustomerID', 'Day']).count()
print(f'Number of transactions: {len(transaction_counts)}')

# remove users with more than 5 transactions
transaction_counts = transaction_counts[transaction_counts['Rating']  <= 5]
print(f'Number of (customer, date) with <= 5 occurrences: {len(transaction_counts)}')
transaction_counts = transaction_counts.reset_index()

# reset the indices
train = train.reset_index()
test = test.reset_index()

print(f'Train set with users with <= 5 transactions/day: {len(train)} transactions')
print(f'Test set with users with have <= 5 transactions/day: {len(test)} transactions')


Number of transactions: 9040
Number of (customer, date) with <= 5 occurrences: 7336
Train set with users with <= 5 transactions/day: 35224 transactions
Test set with users with have <= 5 transactions/day: 70004 transactions


In [5]:
users = train['CustomerID'].unique()
sample_users = users[10:15]

In [34]:
inferences = {}
observation_period = 1
threshold = 0.5

In [35]:
for user in users: 
    # get transactions such that user has no more than 5 per day
    valid_days = transaction_counts[transaction_counts['CustomerID'] == user]['Day'].values
    
    if len(valid_days) == 0:
        continue

    # print(valid_days)
    valid_transactions = train[(train['CustomerID'] == user) & (train['Day'].isin(valid_days))]
    # print(valid_transactions)
    
    # get auxillary information
    aux = set()
    aux_idx = list(valid_transactions['MovieID'].unique())
    # print(f'Mapping for movie indices: {aux_idx}')
    num_aux = len(aux_idx)

    # setup delta matrix + inferences
    delta_matrix = {}
    inferences[user] = set()

    for day in range(min(valid_days), 31, observation_period):
        # check to see if there are new movies to add
        if day in valid_days:
            new_movies = list(valid_transactions[valid_transactions['Day'] == day]['MovieID'])
            aux.update(new_movies)
            # print(f'On day {day}, user {user} watched {aux}.')

        # get related list for this movie on this day
        for movie in aux:
            # get related items for each movie
            movie_sim_list = sim_list[day][movie]
            target_items = [related_movie[0] for related_movie in movie_sim_list]
            # print(f'These are the target items for Movie {movie}: {target_items}')
            # get index for accessing matrix
            movie_idx = aux_idx.index(movie)
            # print(f'This is the index for Movie {movie}: {movie_idx}')
            
            # iterate through the related items and update the position in each array
            for new_pos, item in enumerate(target_items):
                if item not in delta_matrix:
                    delta_matrix[item] = [()] * num_aux
                    delta_matrix[item][movie_idx] = (new_pos + 1, new_pos + 1)
                # print(f'Movie {item} was added to the matrix: {delta_matrix}')
                elif item in delta_matrix and len(delta_matrix[item][movie_idx]) != 0:
                    prev_pos = delta_matrix[item][movie_idx][0]
                    pos_change = prev_pos - (new_pos + 1)
                    delta_matrix[item][movie_idx] = (new_pos + 1, pos_change)
                else:
                    delta_matrix[item][movie_idx] = (new_pos + 1, new_pos + 1)
            
            dropped_movies = [item for item in delta_matrix if item not in target_items and len(delta_matrix[item][movie_idx]) != 0]
            for dropped_movie in dropped_movies:
                delta_matrix[dropped_movie][movie_idx] = ()
            # print(f'Movie {movie} has these related items: {target_items}')
            # print(f'Movie {movie} generated this delta matrix: {delta_matrix}')
            # print(f'Movie {movie} dropped these items: {dropped_movies}')

        # generate scores for each item
        for target in delta_matrix:
            target_changes = [delta for delta in delta_matrix[target] if len(delta) != 0]

            if len(target_changes) != 0:
                num_pos_changes = [pos for pos in target_changes if pos[1] > 0]
                # print(f'Target movie {target} has the following changes: {num_pos_changes}')
                if (len(num_pos_changes)/num_aux) >= threshold:
                    inferences[user].add(target)
        # print(f'This is the resulting inferences for user {user}: {inferences[user]}')


In [36]:
inferences

{2422360: set(),
 2147714: {30,
  197,
  299,
  313,
  329,
  334,
  406,
  468,
  571,
  607,
  758,
  851,
  886,
  985,
  1073,
  1110,
  1144,
  1145,
  1180,
  1220,
  1256,
  1307,
  1406,
  1428,
  1642,
  1719,
  1865,
  1905,
  1962,
  2095,
  2112,
  2122,
  2152,
  2172,
  2192,
  2200,
  2290,
  2342,
  2372,
  2391,
  2430,
  2452,
  2782,
  2862,
  2913,
  3079,
  3106,
  3138,
  3151,
  3282,
  3290,
  3333,
  3427,
  3538,
  3610,
  3624,
  3756,
  3825,
  3860,
  3864,
  3917,
  3925,
  3938,
  3962,
  4123,
  4306,
  4315,
  4432,
  8687,
  8782,
  8904,
  8915,
  9051,
  9111,
  9170,
  9188},
 357140: {30,
  607,
  758,
  1110,
  1220,
  1307,
  1428,
  1905,
  1962,
  2342,
  2372,
  2391,
  2452,
  2782,
  2862,
  2913,
  3106,
  3151,
  3282,
  3333,
  3624,
  3860,
  3917,
  3938,
  3962,
  4123,
  4432,
  8782},
 1382654: {696, 1151, 1160, 1170, 2839, 2872, 2883, 3081, 3510, 8728, 9097},
 2537543: {391, 519, 553, 640, 702, 718, 956, 1084, 1121, 1139, 1182, 1685

In [25]:
train.head()

Unnamed: 0,index,CustomerID,Rating,Date,MovieID,Day
0,5281,2422360,4.0,2005-07-09,8,9
1,10484,2147714,3.0,2005-07-23,8,23
2,12624,357140,5.0,2005-07-27,8,27
3,14236,1382654,4.0,2005-07-08,8,8
4,20965,2273335,4.0,2005-07-07,12,7


In [24]:
test[test['index']==5281] 

Unnamed: 0,index,CustomerID,Rating,Date,MovieID,Day
3,5281,2422360,4.0,2005-07-09,8,9


In [41]:
accuracy = {}

In [44]:
# determine accuracy
for user in users:
    if len(inferences[user]) != 0 and user in inferences:
        total_correct = 0
        # get all transactions related to users
        all_transactions = test[test['CustomerID'] == user]
        
        # get non-public movies
        known_transactions = train[train['CustomerID'] == user]['index']
        unknown_transactions = all_transactions[~all_transactions['index'].isin(known_transactions)]['MovieID']
        unknown_transactions = set(unknown_transactions)

        for movie in unknown_transactions:
            if movie in inferences[user]:
                total_correct += 1
        
        accuracy[user] = total_correct/len(inferences[user])


KeyError: 2273335

In [43]:
accuracy.items()

dict_items([(1871918, 0.0), (1900424, 0.05357142857142857), (1062416, 0.0), (1985031, 0.1206896551724138)])

### Old Inference Code

In [None]:
user_inferences = {}
observation_window = 1
threshold = 0.50 # a movie is considered an inference if it shows up in majority of the similarity lists for the auxiliary information

In [None]:
# generate list of unique user ids
users = train['CustomerID'].unique()
sample_users = users[0:5]

In [None]:
for user in users:
    user_inferences[user] = []
    auxiliary_info = df.loc[df['CustomerID'] == user, 'MovieID'].unique()
    known_movie_count = 0
    for curr_window in range(0, 31, observation_window):
        window_deltas = []
        # print(f'Current window: {curr_window}')
        for known_movie in auxiliary_info:
            movie_deltas = []
            prev_window = curr_window - observation_window
            # print(f'Prev window: {prev_window}')
            # option 1: the movie has been seen by the rec system before in the daily rankings
            if curr_window > 0 and known_movie in july_sim_list[prev_window].keys():
                # print(f'Movie {known_movie} is in July {curr_window + 1}\'s rec system')
                known_movie_count += 1
                # get movie rankings
                prev_rankings = [movie_sim[0] for movie_sim in july_sim_list[prev_window][known_movie]]
                curr_rankings = [movie_sim[0] for movie_sim in july_sim_list[curr_window][known_movie]]
                # print(f'Movie #{known_movie} previous rankings: {prev_rankings}')
                # print(f'Movie #{known_movie} current rankings: {curr_rankings}')

                # get movies in both simlarity list for the previous window and current window
                target_movies = set(curr_rankings).intersection(set(prev_rankings))
                # print(f'Movie #{known_movie} target movies: {target_movies}')
                
                # add any movies that were added to rankings for the current window
                movie_deltas = [movie for movie in curr_rankings if movie not in target_movies]
                # print(f'Movie #{known_movie} new movies added to ranking: {movie_deltas}')

                # get movies that increased their similarity from the previous window
                movie_deltas += [movie for movie in target_movies if movie in curr_rankings and (curr_rankings.index(movie) - prev_rankings.index(movie)) < 0] # note: higher index means moving down the ranking
                # print(f'Movie #{known_movie} had the following increases: {movie_deltas}')

                window_deltas += movie_deltas

            # option 2: we are looking the first day of the observation period
            elif curr_window >= 0 and known_movie in july_sim_list[curr_window].keys():
                # print(f'Movie {known_movie} was added on July {curr_window + 1}')
                known_movie_count += 1
                # add all of the target items because the rec sys is populating this list for the first time
                movie_deltas += [movie_sim[0] for movie_sim in july_sim_list[curr_window][known_movie]]
                # print(f'Movie {known_movie} generated these related movies: {movie_deltas}')

                window_deltas += movie_deltas
            else:
                # print(f'Movie {known_movie} is not in July {curr_window + 1}\'s rec system')
                continue

        
        target_freq = Counter(window_deltas)
        # print(f'For a total of {known_movie_count} movies, Here are the frequencies for July {observation_window}: {target_freq}')
        updated_inferences = user_inferences[user]
        
        updated_inferences += ([movie for movie,count in target_freq.items() if count/known_movie_count >= threshold])
        # print(f'The inference list now looks like: {updated_inferences}')

        # removes any duplicates before saving
        user_inferences[user] = list(set(updated_inferences))


In [None]:
user_inferences