# MS&E 234 Project

### Data cleaning

In [1]:
import pandas as pd
import numpy as np
import re
from random import sample
import itertools
from collections import defaultdict
from tqdm.notebook import tqdm
import pickle

### Load data

In [2]:
df_raw = pd.DataFrame()
for i in range(1, 4+1):
    df_raw = pd.concat([df_raw, pd.read_csv(f'../netflix-prize-kaggle-data/combined_data_{i}.txt',
        header=None,
        names=['CustomerID', 'Rating', 'Date'])])
df_raw = df_raw.reset_index(drop = True)
df_raw

Unnamed: 0,CustomerID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
100498272,1790158,4.0,2005-11-01
100498273,1608708,3.0,2005-07-19
100498274,234275,1.0,2004-08-07
100498275,255278,4.0,2004-05-28


### Subset data as in paper (Calandrino 2011)

In [3]:
def buildDataset(df, userList, train):
    print("Entire dataset: 100498277 ratings; 480189 users")
    # Sample 10000 users
    df = df[df['CustomerID'].isin(userList)]
    print(f"Subset of 10000 users: {df.shape[0]} ratings; {df['CustomerID'].nunique()} users")

    if train:
        # Assume each user makes a random 50% of transactions public
        idxs = df[['CustomerID']].reset_index().groupby('CustomerID').agg({'index':lambda x: list(x)}).to_numpy().tolist()
        drop_idxs = []
        for i in range(len(idxs)):
            drop_idxs.append(sample(idxs[i][0], len(idxs[i][0]) // 2))
        drop_idxs = list(itertools.chain(*drop_idxs))
        df = df.drop(drop_idxs)
        print(f'After making 50% private: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

        # Only consider users with at least 100 public transactions
        df = df[df['CustomerID'].isin(df['CustomerID'].value_counts()[df['CustomerID'].value_counts() > 100].index)]
        print(f'Filter for users with >= 100 public transactions: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

    # Subset the data to analyze only ratings from July 2005
    df = df[df['Date'].apply(lambda x: type(x) == str and bool(re.match(r'2005-07.*', x)))]
    print(f'Only July 2005: {df.shape[0]} ratings; {df["CustomerID"].nunique()} users')
          
    return df

In [4]:
# extract movie ID's from raw data, combine these with df
# The data files follow the following format:
# Movie ID:
# CustomerID, Rating, Date
# ...

def addMovieInfo(df, movieDF):
    movieRows = np.array(movieDF.index)
    movieIDs = []
    currIdx = 0
    for row in df.itertuples(index = True, name = 'Pandas'):
        currRow = row.Index
        while currIdx < len(movieRows) - 1:
            nextMovieRow = movieRows[currIdx + 1]
            if currRow > nextMovieRow:
                currIdx += 1
            else:
                break
        movieIDs.append(currIdx)
    df["MovieID"] = movieIDs
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

In [5]:
# input: 2 sets x, y
# output: for binary vectors x and y: cosine similarity = |x and y| / sqrt(|x||y|)
def getCosSim(x, y):
    return len(x.intersection(y)) / np.sqrt(len(x) * len(y))

In [6]:
# input: sparse binary adjList
# output: dict of the 50 most similar items and scores, in format: dict[movieID] => ([(items, scores)])
def getSimListDaily(adjList):
    simList = {}
    for key1 in adjList.keys():
        cosSims = []
        for key2 in adjList.keys():
            if key1 == key2:
                continue
            cosSim = getCosSim(adjList[key1], adjList[key2])
            cosSims.append((key2, cosSim))
        simList[key1] = sorted(cosSims, key = lambda x: (-x[1], x[0]))[:50]
    return simList

In [7]:
# input: dataframe of customerIDs, Ratings, Day of Month, and MovieIDs
# input dataframe should contain both public and private data, as the recc system uses all available information
# output: list of 50 related movies and similarity ratings per day
# on each consecutive day, more data is used by the recc system
# uses cosine similarity on a binary matrix
def getSimListMonthly(df):
    # store binary matrix in sparse adjacency list format
    # adjList[movieID] => set of customerIDs
    adjList = defaultdict(set)
    
    simLists = []
    
    for day in tqdm(sorted(df["Day"].unique())):
        # add new movies from today to adjList
        currDF = df[df["Day"] == day]
        for row in currDF.itertuples(index = True, name = 'Pandas'):
            adjList[row.MovieID].add(row.CustomerID)
        
        # compute similarity scores
        currSimList = getSimListDaily(adjList)
        simLists.append(currSimList)
    return simLists

In [8]:
movieDF = df_raw[df_raw['Rating'].isnull()]
userList = sample(df_raw.dropna()['CustomerID'].unique().tolist(), 10000)
dfTrain = buildDataset(df_raw, userList, train = True)
dfTrain = addMovieInfo(dfTrain, movieDF)
dfTest = buildDataset(df_raw, userList, train = False)
dfTest = addMovieInfo(dfTest, movieDF)
dfTrain.to_csv("train/Trial1.csv")
dfTest.to_csv("test/Trial1.csv")

Entire dataset: 100498277 ratings; 480189 users
Subset of 10000 users: 2086064 ratings; 10000 users
After making 50% private: 1045448 ratings; 10000 users
Filter for users with >= 100 public transactions: 806422 ratings; 3159 users
Only July 2005: 35238 ratings; 1999 users
Entire dataset: 100498277 ratings; 480189 users
Subset of 10000 users: 2086064 ratings; 10000 users
Only July 2005: 100667 ratings; 4439 users


In [9]:
simLists = getSimListMonthly(dfTest)

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




In [10]:
fileName = 'simLists/Trial1.pkl'
with open(fileName, 'wb') as handle:
    pickle.dump(simLists, handle)

In [11]:
with open(fileName, 'rb') as handle:
    simLists = pickle.load(handle)

In [99]:
dfTrain = pd.read_csv("train/Trial1.csv", index_col = 0)
dfTest = pd.read_csv("test/Trial1.csv", index_col = 0)

In [151]:
countDF = dfTest.groupby(["CustomerID", "Day"]).count()
print(f"number of (user, date) pairs: {countDF.shape[0]} ")
countDF = countDF[countDF["Rating"] <= 5]
print(f"(user, date) pairs with <= 5 transactions: {countDF.shape[0]}")
validUsers = set(dfTrain["CustomerID"].unique())
countDF = countDF.reset_index()
countDF = countDF[countDF["CustomerID"].isin(validUsers)]
print(f"filter by users with >= 100 transactions (in the whole dataset): {countDF.shape[0]}")
userDayPairs = countDF[["CustomerID", "Day"]].to_numpy()

# reset countDF to contain full count info
countDF = dfTest.groupby(["CustomerID", "Day"]).count()
countDF = countDF.reset_index()

number of (user, date) pairs: 14732 
(user, date) pairs with <= 5 transactions: 12233
filter by users with >= 100 transactions (in the whole dataset): 7121


In [152]:
user, day = userDayPairs[0]

In [153]:
dfTrain

Unnamed: 0,CustomerID,Rating,Date,MovieID,Day
228,1727869,5.0,2005-07-11,0,11
413,448902,5.0,2005-07-20,0,20
2170,217658,4.0,2005-07-26,2,26
5358,2116703,5.0,2005-07-06,7,6
7561,1871482,5.0,2005-07-22,7,22
...,...,...,...,...,...
100468158,2320669,5.0,2005-07-07,17763,7
100487063,1838742,4.0,2005-07-10,17763,10
100492757,2260860,3.0,2005-07-14,17768,14
100492792,814259,4.0,2005-07-18,17768,18


In [154]:
dfTrain[dfTrain["CustomerID"] == user]

Unnamed: 0,CustomerID,Rating,Date,MovieID,Day
5264117,2555,4.0,2005-07-11,1072,11
8601161,2555,3.0,2005-07-18,1718,18
64659221,2555,5.0,2005-07-25,11811,25
68169137,2555,1.0,2005-07-18,12358,18
90445360,2555,3.0,2005-07-18,16112,18


In [155]:
countDF[countDF["CustomerID"] == 2555]

Unnamed: 0,CustomerID,Day,Rating,Date,MovieID
9,2555,7,1,1,1
10,2555,11,2,2,2
11,2555,18,7,7,7
12,2555,25,2,2,2
13,2555,26,1,1,1
14,2555,29,1,1,1


In [191]:
def getScore(searchID, simList):
    for movieID, score in simList:
        if searchID == movieID:
            return score
    return np.nan

In [150]:
currSimList = simLists[18][1072]
11811 in [movieID for (movieID, score) in currSimList]

True

In [167]:
getScore(11811, simLists[23][1072])

0.20100359758437733

In [168]:
getScore(11811, simLists[23][1072])

0.20100359758437733

In [169]:
getScore(11811, simLists[24][1072])

0.22326211577552735

In [112]:
aux = dfTrain[dfTrain["CustomerID"] == user]["MovieID"].to_numpy() 
print(aux)

[ 1072  1718 11811 12358 16112]


In [211]:
def propagatedAux(user):
    # known movies watched by user during July
    aux = dfTrain[dfTrain["CustomerID"] == user]["MovieID"].to_numpy() 
    n = len(aux)
    
    # get subset of days where a rating was given by user
    validDays = countDF[countDF["CustomerID"] == 2555]["Day"]
    
    prop = dict()
    
    for day in validDays:            
        deltaMatrix = np.zeros((n, n))
        for i in range(n):
            if day > 1:
                if aux[i] not in simLists[day - 2].keys():
                    continue
                lastSimList = simLists[day - 2][aux[i]]
            currSimList = simLists[day - 1][aux[i]]
            
            for j in range(i + 1, n):
                if day == 1:
                    startScore = 0
                else:
                    startScore = getScore(aux[j], lastSimList)
                endScore = getScore(aux[j], currSimList)
                scoreDelta = endScore - startScore
                if np.isnan(scoreDelta):
                    scoreDelta = 0
                deltaMatrix[i][j] = scoreDelta
                deltaMatrix[j][i] = scoreDelta
        #print("day: ", day)
        for i, movieID in enumerate(aux):
            deltaSum = np.sum(deltaMatrix[i])
            #print(f"movie: {movieID} deltaSum: ", deltaSum)
            if deltaSum > 0:
                if movieID not in prop:
                    prop[movieID] = day
    return prop

In [218]:
def filterAux(A, day):
    newA = set()
    for key, value in A.items():
        if value <= day:
            newA.add(key)
    return newA

In [None]:
def alg2(user, day, A):
    A = filterAux(A, day)
    

day:  7
day:  11
day:  18
day:  25
day:  26
day:  29


{1072: 18, 11811: 18, 16112: 18}

In [223]:
user, day = userPairs[2]

In [224]:
propagatedAux(user)

day:  7
day:  11
day:  18
day:  25
day:  26
day:  29


{1072: 18, 11811: 18, 16112: 18}