# MS&E 234 Project

### Data cleaning

In [225]:
import pandas as pd
import numpy as np
import re
from random import sample
import itertools
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
import pickle

### Load data

In [2]:
df_raw = pd.DataFrame()
for i in range(1, 4+1):
    df_raw = pd.concat([df_raw, pd.read_csv(f'../netflix-prize-kaggle-data/combined_data_{i}.txt',
        header=None,
        names=['CustomerID', 'Rating', 'Date'])])
df_raw = df_raw.reset_index(drop = True)
df_raw

Unnamed: 0,CustomerID,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
...,...,...,...
100498272,1790158,4.0,2005-11-01
100498273,1608708,3.0,2005-07-19
100498274,234275,1.0,2004-08-07
100498275,255278,4.0,2004-05-28


### Subset data as in paper (Calandrino 2011)

In [3]:
def buildDataset(df, userList, train):
    print("Entire dataset: 100498277 ratings; 480189 users")
    # Sample 10000 users
    df = df[df['CustomerID'].isin(userList)]
    print(f"Subset of 10000 users: {df.shape[0]} ratings; {df['CustomerID'].nunique()} users")

    if train:
        # Assume each user makes a random 50% of transactions public
        idxs = df[['CustomerID']].reset_index().groupby('CustomerID').agg({'index':lambda x: list(x)}).to_numpy().tolist()
        drop_idxs = []
        for i in range(len(idxs)):
            drop_idxs.append(sample(idxs[i][0], len(idxs[i][0]) // 2))
        drop_idxs = list(itertools.chain(*drop_idxs))
        df = df.drop(drop_idxs)
        print(f'After making 50% private: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

        # Only consider users with at least 100 public transactions
        df = df[df['CustomerID'].isin(df['CustomerID'].value_counts()[df['CustomerID'].value_counts() > 100].index)]
        print(f'Filter for users with >= 100 public transactions: {df.shape[0]} ratings; {df.CustomerID.nunique()} users')

    # Subset the data to analyze only ratings from July 2005
    df = df[df['Date'].apply(lambda x: type(x) == str and bool(re.match(r'2005-07.*', x)))]
    print(f'Only July 2005: {df.shape[0]} ratings; {df["CustomerID"].nunique()} users')
          
    return df

In [4]:
# extract movie ID's from raw data, combine these with df
# The data files follow the following format:
# Movie ID:
# CustomerID, Rating, Date
# ...

def addMovieInfo(df, movieDF):
    movieRows = np.array(movieDF.index)
    movieIDs = []
    currIdx = 0
    for row in df.itertuples(index = True, name = 'Pandas'):
        currRow = row.Index
        while currIdx < len(movieRows) - 1:
            nextMovieRow = movieRows[currIdx + 1]
            if currRow > nextMovieRow:
                currIdx += 1
            else:
                break
        movieIDs.append(currIdx)
    df["MovieID"] = movieIDs
    df['Day'] = pd.DatetimeIndex(df['Date']).day
    return df

In [5]:
# input: 2 sets x, y
# output: for binary vectors x and y: cosine similarity = |x and y| / sqrt(|x||y|)
def getCosSim(x, y):
    return len(x.intersection(y)) / np.sqrt(len(x) * len(y))

In [6]:
# input: sparse binary adjList
# output: dict of the 50 most similar items and scores, in format: dict[movieID] => ([(items, scores)])
def getSimListDaily(adjList):
    simList = {}
    for key1 in adjList.keys():
        cosSims = []
        for key2 in adjList.keys():
            if key1 == key2:
                continue
            cosSim = getCosSim(adjList[key1], adjList[key2])
            cosSims.append((key2, cosSim))
        simList[key1] = sorted(cosSims, key = lambda x: (-x[1], x[0]))[:50]
    return simList

In [7]:
# input: dataframe of customerIDs, Ratings, Day of Month, and MovieIDs
# input dataframe should contain both public and private data, as the recc system uses all available information
# output: list of 50 related movies and similarity ratings per day
# on each consecutive day, more data is used by the recc system
# uses cosine similarity on a binary matrix
def getSimListMonthly(df):
    # store binary matrix in sparse adjacency list format
    # adjList[movieID] => set of customerIDs
    adjList = defaultdict(set)
    
    simLists = []
    
    for day in tqdm(sorted(df["Day"].unique())):
        # add new movies from today to adjList
        currDF = df[df["Day"] == day]
        for row in currDF.itertuples(index = True, name = 'Pandas'):
            adjList[row.MovieID].add(row.CustomerID)
        
        # compute similarity scores
        currSimList = getSimListDaily(adjList)
        simLists.append(currSimList)
    return simLists

In [8]:
movieDF = df_raw[df_raw['Rating'].isnull()]
userList = sample(df_raw.dropna()['CustomerID'].unique().tolist(), 10000)
dfTrain = buildDataset(df_raw, userList, train = True)
dfTrain = addMovieInfo(dfTrain, movieDF)
dfTest = buildDataset(df_raw, userList, train = False)
dfTest = addMovieInfo(dfTest, movieDF)
dfTrain.to_csv("train/Trial1.csv")
dfTest.to_csv("test/Trial1.csv")

Entire dataset: 100498277 ratings; 480189 users
Subset of 10000 users: 2086064 ratings; 10000 users
After making 50% private: 1045448 ratings; 10000 users
Filter for users with >= 100 public transactions: 806422 ratings; 3159 users
Only July 2005: 35238 ratings; 1999 users
Entire dataset: 100498277 ratings; 480189 users
Subset of 10000 users: 2086064 ratings; 10000 users
Only July 2005: 100667 ratings; 4439 users


In [9]:
simLists = getSimListMonthly(dfTest)

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




In [10]:
fileName = 'simLists/Trial1.pkl'
with open(fileName, 'wb') as handle:
    pickle.dump(simLists, handle)

In [11]:
with open(fileName, 'rb') as handle:
    simLists = pickle.load(handle)

In [99]:
dfTrain = pd.read_csv("train/Trial1.csv", index_col = 0)
dfTest = pd.read_csv("test/Trial1.csv", index_col = 0)

In [524]:
countDF = dfTest.groupby(["CustomerID", "Day"]).count()
print(f"number of (user, date) pairs: {countDF.shape[0]} ")
countDF = countDF[countDF["Rating"] <= 5]
print(f"(user, date) pairs with <= 5 transactions: {countDF.shape[0]}")
validUsers = set(dfTrain["CustomerID"].unique())
countDF = countDF.reset_index()
countDF = countDF[countDF["CustomerID"].isin(validUsers)]
#monthlyCountDF = dfTest.groupby(["CustomerID"]).count()
print(f"filter by users with >= 100 transactions (in the whole dataset): {countDF.shape[0]}")
print("final transaction count: ", countDF["Rating"].sum())
#validUsers = set(monthlyCountDF[(monthlyCountDF["Rating"] > 100).values].index)
#countDF = countDF[countDF["CustomerID"].isin(validUsers)]
#print(f"filter by users with >= 100 transactions (July): {countDF.shape[0]}")
userDayPairs = countDF[["CustomerID", "Day"]].to_numpy()

# reset countDF to contain full count info
countDF = dfTest.groupby(["CustomerID", "Day"]).count()
countDF = countDF.reset_index()

number of (user, date) pairs: 14732 
(user, date) pairs with <= 5 transactions: 12233
filter by users with >= 100 transactions (in the whole dataset): 7121
final transaction count:  13367


In [481]:
print("> 100 txn users (full dataset): ", len(set(dfTrain["CustomerID"].unique())))
monthlyCountDF = dfTest.groupby(["CustomerID"]).count()
print("> 100 txn users (July): ", len(set(monthlyCountDF[(monthlyCountDF["Rating"] > 100).values].index)))

>100 txn users (full dataset):  1999
>100 txn users (July):  197


In [499]:
len(userPairs)

7121

In [500]:
len(np.unique(userDayPairs[:, 0]))

1826

In [520]:
countDF = dfTest.groupby(["CustomerID", "Day"]).count()
print(f"number of (user, date) pairs: {countDF.shape[0]} ")
countDF = countDF[countDF["Rating"] <= 5]
print(f"(user, date) pairs with <= 5 transactions: {countDF.shape[0]}")
validUsers = set(dfTrain["CustomerID"].unique())
countDF = countDF.reset_index()
countDF = countDF[countDF["CustomerID"].isin(validUsers)]

number of (user, date) pairs: 14732 
(user, date) pairs with <= 5 transactions: 12233


In [501]:
def getScore(searchID, simList):
    for movieID, score in simList:
        if searchID == movieID:
            return score
    return np.nan

In [260]:
def propagatedAux(user):
    # known movies watched by user during July
    aux = dfTrain[dfTrain["CustomerID"] == user]["MovieID"].to_numpy() 
    n = len(aux)
    
    # get subset of days where a rating was given by user
    validDays = countDF[countDF["CustomerID"] == user]["Day"]
    
    prop = dict()
    
    for day in validDays:            
        deltaMatrix = np.zeros((n, n))
        for i in range(n):
            if day > 1:
                if aux[i] not in simLists[day - 2].keys():
                    continue
                lastSimList = simLists[day - 2][aux[i]]
            currSimList = simLists[day - 1][aux[i]]
            
            for j in range(i + 1, n):
                if day == 1:
                    startScore = 0
                else:
                    startScore = getScore(aux[j], lastSimList)
                endScore = getScore(aux[j], currSimList)
                scoreDelta = endScore - startScore
                if np.isnan(scoreDelta):
                    scoreDelta = 0
                deltaMatrix[i][j] = scoreDelta
                deltaMatrix[j][i] = scoreDelta
        #print("day: ", day)
        for i, movieID in enumerate(aux):
            deltaSum = np.sum(deltaMatrix[i])
            #print(f"movie: {movieID} deltaSum: ", deltaSum)
            if deltaSum > 0:
                if movieID not in prop:
                    prop[movieID] = day
    return prop

In [218]:
def filterAux(A, day):
    newA = set()
    for key, value in A.items():
        if value <= day:
            newA.add(key)
    return newA

In [512]:
def getSupports(A, day, T):
    TScores = Counter()
        
    for movieID in tqdm(A):
        if day > 1:
            if movieID not in simLists[day - 2].keys():
                continue
            lastSimList = simLists[day - 2][movieID]
        else:
            if movieID not in simLists[day - 1].keys():
                continue
        currSimList = simLists[day - 1][movieID]
        
        for movieID2 in T:
            if day > 1:
                if movieID2 not in simLists[day - 2].keys():
                    continue
            else:
                if movieID2 not in simLists[day - 1].keys():
                    continue
            if day == 1:
                startScore = 0
            else:
                startScore = getScore(movieID2, lastSimList)
                if np.isnan(startScore):
                    startScore = getScore(movieID, simLists[day - 2][movieID2])
            endScore = getScore(movieID2, currSimList)
            if np.isnan(startScore):
                startScore = getScore(movieID, simLists[day - 1][movieID2])

            scoreDelta = endScore - startScore
            if np.isnan(scoreDelta):
                scoreDelta = 0
            if np.isnan(startScore) and not np.isnan(endScore):
                TScores[movieID2] += 1
            elif scoreDelta > 0:
                TScores[movieID2] += 1
        
    return TScores

def alg2(fullAux, propAux, day, T, scoreThreshold = 0, supportThreshold = 1):
    T = T - fullAux
    A = filterAux(propAux, day)
    n = len(A)
    
    inferSet = set()

    if n == 0:
        return inferSet
    
    supports = getSupports(A, day, T)
    for movieID in supports:
        support = supports[movieID]
        if support / n > scoreThreshold and support >= supportThreshold:
            inferSet.add(movieID)
    return inferSet, supports

In [513]:
user, day = userPairs[25]

In [516]:
T = set(dfTest["MovieID"].unique())
fullAux = set(dfTrain[dfTrain["CustomerID"] == user]["MovieID"])
soln = set(dfTest[(dfTest["CustomerID"] == user) & (dfTest["Day"] == day)]["MovieID"])
propAux = dict(dfTrain[dfTrain["CustomerID"] == user][["MovieID", "Day"]].to_numpy())

print("fullAux: ", len(fullAux))
print("propAux: ", len(filterAux(propAux, day)))
print("day: ", day)
print("soln (before subtracting): ", soln)
soln = soln - fullAux
    
inferSet, supports = alg2(fullAux, propAux, day, T, scoreThreshold = 0.1)
print("soln size: ", len(soln))
print("infer size: ", len(inferSet))
print("soln: ", soln)
print("infer: ", inferSet)

if len(inferSet) > 0:
    print("user, day: ", user, day)
    print(f"percent inferences: {len(soln.intersection(inferSet))} / {len(inferSet)}")
    print(f"num solutions correct: {len(soln.intersection(inferSet))} / {len(soln)}")

fullAux:  308
propAux:  307
day:  27
soln (before subtracting):  {14678, 13181, 7750, 8831}


HBox(children=(IntProgress(value=0, max=307), HTML(value='')))


soln size:  4
infer size:  3
soln:  {7750, 13181, 14678, 8831}
infer:  {9761, 6146, 13181}
user, day:  10679 27
percent inferences: 1 / 3
num solutions correct: 1 / 4


In [377]:
T = set(dfTest["MovieID"].unique())

lastUser = None
for user, day in tqdm(userPairs):
    fullAux = set(dfTrain[dfTrain["CustomerID"] == user]["MovieID"])
    soln = set(dfTest[(dfTest["CustomerID"] == user) & (dfTest["Day"] == day)]["MovieID"])
    soln = soln - fullAux
    
    filteredAux = filterAux(propAux, day)
    if filteredAux == 0:
        continue
    
    if len(soln) == 0:
        continue
    
    if user != lastUser:
        print("user: ", user)
        #propAux = propagatedAux(user)
        propAux = dict(dfTrain[dfTrain["CustomerID"] == user][["MovieID", "Day"]].to_numpy())
        lastUser = user
    
    inferSet = alg2(fullAux, propAux, day, T)
    #print("soln size: ", len(soln))
    if len(inferSet) > 0:
        print("user, day: ", user, day)
        print(f"percent inferences: {len(soln.intersection(inferSet))} / {len(inferSet)}")
        print(f"num solutions correct: {len(soln.intersection(inferSet))} / {len(inferSet)}")

HBox(children=(IntProgress(value=0, max=7121), HTML(value='')))

user:  2555
user, day:  2555 11
percent inferences: 0 / 36
num solutions correct: 0 / 36
user, day:  2555 25
percent inferences: 0 / 12
num solutions correct: 0 / 12
user, day:  2555 26
percent inferences: 0 / 3
num solutions correct: 0 / 3
user, day:  2555 29
percent inferences: 0 / 13
num solutions correct: 0 / 13
user:  4066
user:  9321
user, day:  9321 12
percent inferences: 0 / 4
num solutions correct: 0 / 4
user, day:  9321 20
percent inferences: 1 / 1
num solutions correct: 1 / 1
user, day:  9321 21
percent inferences: 0 / 1
num solutions correct: 0 / 1
user:  9636
user:  10308
user, day:  10308 8
percent inferences: 0 / 28
num solutions correct: 0 / 28
user:  10679


KeyboardInterrupt: 

In [258]:
set(dfTrain[dfTrain["CustomerID"] == user]["MovieID"])

{1072, 1718, 11811, 12358, 16112}

In [281]:
dfTrain[(dfTrain["CustomerID"] == user) & (dfTrain["Day"] == day)]

Unnamed: 0,CustomerID,Rating,Date,MovieID,Day
64659221,2555,5.0,2005-07-25,11811,25


In [282]:
dfTest[(dfTest["CustomerID"] == user) & (dfTest["Day"] == day)]

Unnamed: 0,CustomerID,Rating,Date,MovieID,Day
4015745,2555,3.0,2005-07-25,759,25
64659221,2555,5.0,2005-07-25,11811,25


In [284]:
soln - fullAux

{759}