In [1]:
import pyspark

In [2]:
sc = pyspark.SparkContext('local[*]')



## Part 2: Distances

#### Assuming that the similarity is between lists of 1 or 0 as we see when we compare 2 documents for K shingles against document matrix. Because the Jaccard for normal sets and the sets for shingles are different in implementation I have chose to implement the one we've done in class.

In [3]:
A = [0,1,0,0,1,1,0,1,1,0,1,0,1,1,0,1,0]
B = [1,0,0,1,1,1,1,1,0,0,1,0,1,0,1,0,1]

In [4]:
def JaccardDistance(list_1, list_2):
    mergedList = tuple(zip(list_1, list_2))
    allTuples = sc.parallelize(mergedList)
    
    unionTuples = allTuples.filter(lambda x: x[0] == 1 or x[1] == 1)
    intersectionTuples = unionTuples.filter(lambda x: x[0] != 0 and x[1] != 0)
    return 1 - (len(intersectionTuples.collect())/len(unionTuples.collect()))

JaccardDistance(A,B)

0.6428571428571428

In [5]:
def HammingDistance(list_1, list_2):
    mergedList = tuple(zip(list_1, list_2))
    allTuples = sc.parallelize(mergedList)
    
    sameTuples = allTuples.filter(lambda x: x[0] != x[1])
    return len(sameTuples.collect())
#     return (len(sameTuples.collect())/len(allTuples.collect()))

HammingDistance(A,B)

9

In [6]:
import numpy as np
def CosineDistance(list_1, list_2):
    mergedList = tuple(zip(list_1, list_2))
    allTuples = sc.parallelize(mergedList)
    
    numerator = allTuples.map(lambda x: x[0]*x[1]).sum()
    denominator = np.sqrt(allTuples.map(lambda x: x[0]*x[0]).sum()) * np.sqrt(allTuples.map(lambda x: x[1]*x[1]).sum())
    return 1 - (numerator/denominator)

CosineDistance(A,B)

0.4729537233052701

## Part 3: Shingles

In [7]:
import re
from collections import Counter

TEXT = """New quarantine hobbies unearthed new passions, bringing literal silver lining. year, backyard archaeologists united kingdom recorded discoveries 47,000 objects, british museum announced week. regular people found vast majority historical artifacts traversing countryside metal detectors, adding updating records museum's portable antiquities scheme. british museum said program also saw uptick people updating digital records antiquities country full lockdown march 22 may 13. database contains records 1.5 million objects discovered since 1998 general public rather professional archaeologists. "it brilliant see scheme growing strength strength lockdown thanks garden discoveries digital reporting," said uk culture minister caroline dinenage, news release. list garden treasures dug year includes 13th century medieval seal, bearing latin inscription reading "david, god's messenger, bishop st. andrews." major coin hoard, held 63 gold coins one silver coin featuring monarchs edward iv henry viii, likely buried 16th century. included coins bearing initials several henry viii's wives, including catherine aragon, anne boleyn jane seymour. nearly 500 years later covid-19 pandemic, residents rediscovered weeding garden. another amateur find pandemic ancient roman furniture fitting made copper alloy, clearly featuring face god oceanus. artifact, found old basing 50 miles southeast london, dates far back 1st century. new report pas shows 81,602 objects added scheme 2019, recent spate lockdown treasure hunting, items coming public ownership. local authority defines object treasure, takes find british museum valued. government pays fair market price discoverer. law intended allow national local museums ultimately acquire historic treasures overall public benefit. even pandemic portable antiquities scheme's liaison officers able reach finders obtain relics significance, michael lewis, heads program, said news release. mission continues "ensure finds, important understanding britain's past, lost instead recorded posterity," said.  another person familiar matter, also confirmed demand vaccine authorized end friday, said president donald trump venting fda chief since vaccine rolled uk earlier week. two men call friday morning. white house official said comment private conversations chief "regularly requests updates progress toward vaccine." hahn quickly disputed description conversation, first reported washington post, news likely raise additional questions extent trump administration political interests involved vaccine authorization process, could undermine public confidence effort.vaccine advisers fda voted thursday recommend agency grant emergency use authorization vaccine, it's expected authorized imminently."this untrue representation phone call chief staff. fda encouraged continue working expeditiously pfizer-biontech's (emergency use authorization) request," hahn said statement friday afternoon. "fda committed issuing authorization quickly, noted statement morning." public health experts fearful along white house officials would put undue pressure authorization process and, turn, compromise public confidence vaccine, source close white house coronavirus task force told cnn. source said it's unclear meadows would make threat late process authorization vaccine expected moment. person added authorization could come end friday. dr. moncef slaoui, head us government's effort develop vaccine covid-19, told cnn's jake tapper "the lead" friday concerned reports' potential effect undermining confidence vaccine's safety. president-elect joe biden, remarks friday afternoon event announcing additional top administration picks, address news meadows demand hahn urged public faith vaccine expressed gratitude "to scientists public experts evaluated safety efficiency, free political influence." "i want make clear public, confidence -- political influence," biden said. "these first-rate scientists taking time looking elements need looked at. scientific integrity led us point." investigators manhattan district attorney's office interviewed several employees president donald trump's lender insurer recent weeks part wide-ranging investigation trump organization, according multiple people familiar investigation. two employees deutsche bank, loaned $300 million trump organization, interviewed prosecutors, according sources familiar matter. questioning specific bank's dealing trump organization president, people said, one person adding beginning process. additional interviews expected near future, said. prosecutors also interviewed least one employee aon, insurance broker done work president's company, according one source familiar matter. spokeswoman aon confirmed company received subpoena said cooperating investigation. spokeswoman declined comment employee interviews. representatives deutsche bank district attorney's office, led cyrus vance, also declined comment. deutsche bank subpoenaed part investigation last year said cooperates authorized investigations. new york times first reported interviews deutsche bank aon employees. court filings district attorney's office suggested inquiry could involve tax fraud, insurance fraud schemes defraud lenders. also recently subpoenaed trump organization records relating fees paid consultants, including payment made company controlled president's daughter, ivanka trump, according people familiar matter. specifically, cohen alleged president inflated value assets times, including 2014 trump submitted documents deutsche bank part attempt bid buffalo bills football team. trump never loan. cohen pleaded guilty federal crimes, including campaign finance charges facilitating hush-money payments silence two woman's allegations affairs trump. trump denied affairs. cohen serving three year prison sentence released home confinement earlier year due coronavirus pandemic."""
WORDS = re.findall(r'\w+', TEXT)
TEXT = ' '.join(WORDS)
TEXT

'New quarantine hobbies unearthed new passions bringing literal silver lining year backyard archaeologists united kingdom recorded discoveries 47 000 objects british museum announced week regular people found vast majority historical artifacts traversing countryside metal detectors adding updating records museum s portable antiquities scheme british museum said program also saw uptick people updating digital records antiquities country full lockdown march 22 may 13 database contains records 1 5 million objects discovered since 1998 general public rather professional archaeologists it brilliant see scheme growing strength strength lockdown thanks garden discoveries digital reporting said uk culture minister caroline dinenage news release list garden treasures dug year includes 13th century medieval seal bearing latin inscription reading david god s messenger bishop st andrews major coin hoard held 63 gold coins one silver coin featuring monarchs edward iv henry viii likely buried 16th c

In [8]:
def getShingles(TEXT, K=5):
    shingles = set()
    for i in range(len(TEXT)-K):
        shingles.add(TEXT[i:i+K])
    print(f"Found {len(shingles)} unique shingles, out of {len(TEXT)} possible.")
    return shingles

In [9]:
getShingles(TEXT, K=5)

Found 3718 unique shingles, out of 5733 possible.


{'d gui',
 's add',
 'tal r',
 'ublic',
 'ershi',
 'nding',
 'ne it',
 'nded ',
 'eal b',
 'son f',
 'nt tr',
 ' rema',
 'egrit',
 'titud',
 'g cat',
 ' slao',
 'des 1',
 'ce al',
 'g lat',
 'ost n',
 'oena ',
 'eur f',
 'ce go',
 ' clea',
 ' law ',
 'first',
 'ajori',
 'est h',
 'contr',
 ' sile',
 ' liai',
 'e cop',
 's jak',
 'ple f',
 'ee in',
 'kly d',
 'd kin',
 'so re',
 'ine c',
 ' home',
 'ld un',
 'y req',
 'urces',
 'litic',
 ' and ',
 ' lati',
 'n spo',
 'ountr',
 'secut',
 'ed we',
 'st ga',
 'wed s',
 'n s a',
 'fican',
 'takes',
 'ned r',
 'unite',
 'virus',
 's dem',
 'pper ',
 ' prog',
 ' week',
 'evera',
 'benef',
 'chael',
 'expre',
 'am al',
 'rsday',
 ' amat',
 'ng up',
 ' foot',
 'cts a',
 'loy c',
 'g fee',
 ' woma',
 'ichae',
 'ocal ',
 'l int',
 'ter q',
 'ding ',
 'r wee',
 'looke',
 'd at ',
 'ghter',
 'd aut',
 'ine h',
 'ui he',
 'eside',
 'the l',
 'xtent',
 ' stat',
 'ne ca',
 'rmini',
 'lic f',
 'l pub',
 'servi',
 'nts d',
 'er li',
 'ntrys',
 'ted d',


In [12]:
def topMostCommonWords(WORDS, N):
    WORDS = filter(lambda x: len(x)>1, WORDS)
    return Counter(WORDS).most_common(N)

topMostCommonWords(WORDS, 5)

[('said', 13), ('trump', 12), ('vaccine', 11), ('public', 9), ('president', 7)]

## Part 4: Misleading Profile Section

In [None]:
# User_Movie = [
#     ('U1', 'M1'),
#     ('U1', 'M3'),
#     ('U1', 'M4'),
#     ('U1', 'M5'),
#     ('U2', 'M3'),
#     ('U2', 'M4')
# ]


# User_Genre = [
#     ('U1', 'Action'),
#     ('U1', 'Comedy'),
#     ('U2', 'Action')
# ]


# Movie_Genre = [
#     ('M1', 'Animation'),
#     ('M2', 'Adventure'),
#     ('M3', 'Comedy'),
#     ('M4', 'Comedy'),
#     ('M5', 'Comedy'),
#     ('M6', 'Action'),
#     ('M7', 'Comedy'),
#     ('M8', 'Adventure'),
#     ('M9', 'Action'),
#     ('M10', 'Action')
# ]

# UM = sc.parallelize(User_Movie)
# UG = sc.parallelize(User_Genre)
# MG = sc.parallelize(Movie_Genre)

In [13]:
UM = User_Movie = sc.textFile('./data/watchedmovies.txt') 
UG = User_Genre = sc.textFile('./data/preferences.txt')
MG = Movie_Genre =  sc.textFile('./data/movies.txt')

MG = Movie_Genre.map(lambda x: (x.split(',')[0], x.split(',')[2]) )
UG = User_Genre.map(lambda x: (x.split(',')[0], x.split(',')[1]) )
UM = User_Movie.map(lambda x: (x.split(',')[0], x.split(',')[1]) )

In [14]:
THRESHOLD = 2

In [15]:
MG.collect()

[('movie1', 'Animation'),
 ('movie2', 'Adventure'),
 ('movie3', 'Comedy'),
 ('movie4', 'Comedy'),
 ('movie5', 'Comedy'),
 ('movie6', 'Action'),
 ('movie7', 'Comedy'),
 ('movie8', 'Adventure'),
 ('movie9', 'Action'),
 ('movie10', 'Action')]

In [16]:
UG.collect()

[('user1', 'Animation'), ('user1', 'Comedy'), ('user2', 'Action')]

In [17]:
UM.collect()

[('user1', 'movie1'),
 ('user1', 'movie3'),
 ('user1', 'movie4'),
 ('user1', 'movie5'),
 ('user2', 'movie6'),
 ('user2', 'movie3'),
 ('user2', 'movie4')]

In [18]:
UM = UM.map(lambda x: (x[1],(x[0],x[1])) ).groupByKey().mapValues(lambda x: list(x))
UM.collect()

[('movie1', [('user1', 'movie1')]),
 ('movie4', [('user1', 'movie4'), ('user2', 'movie4')]),
 ('movie3', [('user1', 'movie3'), ('user2', 'movie3')]),
 ('movie5', [('user1', 'movie5')]),
 ('movie6', [('user2', 'movie6')])]

In [19]:
MG = MG.map(lambda x : (x[0], [x[1]]))
MG.collect()

[('movie1', ['Animation']),
 ('movie2', ['Adventure']),
 ('movie3', ['Comedy']),
 ('movie4', ['Comedy']),
 ('movie5', ['Comedy']),
 ('movie6', ['Action']),
 ('movie7', ['Comedy']),
 ('movie8', ['Adventure']),
 ('movie9', ['Action']),
 ('movie10', ['Action'])]

In [20]:
UM.union(MG).collect()

[('movie1', [('user1', 'movie1')]),
 ('movie4', [('user1', 'movie4'), ('user2', 'movie4')]),
 ('movie3', [('user1', 'movie3'), ('user2', 'movie3')]),
 ('movie5', [('user1', 'movie5')]),
 ('movie6', [('user2', 'movie6')]),
 ('movie1', ['Animation']),
 ('movie2', ['Adventure']),
 ('movie3', ['Comedy']),
 ('movie4', ['Comedy']),
 ('movie5', ['Comedy']),
 ('movie6', ['Action']),
 ('movie7', ['Comedy']),
 ('movie8', ['Adventure']),
 ('movie9', ['Action']),
 ('movie10', ['Action'])]

In [21]:
def replaceGenre(X, Y):
    newList = []
    if type(Y[0]) != str:
        C = Y
        Y = X
        X = C
    for t in X:
        newList.append((t[0], Y[0]))
    return newList

In [22]:
Joined = UM.union(MG).reduceByKey(lambda x,y: replaceGenre(x,y) )
Joined.collect()

[('movie8', ['Adventure']),
 ('movie5', [('user1', 'Comedy')]),
 ('movie9', ['Action']),
 ('movie1', [('user1', 'Animation')]),
 ('movie4', [('user1', 'Comedy'), ('user2', 'Comedy')]),
 ('movie10', ['Action']),
 ('movie3', [('user1', 'Comedy'), ('user2', 'Comedy')]),
 ('movie6', [('user2', 'Action')]),
 ('movie2', ['Adventure']),
 ('movie7', ['Comedy'])]

In [23]:
Joined = Joined.flatMapValues(lambda x: x)
Joined.collect()

[('movie8', 'Adventure'),
 ('movie5', ('user1', 'Comedy')),
 ('movie9', 'Action'),
 ('movie1', ('user1', 'Animation')),
 ('movie4', ('user1', 'Comedy')),
 ('movie4', ('user2', 'Comedy')),
 ('movie10', 'Action'),
 ('movie3', ('user1', 'Comedy')),
 ('movie3', ('user2', 'Comedy')),
 ('movie6', ('user2', 'Action')),
 ('movie2', 'Adventure'),
 ('movie7', 'Comedy')]

In [24]:
Joined = Joined.filter(lambda x: type(x[1]) != str )
Joined.collect()

[('movie5', ('user1', 'Comedy')),
 ('movie1', ('user1', 'Animation')),
 ('movie4', ('user1', 'Comedy')),
 ('movie4', ('user2', 'Comedy')),
 ('movie3', ('user1', 'Comedy')),
 ('movie3', ('user2', 'Comedy')),
 ('movie6', ('user2', 'Action'))]

In [25]:
Joined = Joined.map(lambda x: (x[1][0], (x[0],x[1][1]) ))
Joined.collect()

[('user1', ('movie5', 'Comedy')),
 ('user1', ('movie1', 'Animation')),
 ('user1', ('movie4', 'Comedy')),
 ('user2', ('movie4', 'Comedy')),
 ('user1', ('movie3', 'Comedy')),
 ('user2', ('movie3', 'Comedy')),
 ('user2', ('movie6', 'Action'))]

In [26]:
from collections import Counter

def getGenreCount(X):
    X = list(X)
    X = [t[1] for t in X]
    X = Counter(X).most_common()
    
#     COMMON_LIST = ['Comedy', 'Comedy', 'Action', 'Adventure', 'Adventure', 'Comedy']
#     [('Comedy', 3), ('Adventure', 2), ('Action', 1)]
    return X

In [27]:
Joined = Joined.groupByKey().mapValues(lambda x: getGenreCount(x))
Joined.collect()

[('user1', [('Comedy', 3), ('Animation', 1)]),
 ('user2', [('Comedy', 2), ('Action', 1)])]

In [28]:
UG = UG.map(lambda x : (x[0], [x[1]]))
UG.collect()

[('user1', ['Animation']), ('user1', ['Comedy']), ('user2', ['Action'])]

In [29]:
Joined.union(UG).collect()

[('user1', [('Comedy', 3), ('Animation', 1)]),
 ('user2', [('Comedy', 2), ('Action', 1)]),
 ('user1', ['Animation']),
 ('user1', ['Comedy']),
 ('user2', ['Action'])]

In [30]:
def removeKnownGenres(X, Y):
    newList = []
    if type(Y[0]) != str:
        C = Y
        Y = X
        X = C
    for t in X:
        if Y[0] != t[0]:
            newList.append(t)
    return newList

In [31]:
Joined = Joined.union(UG).reduceByKey(lambda x,y: removeKnownGenres(x,y) )
Joined.collect()

[('user2', [('Comedy', 2)]), ('user1', [('Comedy', 3)])]

In [32]:
Joined = Joined.filter(lambda x: list(filter(lambda y: y[1]>=THRESHOLD, x[1])) )
Joined.collect()

[('user2', [('Comedy', 2)]), ('user1', [('Comedy', 3)])]

### In One Function

In [33]:
# FUNCTIONS

def replaceGenre(X, Y):
    newList = []
    if type(Y[0]) != str:
        C = Y
        Y = X
        X = C
    for t in X:
        newList.append((t[0], Y[0]))
    return newList


from collections import Counter
def getGenreCount(X):
    X = list(X)
    X = [t[1] for t in X]
    X = Counter(X).most_common()
    
#     COMMON_LIST = ['Comedy', 'Comedy', 'Action', 'Adventure', 'Adventure', 'Comedy']
#     [('Comedy', 3), ('Adventure', 2), ('Action', 1)]
    return X


def removeKnownGenres(X, Y):
    newList = []
    if type(Y[0]) != str:
        C = Y
        Y = X
        X = C
    for t in X:
        if Y[0] != t[0]:
            newList.append(t)
    return newList

In [34]:
def MisleadingProfileSelection(THRESHOLD):
    User_Movie = sc.textFile('./data/watchedmovies.txt') 
    User_Genre = sc.textFile('./data/preferences.txt')
    Movie_Genre =  sc.textFile('./data/movies.txt')
    
    MG = Movie_Genre.map(lambda x: (x.split(',')[0], x.split(',')[2]) )
    UG = User_Genre.map(lambda x: (x.split(',')[0], x.split(',')[1]) )
    UM = User_Movie.map(lambda x: (x.split(',')[0], x.split(',')[1]) )
    
    UM = UM.map(lambda x: (x[1],(x[0],x[1])) ).groupByKey().mapValues(lambda x: list(x))
    MG = MG.map(lambda x : (x[0], [x[1]]))
    
    Joined = UM.union(MG).reduceByKey(lambda x,y: replaceGenre(x,y) )
    Joined = Joined.flatMapValues(lambda x: x)
    Joined = Joined.filter(lambda x: type(x[1]) != str )
    Joined = Joined.map(lambda x: (x[1][0], (x[0],x[1][1]) ))
    Joined = Joined.groupByKey().mapValues(lambda x: getGenreCount(x))
    
    UG = UG.map(lambda x : (x[0], [x[1]]))
    Joined = Joined.union(UG).reduceByKey(lambda x,y: removeKnownGenres(x,y) )
    Joined = Joined.filter(lambda x: list(filter(lambda y: y[1]>=THRESHOLD, x[1])) )
    print(Joined.collect())
    print(Joined.keys().collect())
    

THRESHOLD  = 2
MisleadingProfileSelection(THRESHOLD)

[('user2', [('Comedy', 2)]), ('user1', [('Comedy', 3)])]
['user2', 'user1']
