In [1]:
import findspark
# Run findspark.init() before the pyspark import in the next cell; per the
# findspark docs it locates the Spark install and makes pyspark importable.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Build a session (or reuse an existing one, via getOrCreate) against a
# "local" master, registered under the application name "RDD".
spark = SparkSession.builder.master("local").appName("RDD").getOrCreate()

In [3]:
# SparkContext of the session; used below to read the text file into an RDD.
sc = spark.sparkContext

In [4]:
# Location of the movies.txt ratings file on the local machine.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
input_path = r"C:\Users\C940\spark3\movies.txt"
input_path

'C:\\Users\\C940\\spark3\\movies.txt'

In [5]:
# Read the text file into an RDD with one element per line
records = sc.textFile(input_path)

In [6]:
# Sanity check: the input file is expected to contain 20 records
records.count()

20

In [7]:
# Bring every raw record back to the driver for inspection
# (acceptable here because the dataset has only 20 lines)
records.collect()

['user9,m1,5',
 'user8,m2,4',
 'user1,m1,2',
 'user1,9',
 'user1,m1,2',
 'user2,m2,3',
 'user2,m3,5',
 'user3,m3,4',
 'user6,m3,4',
 'user7,m3,3',
 'user3,king',
 'user4,m1,3',
 'user5,m2,5',
 'user6,m4,5',
 'user7,m5,5',
 'user1',
 'user3,m3,5',
 'user4,m4,1',
 'user5,m2,4',
 'user6,m4,4']

In [8]:
# Create (movieid, rating) pairs by tokenizing and splitting records
def tokenize(record):
    """Parse one comma-separated rating line into a ``(movieid, rating)`` pair.

    Parameters
    ----------
    record : str
        A line such as ``'user9,m1,5'``: the second field is taken as the
        movie id and the third as the rating. Any further fields are ignored.

    Returns
    -------
    tuple
        ``(movieid, rating)`` where ``movieid`` is the second field as a
        string and ``rating`` is the third field converted with ``int``.

    Raises
    ------
    IndexError
        If the record has fewer than three comma-separated fields.
    ValueError
        If the third field cannot be converted to ``int``.
    """
    tokens = record.split(",")
    # tokens[0] (the user id) is not needed for the (movieid, rating) pair.
    # The previous version assigned it to ``movieid`` and then immediately
    # overwrote it, which was a misleading dead assignment.
    movieid = tokens[1]
    rating = int(tokens[2])
    return (movieid, rating)

In [9]:
# Smoke-test tokenize() on a single well-formed record;
# the user id is dropped and the rating comes back as an int
result1 = tokenize('user9,m1,5')
result1

('m1', 5)

In [10]:
# Keep only the records that split into at least 3 comma-separated tokens
CleanedRecords = records.filter(lambda line: len(line.split(",")) >= 3)

In [11]:
# Inspect the records that remain after dropping those with fewer than 3 tokens
CleanedRecords.collect()

['user9,m1,5',
 'user8,m2,4',
 'user1,m1,2',
 'user1,m1,2',
 'user2,m2,3',
 'user2,m3,5',
 'user3,m3,4',
 'user6,m3,4',
 'user7,m3,3',
 'user4,m1,3',
 'user5,m2,5',
 'user6,m4,5',
 'user7,m5,5',
 'user3,m3,5',
 'user4,m4,1',
 'user5,m2,4',
 'user6,m4,4']

In [12]:
# Tokenize each cleaned record into a (movieid, rating) pair
pairs = CleanedRecords.map(tokenize)

In [13]:
# Inspect the (movieid, rating) pairs produced by tokenize
pairs.collect()

[('m1', 5),
 ('m2', 4),
 ('m1', 2),
 ('m1', 2),
 ('m2', 3),
 ('m3', 5),
 ('m3', 4),
 ('m3', 4),
 ('m3', 3),
 ('m1', 3),
 ('m2', 5),
 ('m4', 5),
 ('m5', 5),
 ('m3', 5),
 ('m4', 1),
 ('m2', 4),
 ('m4', 4)]

In [14]:
# Keep only the pairs whose rating is 2 or higher (ratings below 2 are dropped)
filteredRating = pairs.filter(lambda pair: pair[1] >= 2)

In [15]:
# Inspect the pairs that remain after dropping ratings below 2
filteredRating.collect()

[('m1', 5),
 ('m2', 4),
 ('m1', 2),
 ('m1', 2),
 ('m2', 3),
 ('m3', 5),
 ('m3', 4),
 ('m3', 4),
 ('m3', 3),
 ('m1', 3),
 ('m2', 5),
 ('m4', 5),
 ('m5', 5),
 ('m3', 5),
 ('m2', 4),
 ('m4', 4)]

In [16]:
def flatten(keyvalue):
    """Expand one ``(movieid, rating)`` pair into a list of output pairs.

    The input pair is always emitted unchanged. When the rating equals 5, an
    extra ``("unique-5", movieid)`` pair is emitted after it, so that movies
    which received a 5 can later be gathered under the ``"unique-5"`` key.

    Parameters
    ----------
    keyvalue : sequence
        Indexable pair whose element 0 is the movie id and element 1 the rating.

    Returns
    -------
    list
        ``[(movieid, rating)]`` or
        ``[(movieid, rating), ("unique-5", movieid)]``.
    """
    movie, score = keyvalue[0], keyvalue[1]
    emitted = [(movie, score)]
    if score == 5:
        # Tag the movie as having received a top rating
        emitted.append(("unique-5", movie))
    return emitted

In [17]:
# Apply flatten to every pair; flatMap concatenates the returned lists,
# so a rating of 5 contributes two elements and any other rating one
flattened = filteredRating.flatMap(flatten)

In [18]:
# Inspect the flattened pairs, including the added ("unique-5", movieid) entries
flattened.collect()

[('m1', 5),
 ('unique-5', 'm1'),
 ('m2', 4),
 ('m1', 2),
 ('m1', 2),
 ('m2', 3),
 ('m3', 5),
 ('unique-5', 'm3'),
 ('m3', 4),
 ('m3', 4),
 ('m3', 3),
 ('m1', 3),
 ('m2', 5),
 ('unique-5', 'm2'),
 ('m4', 5),
 ('unique-5', 'm4'),
 ('m5', 5),
 ('unique-5', 'm5'),
 ('m3', 5),
 ('unique-5', 'm3'),
 ('m2', 4),
 ('m4', 4)]

In [19]:
# Gather all values that share a key and materialise each group as a list
group = flattened.groupByKey().mapValues(list)

In [20]:
# Inspect the grouped data: one (key, list_of_values) entry per key
group.collect()

[('m1', [5, 2, 2, 3]),
 ('unique-5', ['m1', 'm3', 'm2', 'm4', 'm5', 'm3']),
 ('m2', [4, 3, 5, 4]),
 ('m3', [5, 4, 4, 3, 5]),
 ('m4', [5, 4]),
 ('m5', [5])]

In [21]:
def reduction_finder(keyvalue):
    """Reduce one ``(key, values)`` group to its final output entry.

    * For the key ``'unique-5'`` the values are de-duplicated into a set.
    * For any other key the values are averaged; the ``(key, average)`` pair
      is returned only when the average is at least 2.5.

    Parameters
    ----------
    keyvalue : sequence
        Indexable pair whose element 0 is the key and element 1 a sized
        collection of values (ratings, or movie ids for ``'unique-5'``).

    Returns
    -------
    tuple or None
        ``('unique-5', set_of_values)``, ``(key, average)``, or ``None`` when
        the average is below 2.5.
    """
    label = keyvalue[0]
    items = keyvalue[1]
    if label != 'unique-5':
        mean = sum(items) / len(items)
        # Groups averaging below 2.5 produce no pair; the caller receives None
        return (label, mean) if mean >= 2.5 else None
    return (label, set(items))

In [22]:
# Apply reduction_finder to every (key, values) group.
# reduction_finder returns None for a movie whose average rating is below 2.5,
# so those entries are filtered out to keep `result` free of None placeholders.
result = group.map(reduction_finder).filter(lambda entry: entry is not None)

In [23]:
# Inspect the final output: average rating per movie plus the set of
# movies collected under the 'unique-5' key
result.collect()

[('m1', 3.0),
 ('unique-5', {'m1', 'm2', 'm3', 'm4', 'm5'}),
 ('m2', 4.0),
 ('m3', 4.2),
 ('m4', 4.5),
 ('m5', 5.0)]

In [30]:
from math import sqrt
 
def isPrime(n):
    """Classify ``n`` by trial division and return a value based on the outcome.

    NOTE(review): despite the name, only the ``n <= 1`` branch returns a
    boolean - confirm that the other return values are intended.

    Returns
    -------
    False
        When ``n <= 1``.
    n + 1
        When some integer in ``2 .. int(sqrt(n))`` divides ``n`` evenly.
    (n, n + 2)
        When no such divisor exists.
    """
    # Values at or below 1 are rejected outright
    if n <= 1:
        return False

    # Try every candidate divisor from 2 up to the integer part of sqrt(n)
    limit = int(sqrt(n))
    divisor = 2
    while divisor <= limit:
        if n % divisor == 0:
            return n + 1
        divisor += 1

    return (n, n + 2)
 
 


In [33]:
# 8 is divisible by 2, so isPrime takes its "divisor found" branch and returns 8 + 1
isPrime(8)

9

TypeError: unsupported operand type(s) for %: 'list' and 'int'