### Creating Spark session
Spark session is "the gateway to the structured data processing".
It can be used to create datasets, dataframes, user defined functions and execute SQL.
It replaces SQLContext used in previous versions of Apache Spark.

In [1]:
import sagemaker_pyspark
from pyspark.sql import SparkSession

# This enables s3 support in Spark. You may need to restart the kernel!
classpath = ":".join(sagemaker_pyspark.classpath_jars())

spark = SparkSession.builder \
    .master("local") \
    .appName("Teemuko mle") \
    .config("spark.driver.extraClassPath", classpath) \
    .getOrCreate()

### Loading the CSV-data from S3

In [2]:
# Movie ratings data
filePath = "s3a://sagemaker-tmukoo/ratings-5000.csv"
#filePath = "s3a://sagemaker-tmukoo/ratings.csv"


ratings = spark.read.load(filePath, format="csv", inferSchema="true", header="true")
#df=spark.read.csv(filePath,header=True)

In [3]:
# Movie metadata
filePath = "s3a://sagemaker-tmukoo/movies_metadata.csv"


movies = spark.read.load(filePath, format="csv", inferSchema="true", header="true")
#df=spark.read.csv(filePath,header=True)

## Data exploration

In [4]:
ratings.take(3)

[Row(userId=1, movieId=110, rating=1.0, timestamp=1425941529),
 Row(userId=1, movieId=147, rating=4.5, timestamp=1425942435),
 Row(userId=1, movieId=858, rating=5.0, timestamp=1425941523)]

In [5]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [6]:
movies.printSchema()

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

ratings.describe().show()

In [7]:
# How many distinct userIds?
ratings.select('userId').distinct().count()

49

In [8]:
# How many distinct movieIds?
ratings.select('movieId').distinct().count()

2347

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rc("font",size=15)
ratings.select('rating').toPandas().rating.sort_values().value_counts(sort=False).plot(kind='bar')
plt.title('Rating distribution')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

<Figure size 640x480 with 1 Axes>

#### Overall statistics 
from pyspark.sql.functions import mean, min, max
ratings.select([mean('rating'), min('rating'), max('rating')]).show()

In [10]:
# Average rating per movie and rating counts
average_ratings=ratings.groupBy(ratings.movieId).agg({"movieId": "count", "rating": "avg"}).orderBy(["count(movieId)"],ascending=0)
average_ratings.join(movies, movies.id == ratings.movieId).select('movieId','title','avg(rating)','count(movieId)').show()


+-------+--------------------+------------------+--------------+
|movieId|               title|       avg(rating)|count(movieId)|
+-------+--------------------+------------------+--------------+
|   1408|    Cutthroat Island|               4.0|             2|
|    524|              Casino|              2.25|             2|
|      5|          Four Rooms|               3.5|             2|
|    902| {'name': 'Victoi...|             3.875|             4|
|     63|      Twelve Monkeys|               2.5|             1|
|   2054|  Mr. Holland's Opus|2.5833333333333335|             6|
|    880|      Antonia's Line|               2.5|             2|
|    568|           Apollo 13|               3.0|             1|
|   1873|      Beyond Rangoon|               2.5|             2|
|   3512|Under Siege 2: Da...|               3.0|             1|
|   1909|    Don Juan DeMarco|               4.0|             3|
|   4954|           Drop Zone|               3.0|             1|
|    628|Interview with t

### Define helper functions

In [11]:
# Find movieId:s and ratings for a specific user
def users_ratings_df(user):
    return ratings.filter(ratings["userId"]==user).select('movieId','rating')
#.collect()

users_ratings_df(2).collect()
#for movieId,rating in users_ratings(3):
#    print(movieId)

[Row(movieId=5, rating=3.0),
 Row(movieId=25, rating=3.0),
 Row(movieId=32, rating=2.0),
 Row(movieId=58, rating=3.0),
 Row(movieId=64, rating=4.0),
 Row(movieId=79, rating=4.0),
 Row(movieId=141, rating=3.0),
 Row(movieId=260, rating=4.0),
 Row(movieId=339, rating=5.0),
 Row(movieId=377, rating=4.0),
 Row(movieId=605, rating=4.0),
 Row(movieId=628, rating=4.0),
 Row(movieId=648, rating=4.0),
 Row(movieId=762, rating=3.0),
 Row(movieId=780, rating=3.0),
 Row(movieId=786, rating=1.0),
 Row(movieId=788, rating=1.0),
 Row(movieId=1210, rating=4.0),
 Row(movieId=1233, rating=4.0),
 Row(movieId=1356, rating=5.0),
 Row(movieId=1475, rating=3.0),
 Row(movieId=1552, rating=2.0)]

### Pivoting data for analysis

# Create a movieId vs userId average rating pivot (using average if the movie has been rated twice) 
ratings_pivot=ratings.groupBy('userId')\
.pivot('movieId')\
.agg({"rating": "avg"})

#specific_film_ratings=ratings_pivot.select('858')
#from pyspark.sql import functions as F
#ratings_pivot.filter(specific_film_ratings._1.isNotNull)
ratings_pivot.filter(ratings_pivot['858'].isNotNull()).show()

# Machine learning to the rescue!
Note! SparkML will eventually replace MLlib. => don't use MLlib.

## ALS based Collaborative Filtering
https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html

In [12]:
# DON'T USE! from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

#lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
#parts = lines.map(lambda row: row.value.split("::"))
#ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]), timestamp=long(p[3])))
#ratings = spark.createDataFrame(ratingsRDD)

(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

Root-mean-square error = 2.8625903877937433


In [13]:
# Tämä kai olisi kullekin käyttäjälle ALS:n suosittelemat leffat
userRecs.take(3)

[Row(userId=31, recommendations=[Row(movieId=2018, rating=4.999802589416504), Row(movieId=3755, rating=4.981315612792969), Row(movieId=1377, rating=4.500730037689209), Row(movieId=1358, rating=4.5001749992370605), Row(movieId=2006, rating=4.499944686889648), Row(movieId=2087, rating=4.499881267547607), Row(movieId=520, rating=3.9991908073425293), Row(movieId=2109, rating=3.1323153972625732), Row(movieId=168252, rating=3.102997303009033), Row(movieId=2294, rating=2.9996719360351562)]),
 Row(userId=34, recommendations=[Row(movieId=102125, rating=6.458173751831055), Row(movieId=2804, rating=5.75777530670166), Row(movieId=593, rating=5.495298385620117), Row(movieId=7153, rating=5.1707329750061035), Row(movieId=1196, rating=5.09976053237915), Row(movieId=60069, rating=5.090648651123047), Row(movieId=168252, rating=5.069170951843262), Row(movieId=1250, rating=5.0691328048706055), Row(movieId=44555, rating=5.056756496429443), Row(movieId=3471, rating=5.056451797485352)]),
 Row(userId=28, reco

In [14]:
movieRecs.take(3)

[Row(movieId=1580, recommendations=[Row(userId=21, rating=7.619421005249023), Row(userId=25, rating=5.521111488342285), Row(userId=36, rating=5.4626593589782715), Row(userId=33, rating=5.411818981170654), Row(userId=16, rating=4.793625354766846), Row(userId=7, rating=4.692002773284912), Row(userId=46, rating=4.632454872131348), Row(userId=41, rating=4.475106239318848), Row(userId=40, rating=4.354674339294434), Row(userId=20, rating=4.208553791046143)]),
 Row(movieId=471, recommendations=[Row(userId=28, rating=4.085762977600098), Row(userId=24, rating=2.9981555938720703), Row(userId=7, rating=2.2341885566711426), Row(userId=8, rating=2.050200939178467), Row(userId=3, rating=1.9684978723526), Row(userId=34, rating=1.7562252283096313), Row(userId=6, rating=1.5029963254928589), Row(userId=35, rating=1.451230764389038), Row(userId=47, rating=1.4340031147003174), Row(userId=44, rating=1.3760236501693726)]),
 Row(movieId=1591, recommendations=[Row(userId=8, rating=4.002291679382324), Row(user

In [15]:
movies.select('id','title').filter("title like '%Toy Story'").show()

+---+---------+
| id|    title|
+---+---------+
|862|Toy Story|
+---+---------+



In [16]:
# Mitä tää data kertoo? Elokuvan perusteella lähinnä mun makua olevat käyttäjät?
# Onko tää joku Tinderin korvike?
movieRecs.filter("movieId = 1580").select('recommendations').collect()

[Row(recommendations=[Row(userId=21, rating=7.619421005249023), Row(userId=25, rating=5.521111488342285), Row(userId=36, rating=5.4626593589782715), Row(userId=33, rating=5.411818981170654), Row(userId=16, rating=4.793625354766846), Row(userId=7, rating=4.692002773284912), Row(userId=46, rating=4.632454872131348), Row(userId=41, rating=4.475106239318848), Row(userId=40, rating=4.354674339294434), Row(userId=20, rating=4.208553791046143)])]

# Palataan takaisin manuaaliseen recommendation enginen koodaukseen...

# Pearson correlation test
# Result... nulls are counted as zeros in Spark => crap results
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]

data = [(Vectors.dense([7,6,7,4,5,4]),),
        (Vectors.sparse(6, [(0,6), (1,7), (3,4), (4,3), (5,4) ]),),
        (Vectors.sparse(6, [(1,3),(2,3),(3,1),(4,1)]),),
        (Vectors.dense([1,2,2,3,3,4]),),
        (Vectors.sparse(6, [(0,1),(2,1),(3,2),(4,3),(5,3)]),),
        (Vectors.sparse(6, [(0,5),(1,4),(3,3),(5,4)]),)]

df = spark.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))

### Pandas UDF

In [17]:
# This installs pyarrow to the conda_python3 kernel environment
# conda install --yes --name python3 --channel conda-forge pyarrow
#from pyspark.sql.functions import pandas_udf, PandasUDFType

# Use pandas_udf to define a Pandas UDF
#@pandas_udf('double', PandasUDFType.SCALAR)
# Input/output are both a pandas.Series of doubles

#def pandas_plus_one(v):
#    return v + 1

#@pandas_udf('double', PandasUDFType.SCALAR)
# Input/output are both a pandas.Series of doubles

#def pandas_power2(v):
#    return v*v

#df.withColumn('v2', pandas_plus_one(df.v))

person1=2
person1_square_preferences_sum = sum(\
                                     users_ratings_df(person1).\
filter(users_ratings_df(person1).movieId.isin(both_rated)).select('rating')*\
                                      users_ratings_df(person1).\
filter(users_ratings_df(person1).movieId.isin(both_rated)).select('rating')\
                                      )

# Pearson correlation calculation with Spark join

In [23]:
# Method to find common MovieIds
from pyspark.sql.functions import *
from math import sqrt 
person1_ratings = users_ratings_df(34) #.alias('df1')
person2_ratings = users_ratings_df(2) #.alias('df2')

#common_ratings = person1_ratings.withColumnRenamed("rating", "rating1").join(person2_ratings.withColumnRenamed("rating", "rating2"), \

# Join the ratings first, so that columns can be referred in calculations
common_ratings = person1_ratings.toDF('movieId','rating1').join(person2_ratings.toDF('movieId','rating2'), "movieId")

common_ratings_calc = common_ratings.select(common_ratings.movieId,common_ratings.rating1,common_ratings.rating2,\
                                            (pow(common_ratings.rating1,2)).alias('rating1^2'),\
                                            (pow(common_ratings.rating2,2)).alias('rating2^2'),\
                                            (common_ratings.rating1*common_ratings.rating2).alias('ratings_product')\
                                           )                                                                
common_ratings.show()
common_ratings_calc.show()

+-------+-------+-------+
|movieId|rating1|rating2|
+-------+-------+-------+
|     32|    5.0|    2.0|
|    260|    5.0|    4.0|
|    377|    3.0|    4.0|
|    648|    3.0|    4.0|
|    780|    5.0|    3.0|
|   1210|    5.0|    4.0|
|   1552|    2.0|    2.0|
+-------+-------+-------+

+-------+-------+-------+---------+---------+---------------+
|movieId|rating1|rating2|rating1^2|rating2^2|ratings_product|
+-------+-------+-------+---------+---------+---------------+
|     32|    5.0|    2.0|     25.0|      4.0|           10.0|
|    260|    5.0|    4.0|     25.0|     16.0|           20.0|
|    377|    3.0|    4.0|      9.0|     16.0|           12.0|
|    648|    3.0|    4.0|      9.0|     16.0|           12.0|
|    780|    5.0|    3.0|     25.0|      9.0|           15.0|
|   1210|    5.0|    4.0|     25.0|     16.0|           20.0|
|   1552|    2.0|    2.0|      4.0|      4.0|            4.0|
+-------+-------+-------+---------+---------+---------------+



In [18]:
from pyspark.sql.functions import *
from math import sqrt,pow 

def pearson_correlation(person1,person2):

##### To get both rated items
#    both_rated = {}
    #for item in dataset[person1]:
    #    if item in dataset[person2]:
    #        both_rated[item] = 1
 
#    for movieId,rating in users_ratings_df(person1).collect():
#        if movieId in users_ratings_df(person2).collect():
#            both_rated[movieId] = 1

#    person1_ratings = users_ratings_df(34).alias('df1')
#    person2_ratings = users_ratings_df(2).alias('df2')
#    both_rated=person1_ratings.join(person2_ratings, df1.movieId == df2.movieId)\
#    .select('df1.*').select('movieId').rdd.flatMap(lambda x: x).collect()
    
#    return both_rated
###########################


#    number_of_ratings = len(both_rated)
    
    # Checking for number of ratings in common
#    if number_of_ratings == 0:
#        return 0
 
    # Add up all the preferences of each user
#    person1_preferences_sum = sum([dataset[person1][item] for item in both_rated])
#    person2_preferences_sum = sum([dataset[person2][item] for item in both_rated])

#    person1_preferences_sum = users_ratings_df(person1).\
#    filter(users_ratings_df(person1).movieId.isin(both_rated))\
#    .agg({"rating": "sum","rating": "avg"}).select('sum(rating)').collect()[0][0]
        
#    person2_preferences_sum = users_ratings(person2).\
#    filter(users_ratings_df(person2).movieId.isin(both_rated))\
#    .agg({"rating": "sum"}).select('sum(rating)').collect()[0][0]
    
    
    # Sum up the squares of preferences of each user
#    person1_square_preferences_sum = sum([pow(dataset[person1][item],2) for item in both_rated])
#    person2_square_preferences_sum = sum([pow(dataset[person2][item],2) for item in both_rated])

#    person1_square_preferences_sum = sum(users_ratings_df(person1).\
#    filter(users_ratings_df(person1).movieId.isin(both_rated)).select('rating'),2)
 
 
    # Sum up the product value of both preferences for each item
#    product_sum_of_both_users = sum([dataset[person1][item] * dataset[person2][item] for item in both_rated])

    
    # Spark way of calculating all the above:
    # Method to find common MovieIds
    
    person1_ratings = users_ratings_df(person1) #.alias('df1')
    person2_ratings = users_ratings_df(person2) #.alias('df2')

    # Join the ratings first, so that columns can be referred in calculations
    common_ratings = person1_ratings.toDF('movieId','rating1').join(person2_ratings.toDF('movieId','rating2'), "movieId")

    common_ratings_calc = common_ratings.select(common_ratings.movieId,common_ratings.rating1,common_ratings.rating2,\
                                            (common_ratings.rating1*common_ratings.rating1).alias('rating1^2'),\
                                            (common_ratings.rating2*common_ratings.rating2).alias('rating2^2'),\
                                            (common_ratings.rating1*common_ratings.rating2).alias('ratings_product')\
                                           )
    
    common_ratings_agg = common_ratings_calc.agg({\
                                             "movieId":"count",\
                                             "rating1": "sum",\
                                             "rating2": "sum",\
                                             "rating1^2": "sum",\
                                             "rating2^2": "sum",\
                                             "ratings_product": "sum"}).collect()[0]
    
    # Unpacking the numebers from the named tuple:
    (number_of_ratings,\
     person1_preferences_sum,\
     person2_preferences_sum,\
     person1_square_preferences_sum,\
     person2_square_preferences_sum,\
     product_sum_of_both_users) = \
    (common_ratings_agg["count(movieId)"],\
     common_ratings_agg["sum(rating1)"],\
     common_ratings_agg["sum(rating2)"],\
     common_ratings_agg["sum(rating1^2)"],\
     common_ratings_agg["sum(rating2^2)"],\
     common_ratings_agg["sum(ratings_product)"],\
    )
    
    # Checking for number of common ratings
    if number_of_ratings == 0:
        return 0

    # Calculate the pearson score
    numerator_value = product_sum_of_both_users - (person1_preferences_sum*person2_preferences_sum/number_of_ratings)
    denominator_value = sqrt((person1_square_preferences_sum - pow(person1_preferences_sum,2)/number_of_ratings) * (person2_square_preferences_sum -pow(person2_preferences_sum,2)/number_of_ratings))
    if denominator_value == 0:
        return 0
    else:
        r = numerator_value/denominator_value
        return r
    
print(pearson_correlation(2,34))


0.1357241785076592


In [22]:
#From https://blog.epigno.systems/2018/02/21/machine-learning-with-pyspark-feature-selection/
#prepare the data
features = ["temperature", "exhaust_vacuum", "ambient_pressure", "relative_humidity"]
lr_data = data.select(col("energy_output").alias("label"), *features).dropna()

vector = VectorAssembler(inputCols=columns, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

#stages = [vector, scaler]
stages = [scaler]

pipe = Pipeline(stages=stages)

# we'll be using this data frame
data_for_correlation = pipe.fit(lr_data).transform(lr_data).select("scaled_features")


#The correlation step
correlation = Correlation.corr(data_for_correlation, "scaled_features", "pearson").collect()[0][0].toArray()

 # rename _1, _2 ... columns to their original name
df = pd.DataFrame(correlation)
df["features"] = pd.Series(columns)

 # let's see the results
display(spark.createDataFrame(df, schema=columns))

NameError: name 'data' is not defined

pearsonCorr = Correlation.corr(df, "features").collect()[0][0]
print(str(pearsonCorr).replace('nan', 'NaN'))