### Creating Spark session
Spark session is "the gateway to the structured data processing".
It can be used to create datasets, dataframes, user defined functions and execute SQL.
It replaces SQLContext used in previous versions of Apache Spark.

In [1]:
import sagemaker_pyspark
from pyspark.sql import SparkSession

# This enables s3 support in Spark. You may need to restart the kernel!
classpath = ":".join(sagemaker_pyspark.classpath_jars())

spark = SparkSession.builder \
    .master("local") \
    .appName("Teemuko mle") \
    .config("spark.driver.extraClassPath", classpath) \
    .getOrCreate()

### Loading the CSV-data from S3

In [2]:
# Movie ratings data
filePath = "s3a://sagemaker-tmukoo/ratings-5000.csv"
#filePath = "s3a://sagemaker-tmukoo/ratings.csv"


ratings = spark.read.load(filePath, format="csv", inferSchema="true", header="true")
#df=spark.read.csv(filePath,header=True)

In [3]:
# Movie metadata
filePath = "s3a://sagemaker-tmukoo/movies_metadata.csv"


movies = spark.read.load(filePath, format="csv", inferSchema="true", header="true")
#df=spark.read.csv(filePath,header=True)

## Data exploration

In [4]:
ratings.take(3)

[Row(userId=1, movieId=110, rating=1.0, timestamp=1425941529),
 Row(userId=1, movieId=147, rating=4.5, timestamp=1425942435),
 Row(userId=1, movieId=858, rating=5.0, timestamp=1425941523)]

In [85]:
# Find movieId:s and ratings for a specific user
def users_ratings_df(user):
    return ratings.filter(ratings["userId"]==user).select('movieId','rating')
#.collect()

users_ratings_df(2).collect()
#for movieId,rating in users_ratings(3):
#    print(movieId)

[Row(movieId=5, rating=3.0),
 Row(movieId=25, rating=3.0),
 Row(movieId=32, rating=2.0),
 Row(movieId=58, rating=3.0),
 Row(movieId=64, rating=4.0),
 Row(movieId=79, rating=4.0),
 Row(movieId=141, rating=3.0),
 Row(movieId=260, rating=4.0),
 Row(movieId=339, rating=5.0),
 Row(movieId=377, rating=4.0),
 Row(movieId=605, rating=4.0),
 Row(movieId=628, rating=4.0),
 Row(movieId=648, rating=4.0),
 Row(movieId=762, rating=3.0),
 Row(movieId=780, rating=3.0),
 Row(movieId=786, rating=1.0),
 Row(movieId=788, rating=1.0),
 Row(movieId=1210, rating=4.0),
 Row(movieId=1233, rating=4.0),
 Row(movieId=1356, rating=5.0),
 Row(movieId=1475, rating=3.0),
 Row(movieId=1552, rating=2.0)]

In [5]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [6]:
movies.printSchema()

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

ratings.describe().show()

In [7]:
# How many distinct userIds?
ratings.select('userId').distinct().count()

49

In [8]:
# How many distinct movieIds?
ratings.select('movieId').distinct().count()

2347

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rc("font",size=15)
ratings.select('rating').toPandas().rating.sort_values().value_counts(sort=False).plot(kind='bar')
plt.title('Rating distribution')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

<Figure size 640x480 with 1 Axes>

#### Overall statistics 
from pyspark.sql.functions import mean, min, max
ratings.select([mean('rating'), min('rating'), max('rating')]).show()

In [10]:
# Average rating per movie and rating counts
average_ratings=ratings.groupBy(ratings.movieId).agg({"movieId": "count", "rating": "avg"}).orderBy(["count(movieId)"],ascending=0)
average_ratings.join(movies, movies.id == ratings.movieId).select('movieId','title','avg(rating)','count(movieId)').show()


+-------+--------------------+------------------+--------------+
|movieId|               title|       avg(rating)|count(movieId)|
+-------+--------------------+------------------+--------------+
|   1408|    Cutthroat Island|               4.0|             2|
|    524|              Casino|              2.25|             2|
|      5|          Four Rooms|               3.5|             2|
|    902| {'name': 'Victoi...|             3.875|             4|
|     63|      Twelve Monkeys|               2.5|             1|
|   2054|  Mr. Holland's Opus|2.5833333333333335|             6|
|    880|      Antonia's Line|               2.5|             2|
|    568|           Apollo 13|               3.0|             1|
|   1873|      Beyond Rangoon|               2.5|             2|
|   3512|Under Siege 2: Da...|               3.0|             1|
|   1909|    Don Juan DeMarco|               4.0|             3|
|   4954|           Drop Zone|               3.0|             1|
|    628|Interview with t

### Starting to arrange data for analysis

In [11]:
# Create a movieId vs userId average rating pivot (using average if the movie has been rated twice) 
ratings_pivot=ratings.groupBy('userId')\
.pivot('movieId')\
.agg({"rating": "avg"})

In [12]:
#specific_film_ratings=ratings_pivot.select('858')
#from pyspark.sql import functions as F
#ratings_pivot.filter(specific_film_ratings._1.isNotNull)
ratings_pivot.filter(ratings_pivot['858'].isNotNull()).show()

+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+--

### Machine learning to the rescue!
Note! SparkML will eventually replace MLlib. => don't use MLlib.

Example. ALS based CF: https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html

In [13]:
# DON'T USE! from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

#lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
#parts = lines.map(lambda row: row.value.split("::"))
#ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]), timestamp=long(p[3])))
#ratings = spark.createDataFrame(ratingsRDD)

(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

Root-mean-square error = 3.2850568169275345


In [14]:
# Tämä kai olisi kullekin käyttäjälle ALS:n suosittelemat leffat
userRecs.take(3)

[Row(userId=31, recommendations=[Row(movieId=1097, rating=7.6576385498046875), Row(movieId=1994, rating=5.209170818328857), Row(movieId=592, rating=5.200376510620117), Row(movieId=1968, rating=5.023256301879883), Row(movieId=2018, rating=5.000657081604004), Row(movieId=3755, rating=4.975311279296875), Row(movieId=1997, rating=4.782990455627441), Row(movieId=1645, rating=4.773164749145508), Row(movieId=4643, rating=4.739664554595947), Row(movieId=2160, rating=4.5597686767578125)]),
 Row(userId=34, recommendations=[Row(movieId=480, rating=8.529111862182617), Row(movieId=858, rating=7.1305742263793945), Row(movieId=1196, rating=5.777863025665283), Row(movieId=527, rating=5.33332633972168), Row(movieId=2959, rating=5.275406837463379), Row(movieId=296, rating=5.187756538391113), Row(movieId=1193, rating=5.1683349609375), Row(movieId=1250, rating=5.078431606292725), Row(movieId=3527, rating=5.064661026000977), Row(movieId=2804, rating=5.05409574508667)]),
 Row(userId=28, recommendations=[Row

In [15]:
movieRecs.take(3)

[Row(movieId=1580, recommendations=[Row(userId=43, rating=5.450592517852783), Row(userId=17, rating=5.267217636108398), Row(userId=46, rating=4.478673934936523), Row(userId=16, rating=4.385912895202637), Row(userId=7, rating=4.122302532196045), Row(userId=49, rating=4.041903972625732), Row(userId=20, rating=3.991048574447632), Row(userId=2, rating=3.975433826446533), Row(userId=13, rating=3.9050474166870117), Row(userId=5, rating=3.881096839904785)]),
 Row(movieId=471, recommendations=[Row(userId=24, rating=2.9986910820007324), Row(userId=12, rating=2.4576220512390137), Row(userId=23, rating=2.3288187980651855), Row(userId=26, rating=1.7785141468048096), Row(userId=27, rating=1.61484956741333), Row(userId=43, rating=1.558711051940918), Row(userId=44, rating=1.528580665588379), Row(userId=1, rating=1.4704351425170898), Row(userId=38, rating=1.4432333707809448), Row(userId=8, rating=1.4269739389419556)]),
 Row(movieId=1591, recommendations=[Row(userId=8, rating=4.005098342895508), Row(us

In [16]:
movies.select('id','title').filter("title like '%Toy Story'").show()

+---+---------+
| id|    title|
+---+---------+
|862|Toy Story|
+---+---------+



In [17]:
# Mitä tää data kertoo? Elokuvan perusteella lähinnä mun makua olevat käyttäjät?
# Onko tää joku Tinderin korvike?
movieRecs.filter("movieId = 1580").select('recommendations').collect()

[Row(recommendations=[Row(userId=43, rating=5.450592517852783), Row(userId=17, rating=5.267217636108398), Row(userId=46, rating=4.478673934936523), Row(userId=16, rating=4.385912895202637), Row(userId=7, rating=4.122302532196045), Row(userId=49, rating=4.041903972625732), Row(userId=20, rating=3.991048574447632), Row(userId=2, rating=3.975433826446533), Row(userId=13, rating=3.9050474166870117), Row(userId=5, rating=3.881096839904785)])]

### Palataan takaisin manuaaliseen recommendation enginen koodaukseen...

In [18]:
# Pearson correlation test
# Result... nulls are counted as zeros in Spark => crap results
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]

data = [(Vectors.dense([7,6,7,4,5,4]),),
        (Vectors.sparse(6, [(0,6), (1,7), (3,4), (4,3), (5,4) ]),),
        (Vectors.sparse(6, [(1,3),(2,3),(3,1),(4,1)]),),
        (Vectors.dense([1,2,2,3,3,4]),),
        (Vectors.sparse(6, [(0,1),(2,1),(3,2),(4,3),(5,3)]),),
        (Vectors.sparse(6, [(0,5),(1,4),(3,3),(5,4)]),)]

df = spark.createDataFrame(data, ["features"])

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))

Pearson correlation matrix:
DenseMatrix([[ 1.        ,  0.84034307,  0.21809502,  0.87118712,  0.33952193,
               0.64952824],
             [ 0.84034307,  1.        ,  0.21521103,  0.70676177,  0.21997067,
               0.30621273],
             [ 0.21809502,  0.21521103,  1.        ,  0.14043594,  0.6240288 ,
              -0.10247663],
             [ 0.87118712,  0.70676177,  0.14043594,  1.        ,  0.53441734,
               0.87208494],
             [ 0.33952193,  0.21997067,  0.6240288 ,  0.53441734,  1.        ,
               0.38996632],
             [ 0.64952824,  0.30621273, -0.10247663,  0.87208494,  0.38996632,
               1.        ]])
Spearman correlation matrix:
DenseMatrix([[ 1.        ,  0.75370235, -0.08823529,  0.94040326,  0.52363494,
               0.77174363],
             [ 0.75370235,  1.        , -0.14494276,  0.76503685,  0.15179419,
               0.54089872],
             [-0.08823529, -0.14494276,  1.        , -0.11941629,  0.49283288,
       

In [130]:
# Testing spark sum aggregation for user's ratings
both_rated=[5,25,32]
person1_preferences_sum = users_ratings(2).filter(users_ratings(2).movieId.isin(both_rated))\
.agg({"rating": "sum"}).select('sum(rating)').collect()[0][0]
#.groupBy().power('rating',2)
#    
#    .agg({"movieId": "sum"})
print(person1_preferences_sum)

8.0


In [140]:
person1=2
person1_square_preferences_sum = sum(pow(users_ratings(person1).\
filter(users_ratings(person1).movieId.isin(both_rated)).select('rating'),2))
print(person1_square_preferences_sum)

TypeError: unsupported operand type(s) for ** or pow(): 'DataFrame' and 'int'

In [52]:
def pearson_correlation(person1,person2):
 
    # To get both rated items
    both_rated = {}
    #for item in dataset[person1]:
    #    if item in dataset[person2]:
    #        both_rated[item] = 1
 
    for movieId,rating in users_ratings(person1).collect():
        if movieId in users_ratings(person2).collect():
            both_rated[movieId] = 1
            
    number_of_ratings = len(both_rated)
    
    # Checking for number of ratings in common
    if number_of_ratings == 0:
        return 0
 
    # Add up all the preferences of each user
#    person1_preferences_sum = sum([dataset[person1][item] for item in both_rated])
#    person2_preferences_sum = sum([dataset[person2][item] for item in both_rated])

    person1_preferences_sum = users_ratings(person1).\
    filter(users_ratings(person1).movieId.isin(both_rated))\
    .agg({"rating": "sum","rating": "pow"}).select('sum(rating)').collect()[0][0]
    person2_preferences_sum = users_ratings(person2).\
    filter(users_ratings(person2).movieId.isin(both_rated))\
    .agg({"rating": "sum"}).select('sum(rating)').collect()[0][0]
    
    
    # Sum up the squares of preferences of each user
#    person1_square_preferences_sum = sum([pow(dataset[person1][item],2) for item in both_rated])
#    person2_square_preferences_sum = sum([pow(dataset[person2][item],2) for item in both_rated])

    person1_square_preferences_sum = sum(users_ratings(person1).\
    filter(users_ratings(person1).movieId.isin(both_rated)).select('rating'),2)
 
 
    # Sum up the product value of both preferences for each item
    product_sum_of_both_users = sum([dataset[person1][item] * dataset[person2][item] for item in both_rated])
 
    # Calculate the pearson score
    numerator_value = product_sum_of_both_users - (person1_preferences_sum*person2_preferences_sum/number_of_ratings)
    denominator_value = sqrt((person1_square_preferences_sum - pow(person1_preferences_sum,2)/number_of_ratings) * (person2_square_preferences_sum -pow(person2_preferences_sum,2)/number_of_ratings))
    if denominator_value == 0:
        return 0
    else:
        r = numerator_value/denominator_value
        return r
    
dataset=data
for item in dataset[1]:
    print(item)
#print(pearson_correlation(1,2))
#print pearson_correlation('Lisa Rose','Gene Seymour')

(6,[0,1,3,4,5],[6.0,7.0,4.0,3.0,4.0])


In [19]:
#From https://blog.epigno.systems/2018/02/21/machine-learning-with-pyspark-feature-selection/
#prepare the data
features = ["temperature", "exhaust_vacuum", "ambient_pressure", "relative_humidity"]
lr_data = data.select(col("energy_output").alias("label"), *features).dropna()

vector = VectorAssembler(inputCols=columns, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

#stages = [vector, scaler]
stages = [scaler]

pipe = Pipeline(stages=stages)

# we'll be using this data frame
data_for_correlation = pipe.fit(lr_data).transform(lr_data).select("scaled_features")


#The correlation step
correlation = Correlation.corr(data_for_correlation, "scaled_features", "pearson").collect()[0][0].toArray()

 # rename _1, _2 ... columns to their original name
df = pd.DataFrame(correlation)
df["features"] = pd.Series(columns)

 # let's see the results
display(spark.createDataFrame(df, schema=columns))

AttributeError: 'list' object has no attribute 'select'

import pandas as pd
spark.createDataFrame(data).toPandas()

In [None]:
# count mean ratings


pearsonCorr = Correlation.corr(df, "features").collect()[0][0]
print(str(pearsonCorr).replace('nan', 'NaN'))

#import numpy as np
#np.average((Vectors.sparse(4, [(0, 2.0), (3, 2.0)]),))
#neo=spark.createDataFrame([(Vectors.sparse(4, [(0, 2.0), (3, 2.0)]),)], ["rating"])

from pyspark.sql.functions import mean, min, max
#neo.select([mean('rating'), min('rating'), max('rating')]).show()

df[0]