# Preprocessing MovieLens Data

In [1]:
import os
DATA_DIR = '../resources/data/ml-100k'
# Absolute locations of the user, item and rating files under DATA_DIR.
user_path, item_path, rating_path = (
    os.path.abspath(DATA_DIR + '/' + file_name)
    for file_name in ('u.user', 'u.item', 'u.data')
)

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, DoubleType, TimestampType
# Explicit schema for the user file: five nullable columns, in file order.
user_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("userId", IntegerType()),
        ("age", IntegerType()),
        ("gender", StringType()),
        ("occupation", StringType()),
        ("zipCode", StringType()),
    )
])

# Explicit schema for the item file: five descriptive columns followed by
# nineteen genre indicator columns. Every genre flag is declared as an
# integer; "western" was previously the only one declared as a string, which
# made it inconsistent with its eighteen siblings.
# NOTE(review): "imbdUrl" looks like a typo for "imdbUrl"; the name is kept
# unchanged because other cells may select the column by this name.
item_schema = StructType(
    [
        StructField("itemId", IntegerType(), True),
        StructField("title", StringType(), True),
        StructField("releaseDate", StringType(), True),
        StructField("videoReleaseDate", StringType(), True),
        StructField("imbdUrl", StringType(), True),
    ]
    + [
        StructField(genre, IntegerType(), True)
        for genre in (
            "unknown",
            "action",
            "adventure",
            "animation",
            "children",
            "comedy",
            "crime",
            "documentary",
            "drama",
            "fantasy",
            "noir",
            "horror",
            "musical",
            "mystery",
            "romance",
            "sciFi",
            "thriller",
            "war",
            "western",
        )
    ]
)

# Explicit schema for the rating file: four nullable columns, in file order.
rating_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("userId", IntegerType()),
        ("itemId", IntegerType()),
        ("rating", DoubleType()),
        ("timestamp", LongType()),
    )
])

In [3]:
# The user and item files are pipe-delimited; the rating file is tab-delimited.
userDf = spark.read.option("delimiter", "|").csv('file://' + user_path, schema=user_schema)
# NOTE(review): the stock MovieLens 100k u.item is ISO-8859-1 encoded (some
# titles contain accented characters). Spark's CSV reader decodes as UTF-8 by
# default, which garbles those titles. Confirm the encoding if this copy of
# the file has been re-encoded.
itemDf = (
    spark.read
    .option("delimiter", "|")
    .option("encoding", "ISO-8859-1")
    .csv('file://' + item_path, schema=item_schema)
)
ratingDf = spark.read.option("delimiter", "\t").csv('file://' + rating_path, schema=rating_schema)

## One-hot encoding

In [4]:
from pyspark.sql import functions as f
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Change categorical values to numerical values first
stringIndexer = StringIndexer(inputCol="occupation", outputCol="occupationIdx")
model = stringIndexer.fit(userDf)
indexedDf = model.transform(userDf)

# Create one hot encoding
encoder = OneHotEncoder(inputCol="occupationIdx", outputCol="occupationVec")
# On Spark >= 3.0 OneHotEncoder is an Estimator and must be fitted to obtain a
# model that can transform; on Spark 2.x it is a plain Transformer with no
# fit(). Handle both so the cell runs on either version.
encoderModel = encoder.fit(indexedDf) if hasattr(encoder, "fit") else encoder
encodedDf = encoderModel.transform(indexedDf)
encodedDf.show()

+------+---+------+-------------+-------+-------------+---------------+
|userId|age|gender|   occupation|zipCode|occupationIdx|  occupationVec|
+------+---+------+-------------+-------+-------------+---------------+
|     1| 24|     M|   technician|  85711|         11.0|(20,[11],[1.0])|
|     2| 53|     F|        other|  94043|          1.0| (20,[1],[1.0])|
|     3| 23|     M|       writer|  32067|          7.0| (20,[7],[1.0])|
|     4| 24|     M|   technician|  43537|         11.0|(20,[11],[1.0])|
|     5| 33|     F|        other|  15213|          1.0| (20,[1],[1.0])|
|     6| 42|     M|    executive|  98101|          8.0| (20,[8],[1.0])|
|     7| 57|     M|administrator|  91344|          3.0| (20,[3],[1.0])|
|     8| 36|     M|administrator|  05201|          3.0| (20,[3],[1.0])|
|     9| 29|     M|      student|  01002|          0.0| (20,[0],[1.0])|
|    10| 53|     M|       lawyer|  90703|         17.0|(20,[17],[1.0])|
|    11| 39|     F|        other|  30329|          1.0| (20,[1],

## Regex Extraction using Python

In [5]:
import numpy as np
def extract_title(raw):
    """Return the part of ``raw`` that precedes its first ``(word)`` group.

    The pattern matches an opening parenthesis, one or more word characters
    (letters, digits or underscore) and a closing parenthesis, e.g. ``(1995)``.
    Everything from the first such match onwards is dropped and the remainder
    is stripped of surrounding whitespace.

    Note that the first match wins: ``"Seven (Se7en) (1995)"`` becomes
    ``"Seven"``, while a parenthesised group containing spaces is not matched
    and therefore stays in the result.

    Args:
        raw: The raw title string.

    Returns:
        The truncated, stripped title, or ``raw`` unchanged when nothing matches.
    """
    import re
    # Raw string literal: "\(" and "\w" are not valid Python string escapes and
    # trigger a warning on modern Python when written in a normal string.
    grps = re.search(r"\((\w+)\)", raw)
    return raw[:grps.start()].strip() if grps else raw

# Collect the title column to the driver and clean each title with extract_title.
movie_titles = list(
    map(
        lambda title_row: extract_title(title_row[0]),
        np.array(itemDf.select("title").collect()),
    )
)

## Regex Extraction using PySpark

In [6]:
# Remove every " (word)" group from the title. The pattern is a raw string so
# that "\s", "\(" and "\w" reach the regex engine without relying on invalid
# Python string escapes, which trigger a warning on modern Python.
(
    itemDf
    .withColumn("title_text", f.regexp_replace('title', r'\s\((\w+)\)', ''))
    .select('title', 'title_text')
    .show()
)

+--------------------+--------------------+
|               title|          title_text|
+--------------------+--------------------+
|    Toy Story (1995)|           Toy Story|
|    GoldenEye (1995)|           GoldenEye|
|   Four Rooms (1995)|          Four Rooms|
|   Get Shorty (1995)|          Get Shorty|
|      Copycat (1995)|             Copycat|
|Shanghai Triad (Y...|Shanghai Triad (Y...|
|Twelve Monkeys (1...|      Twelve Monkeys|
|         Babe (1995)|                Babe|
|Dead Man Walking ...|    Dead Man Walking|
|  Richard III (1995)|         Richard III|
|Seven (Se7en) (1995)|               Seven|
|Usual Suspects, T...| Usual Suspects, The|
|Mighty Aphrodite ...|    Mighty Aphrodite|
|  Postino, Il (1994)|         Postino, Il|
|Mr. Holland's Opu...|  Mr. Holland's Opus|
|French Twist (Gaz...|French Twist (Gaz...|
|From Dusk Till Da...| From Dusk Till Dawn|
|White Balloon, Th...|  White Balloon, The|
|Antonia's Line (1...|      Antonia's Line|
|Angels and Insect...|  Angels a

## Manual one-hot encoding using SciPy sparse vectors

In [7]:
# Collect the terms of every title (split on single spaces) and flatten them
# into the list of distinct terms.
title_terms = [t.split(" ") for t in movie_titles]
# sorted() fixes the term order. Iterating a bare set of strings can give a
# different order on each run, which made the term -> index mapping
# irreproducible.
flattened_list = sorted({term for terms in title_terms for term in terms})
# Map each distinct term to its position in the sorted vocabulary.
vocab_dict = {term: idx for idx, term in enumerate(flattened_list)}

def create_vector(terms, term_dict):
    """Encode ``terms`` as a ``1 x len(term_dict)`` sparse indicator row.

    Each term found in ``term_dict`` sets the column at its mapped index to 1.
    Unknown terms are ignored, and a repeated term still yields a single
    stored 1.

    Args:
        terms: Iterable of terms to encode.
        term_dict: Mapping from term to column index.

    Returns:
        A ``scipy.sparse.csc_matrix`` of shape ``(1, len(term_dict))`` with
        float64 values.
    """
    from scipy import sparse as sp
    num_terms = len(term_dict)
    # De-duplicate the columns up front: the (data, (row, col)) constructor
    # sums duplicate coordinates, which would turn a repeated term into 2.
    cols = sorted({term_dict[t] for t in terms if t in term_dict})
    if not cols:
        return sp.csc_matrix((1, num_terms))
    # Build the matrix in one call. Assigning x[0, i] = 1 element by element
    # changes the sparsity structure of a CSC matrix on every write, which is
    # slow and raises SparseEfficiencyWarning.
    data = [1.0] * len(cols)
    rows = [0] * len(cols)
    return sp.csc_matrix((data, (rows, cols)), shape=(1, num_terms))

In [8]:
# Broadcast the vocabulary through the SparkContext, then encode every title's
# term list against the broadcast value.
vocab_dict_bcast = spark.sparkContext.broadcast(vocab_dict)
term_vectors = list(
    map(
        lambda title_term_list: create_vector(title_term_list, vocab_dict_bcast.value),
        title_terms,
    )
)
term_vectors



[<1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 8 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Column format>,
 <1x2645 sparse matrix of type '<class 'numpy.fl

## Normalizing

In [9]:
# Draw 10 standard-normal samples and scale the vector to unit L2 norm.
x = np.random.randn(10)
norm_x_2 = np.linalg.norm(x)
normalized_x = np.true_divide(x, norm_x_2)
# Emit the four report lines in one print call; the text is unchanged.
print("\n".join((
    "x:\n%s" % x,
    "L2-Norm of x: %2.4f" % norm_x_2,
    "Normalized x:\n%s" % normalized_x,
    "L2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x),
)))

x:
[ 0.2981295  -0.58365304 -0.334499   -0.04232306 -1.8801449   1.26977739
  1.11841315 -0.9676302  -0.8436      0.12243619]
L2-Norm of x: 2.9333
Normalized x:
[ 0.10163591 -0.19897429 -0.11403471 -0.01442844 -0.64096387  0.43288229
  0.38128041 -0.3298767  -0.28759332  0.04173996]
L2-Norm of normalized_x: 1.0000
