In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# create the session
conf = SparkConf().set("spark.project", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
df = spark.read.csv('data/netflix_titles.txt',inferSchema =True, header=True,sep='\t')
#tokenize split the words 
regexTokenizer = RegexTokenizer(inputCol="description", outputCol="words", pattern="\\W")
df_word = regexTokenizer.transform(df)
# df_word.select("description", "words").show(truncate=False)
# delete stop words 
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="words2")
df_word=remover.transform(df_word)
# df_word.show(truncate=False)
# words into vector
from pyspark.ml.feature import Word2Vec
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words2", outputCol="wordvec")
word2vec_model = word2Vec.fit(df_word)
df_word = word2vec_model.transform(df_word)
# df_word.show()
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="wordvec", outputCol="scaledfeatures", withStd=True, withMean=False)
scalerModel = scaler.fit(df_word)
df_word = scalerModel.transform(df_word)
# df_word.show()
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="words2", outputCol="tf")
tf = hashingTF.transform(df_word)

idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
tfidf = idf.transform(tf)

from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="feature", outputCol="norm")
data = normalizer.transform(tfidf)
df_rec = data.toPandas()
sc.stop()

In [None]:
def rec_by_content(name, df_rec, k=5):
    """Return the ``k`` rows of ``df_rec`` most similar to the title ``name``.

    Similarity is the dot product between the ``norm`` vector of the row whose
    ``title`` equals ``name`` (first match) and the ``norm`` vector of every
    row. The row for ``name`` itself is scored like any other row, so it is
    normally part of the result.

    Parameters
    ----------
    name : str
        Title to look up in ``df_rec['title']``.
    df_rec : pandas.DataFrame
        Must contain a ``title`` column and a ``norm`` column whose values
        expose a ``.dot(other)`` method.
    k : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_rec`` with an added ``sim`` column, sorted by ``sim`` in
        descending order and truncated to ``k`` rows. ``df_rec`` itself is
        left unmodified.

    Raises
    ------
    ValueError
        If ``name`` does not occur in ``df_rec['title']``.
    """
    titles = df_rec['title'].tolist()
    try:
        pos = titles.index(name)
    except ValueError:
        raise ValueError("title %r not found in df_rec" % (name,))

    norms = df_rec['norm']
    # `pos` is a position, not an index label, so use .iloc: label lookup
    # breaks (or picks the wrong row) when the frame's index is not 0..n-1.
    query = norms.iloc[pos]
    sims = [query.dot(vec) for vec in norms]

    # assign() works on a copy, so the caller's frame is not mutated.
    return df_rec.assign(sim=sims).sort_values('sim', ascending=False).head(k)

In [None]:
# Take the last 20 titles as sample shows, then display the top-5
# recommendations for the first of them.
sample_show = df_rec["title"].values[-20:]
rec_by_content(sample_show[0], df_rec, k=5)

Unnamed: 0,_c0,show_id,type,title,director,cast,country,date_added,release_year,rating,...,listed_in,description,words,words2,wordvec,scaledfeatures,tf,feature,norm,sim
6214,6214,80049872,TV Show,Chelsea,,,United States,14-Apr-17,2017,TV-MA,...,"Stand-Up Comedy & Talk Shows, TV Comedies","It's not her first talk show, but it is a firs...","[it, s, not, her, first, talk, show, but, it, ...","[first, talk, show, first, kind, ideas, people...","[-0.04827464444679208, 0.06424329108135267, 0....","[-1.1794870457388804, 2.0440227528859287, 1.43...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
4966,4966,81006335,Movie,Super Monsters Back to School,,"Elyse Maloway, Vincent Tong, Erin Matthews, An...",Canada,16-Aug-19,2019,TV-Y,...,Children & Family Movies,The Super Monsters welcome Vida to her new hom...,"[the, super, monsters, welcome, vida, to, her,...","[super, monsters, welcome, vida, new, home, pi...","[-0.14791403313477833, 0.07150058100620905, 0....","[-3.6139610755238993, 2.2749272641742992, 2.33...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.183033
1668,1668,80134781,Movie,Hasan Minhaj: Homecoming King,Christopher Storer,Hasan Minhaj,,23-May-17,2017,TV-MA,...,Stand-Up Comedy,"""Comic Hasan Minhaj of """"The Daily Show"""" shar...","[comic, hasan, minhaj, of, the, daily, show, s...","[comic, hasan, minhaj, daily, show, shares, pe...","[-0.09433003017329611, 0.08292912179604173, 0....","[-2.3047512806892136, 2.638548072098373, 0.775...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.153014
874,874,70140322,Movie,Kevin Hart: Seriously Funny,Shannon Hartman,Kevin Hart,United States,1-Oct-18,2010,TV-MA,...,Stand-Up Comedy,"With his unique hip-hop style delivery, Africa...","[with, his, unique, hip, hop, style, delivery,...","[unique, hip, hop, style, delivery, african, a...","[-0.00423615169711411, 0.06274714780738577, 0....","[-0.10350125014463757, 1.9964201029891018, 0.5...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.149077
4402,4402,80092839,Movie,Fireplace 4K: Classic Crackling Fireplace from...,George Ford,,,21-Dec-15,2015,,...,Movies,"The first of its kind in UHD 4K, with the clea...","[the, first, of, its, kind, in, uhd, 4k, with,...","[first, kind, uhd, 4k, clearest, picture, avai...","[-0.055917355604469775, 0.03856590427458286, 0...","[-1.36622024508415, 1.2270477507611568, 0.6710...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.130335
