In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import os

from google.colab import drive

# Attach Google Drive to the Colab filesystem, then make the project folder
# the working directory so relative paths used later resolve inside it.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/bigdata/project')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pyspark 
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u282-b08-0ubuntu1~18.04).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# Build the Spark configuration.
# NOTE(review): "spark.project" does not look like a standard Spark property;
# the common Colab snippet sets "spark.ui.port" to "4050" -- confirm intent.
conf = SparkConf().set("spark.project", "4050")

# Obtain the context and session with getOrCreate so that re-running this cell
# reuses the live context instead of failing: only one SparkContext may be
# active per process.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the tab-separated Netflix catalogue; the first line holds the column
# names and Spark infers each column's type from the data.
df = spark.read.csv(
    'netflix_titles.txt',
    sep='\t',
    header=True,
    inferSchema=True,
)

In [None]:
# Print every column name together with its inferred type and nullability.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- show_id: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [None]:
# Preview the first rows of the raw frame (long cell values are truncated).
df.show()

+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+
|_c0| show_id|   type|               title|            director|                cast|             country|date_added|release_year|  rating|duration|           listed_in|         description|
+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+
|  0|81145628|  Movie|Norm of the North...|Richard Finn, Tim...|Alan Marriott, An...|United States, In...|  9-Sep-19|        2019|   TV-PG|  90 min|Children & Family...|Before planning a...|
|  1|80117401|  Movie|Jandino: Whatever...|                null|    Jandino Asporaat|      United Kingdom|  9-Sep-16|        2016|   TV-MA|  94 min|     Stand-Up Comedy|"Jandino Asporaat...|
|  2|70234439|TV Show|  Transformers Prime|  

In [None]:
# Per-column count of missing values, computed on a pandas copy of the Spark
# frame. `c` is built here so the cell runs on a fresh kernel.
c = df.toPandas()
c.isnull().sum()

_c0                0
show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64

In [None]:
# Replace nulls in the text columns that contain them with the placeholder
# "unknown" (one replacement value per listed column).
df = df.na.fill(
    dict.fromkeys(
        ['director', 'cast', 'country', 'date_added', 'rating'],
        'unknown',
    )
)
df.show()

+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+
|_c0| show_id|   type|               title|            director|                cast|             country|date_added|release_year|  rating|duration|           listed_in|         description|
+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+
|  0|81145628|  Movie|Norm of the North...|Richard Finn, Tim...|Alan Marriott, An...|United States, In...|  9-Sep-19|        2019|   TV-PG|  90 min|Children & Family...|Before planning a...|
|  1|80117401|  Movie|Jandino: Whatever...|             unknown|    Jandino Asporaat|      United Kingdom|  9-Sep-16|        2016|   TV-MA|  94 min|     Stand-Up Comedy|"Jandino Asporaat...|
|  2|70234439|TV Show|  Transformers Prime|  

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Split each description into word tokens, using non-word characters ("\W")
# as the delimiter; the token list is stored in the new "words" column.
regexTokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="description",
    outputCol="words",
)
df_word = regexTokenizer.transform(df)
(
    df_word
    .select("description", "words")
    .show(truncate=False)
)


+------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description                                                                                                                                           |words                                                                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Before

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of the token lists; the cleaned lists are written to
# the new "words2" column while "words" is kept as-is.
remover = StopWordsRemover(
    outputCol="words2",
    inputCol="words",
)
df_word = remover.transform(df_word)
df_word.show(truncate=False)


+---+--------+-------+--------------------------------------------------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------+----------+------------+--------+--------+-----------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+
|_c0|show_id |type   |title                                             |director        

In [None]:
# Turn each stop-word-free token list into a 3-dimensional vector ("wordvec").
from pyspark.ml.feature import Word2Vec
# minCount=0 keeps every token in the vocabulary, however rare.
# A fixed seed is supplied because Word2Vec training is randomised; without it
# the embeddings (and every downstream cluster) change from run to run.
word2Vec = Word2Vec(vectorSize=3, minCount=0, seed=42, inputCol="words2", outputCol="wordvec")
word2vec_model = word2Vec.fit(df_word)
df_word = word2vec_model.transform(df_word)
df_word.show()

+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0| show_id|   type|               title|            director|                cast|             country|date_added|release_year|  rating|duration|           listed_in|         description|               words|              words2|             wordvec|
+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|81145628|  Movie|Norm of the North...|Richard Finn, Tim...|Alan Marriott, An...|United States, In...|  9-Sep-19|        2019|   TV-PG|  90 min|Children & Family...|Before planning a...|[before, planning...|[planning, awesom...|[-0.08

In [None]:
from pyspark.ml.feature import StandardScaler

# Rescale every component of "wordvec" to unit standard deviation.
# withMean=False means the values are scaled but not centred.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="wordvec",
    outputCol="scaledfeatures",
)
scalerModel = scaler.fit(df_word)
df_word = scalerModel.transform(df_word)
df_word.show()


+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0| show_id|   type|               title|            director|                cast|             country|date_added|release_year|  rating|duration|           listed_in|         description|               words|              words2|             wordvec|      scaledfeatures|
+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|81145628|  Movie|Norm of the North...|Richard Finn, Tim...|Alan Marriott, An...|United States, In...|  9-Sep-19|        2019|   TV-PG|  90 min|Children & Family...|Before

In [None]:
# Train a k-means model with k=200 clusters on the scaled description vectors.
from pyspark.ml.clustering import KMeans
# The seed fixes the random centroid initialisation so cluster assignments do
# not change between runs.
kmeans = KMeans(featuresCol='scaledfeatures', k=200, seed=42)
kmeans_model = kmeans.fit(df_word)


In [None]:
# Assign each title to a cluster; the fitted model adds a "prediction" column
# holding the cluster index.
df_word = kmeans_model.transform(df_word)
df_word.show()

+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|_c0| show_id|   type|               title|            director|                cast|             country|date_added|release_year|  rating|duration|           listed_in|         description|               words|              words2|             wordvec|      scaledfeatures|prediction|
+---+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  0|81145628|  Movie|Norm of the North...|Richard Finn, Tim...|Alan Marriott, An...|United States, In...|  9-Sep-19|        2019|   TV-PG|  9

In [None]:
# `desc` is imported in this cell so the sort below works on a fresh kernel.
from pyspark.sql.functions import desc

# Number of titles per cluster, largest clusters first.
df_word.groupBy('prediction').count().sort(desc("count")).show()


+----------+-----+
|prediction|count|
+----------+-----+
|        57|   80|
|        30|   70|
|        49|   69|
|       104|   65|
|        76|   63|
|       174|   62|
|       118|   60|
|        82|   60|
|        24|   60|
|        96|   59|
|       154|   59|
|       178|   59|
|         3|   58|
|        58|   57|
|         7|   56|
|        66|   56|
|       194|   56|
|        36|   56|
|        15|   56|
|       147|   55|
+----------+-----+
only showing top 20 rows



In [None]:
# Drop the unnamed first column "_c0".
# NOTE(review): its values (0, 1, 2, ...) look like a leftover row index -- confirm.
df_word=df_word.drop("_c0")
df_word.show()


+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| show_id|   type|               title|            director|                cast|             country|date_added|release_year|  rating|duration|           listed_in|         description|               words|              words2|             wordvec|      scaledfeatures|prediction|
+--------+-------+--------------------+--------------------+--------------------+--------------------+----------+------------+--------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|81145628|  Movie|Norm of the North...|Richard Finn, Tim...|Alan Marriott, An...|United States, In...|  9-Sep-19|        2019|   TV-PG|  90 min|Children &

In [None]:
# Keep only the title and its stop-word-free tokens for the similarity step.
recomm=df_word.select("title","words2")

In [None]:
# Preview the two-column frame (title, words2).
recomm.show()

+--------------------+--------------------+
|               title|              words2|
+--------------------+--------------------+
|Norm of the North...|[planning, awesom...|
|Jandino: Whatever...|[jandino, asporaa...|
|  Transformers Prime|[help, three, hum...|
|Transformers: Rob...|[prison, ship, cr...|
|        #realityhigh|[nerdy, high, sch...|
|             Apaches|[young, journalis...|
|            Automata|[dystopian, futur...|
|Fabrizio Copano: ...|[fabrizio, copano...|
|        Fire Chasers|[california, 2016...|
|         Good People|[struggling, coup...|
|Joaquín Reyes: Un...|[comedian, celebr...|
|Kidnapping Mr. He...|[beer, magnate, a...|
|Krish Trish and B...|[team, minstrels,...|
|Krish Trish and B...|[artisan, cheated...|
|Krish Trish and B...|[cat, monkey, don...|
|Krish Trish and B...|[three, comic, st...|
|Krish Trish and B...|[cat, monkey, don...|
|Krish Trish and B...|[animal, minstrel...|
|Krish Trish and B...|[consequences, tr...|
|                Love|[man, unsa

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# Term frequencies: hash every token list in "words2" into a count vector "tf".
hashingTF = HashingTF(
    outputCol="tf",
    inputCol="words2",
)
tf = hashingTF.transform(recomm)

# Inverse document frequencies: fit the weights on the whole corpus, then
# apply them to get the TF-IDF vector in "feature".
idf = IDF(
    outputCol="feature",
    inputCol="tf",
).fit(tf)
tfidf = idf.transform(tf)

In [None]:
from pyspark.ml.feature import Normalizer

# Scale every TF-IDF vector to unit length (Normalizer's default p-norm) and
# store the result in "norm".
normalizer = Normalizer(
    outputCol="norm",
    inputCol="feature",
)
data = normalizer.transform(tfidf)

In [None]:
# Show the frame with its tf / feature / norm vector columns, untruncated.
data.show(truncate=False)

+--------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as psf

# UDF returning the dot product of two vectors as a Python float.
dot_udf = psf.udf(lambda u, v: float(u.dot(v)), DoubleType())

# Self-join the frame with itself. The strict "<" on titles keeps each pair of
# different titles once and never pairs a title with itself. The score is the
# dot product of the two "norm" vectors.
sim_recomm = (
    data.alias("i")
    .join(data.alias("j"), psf.col("i.title") < psf.col("j.title"))
    .select(
        psf.col("i.title").alias("i"),
        psf.col("j.title").alias("j"),
        dot_udf("i.norm", "j.norm").alias("similar_score"),
    )
    .sort("i", "j")
)
sim_recomm.show(truncate=False)

+---------------------------------------------------+------------------------------------------------+--------------------+
|i                                                  |j                                               |similar_score       |
+---------------------------------------------------+------------------------------------------------+--------------------+
|"Behind ""The Cove"": The Quiet Japanese Speak Out"|"Escape from the ""Liberty"" Cinema"            |0.0                 |
|"Behind ""The Cove"": The Quiet Japanese Speak Out"|"Gabriel ""Fluffy"" Iglesias: One Show Fits All"|0.0                 |
|"Behind ""The Cove"": The Quiet Japanese Speak Out"|#Roxy                                           |0.0                 |
|"Behind ""The Cove"": The Quiet Japanese Speak Out"|#Rucker50                                       |0.021255304712596564|
|"Behind ""The Cove"": The Quiet Japanese Speak Out"|#Selfie                                         |0.0                 |
|"Behind

In [None]:
# Five highest-scoring partners for one title. `psf.desc` comes from the `psf`
# alias imported with the similarity join, so no extra import is needed.
# NOTE(review): pairs are stored only with i < j, so titles that sort before
# "#realityhigh" appear in column j and are not searched by this filter.
sim_recomm.where(sim_recomm.i=="#realityhigh").sort(psf.desc("similar_score")).show(truncate=False,n=5)

+------------+-------------------------------+-------------------+
|i           |j                              |similar_score      |
+------------+-------------------------------+-------------------+
|#realityhigh|Follow Me                      |0.1464723723783308 |
|#realityhigh|Deadcon                        |0.12766177965085998|
|#realityhigh|Servant of the People          |0.12751544239467974|
|#realityhigh|How to Sell Drugs Online (Fast)|0.12657008519661497|
|#realityhigh|The World Is Yours             |0.12637306359157488|
+------------+-------------------------------+-------------------+
only showing top 5 rows



In [None]:
# Total number of title pairs produced by the self-join.
sim_recomm.count()

19428194

In [None]:
from pyspark.sql.functions import monotonically_increasing_id

# Tag every pair with a unique, increasing id.
# NOTE: Spark guarantees these ids are unique and increasing, not consecutive;
# values can jump between partitions.
sim_recomm = sim_recomm.withColumn("id", monotonically_increasing_id())

sim_recomm.show()

+--------------------+--------------------+--------------------+---+
|                   i|                   j|       similar_score| id|
+--------------------+--------------------+--------------------+---+
|"Behind ""The Cov...|"Escape from the ...|                 0.0|  0|
|"Behind ""The Cov...|"Gabriel ""Fluffy...|                 0.0|  1|
|"Behind ""The Cov...|               #Roxy|                 0.0|  2|
|"Behind ""The Cov...|           #Rucker50|0.021255304712596564|  3|
|"Behind ""The Cov...|             #Selfie|                 0.0|  4|
|"Behind ""The Cov...|          #Selfie 69|                 0.0|  5|
|"Behind ""The Cov...|        #realityhigh|                 0.0|  6|
|"Behind ""The Cov...|                 '89|0.031379366621137356|  7|
|"Behind ""The Cov...|            (T)ERROR|                 0.0|  8|
|"Behind ""The Cov...|    1 Chance 2 Dance|                 0.0|  9|
|"Behind ""The Cov...|       1 Mile to You|                 0.0| 10|
|"Behind ""The Cov...|            

In [None]:
from pyspark.sql.functions import col

# Keep only the pairs whose id is at most 10000 as a small working sample.
df1 = sim_recomm.filter(col("id") <= 10000)
df1.show()

+--------------------+--------------------+--------------------+---+
|                   i|                   j|       similar_score| id|
+--------------------+--------------------+--------------------+---+
|"Behind ""The Cov...|"Escape from the ...|                 0.0|  0|
|"Behind ""The Cov...|"Gabriel ""Fluffy...|                 0.0|  1|
|"Behind ""The Cov...|               #Roxy|                 0.0|  2|
|"Behind ""The Cov...|           #Rucker50|0.021255304712596564|  3|
|"Behind ""The Cov...|             #Selfie|                 0.0|  4|
|"Behind ""The Cov...|          #Selfie 69|                 0.0|  5|
|"Behind ""The Cov...|        #realityhigh|                 0.0|  6|
|"Behind ""The Cov...|                 '89|0.031379366621137356|  7|
|"Behind ""The Cov...|            (T)ERROR|                 0.0|  8|
|"Behind ""The Cov...|    1 Chance 2 Dance|                 0.0|  9|
|"Behind ""The Cov...|       1 Mile to You|                 0.0| 10|
|"Behind ""The Cov...|            

In [None]:
#sim_recomm.createOrReplaceTempView("dfTable")

#spark.sql("SELECT * FROM dfTable WHERE id between 0 and 10000").show()

In [None]:
# Number of rows kept in the sample.
df1.count()

10001

In [None]:
# Export the sample to CSV. index=False stops pandas from writing its row
# index, which would otherwise come back as a spurious "Unnamed: 0" column
# when the file is read again.
df1.toPandas().to_csv('test1.csv', index=False)

In [None]:
# Load the exported sample of title pairs back into a pandas DataFrame.
recomm_test=pd.read_csv("test1.csv")

In [None]:
# Distinct titles that appear on the "i" side of the sampled pairs.
recomm_test.i.unique()

array(['"Behind ""The Cove"": The Quiet Japanese Speak Out"',
       '"Escape from the ""Liberty"" Cinema"'], dtype=object)

In [None]:
def recommdation(movie, data=None, top_n=5):
    """Return the highest-scoring similar titles for ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up. It is compared for exact equality against column
        ``i``, so it must be spelled exactly as stored there.
    data : pandas.DataFrame, optional
        Frame with at least the columns ``i`` and ``similar_score``.
        Defaults to the notebook-level ``recomm_test`` frame.
    top_n : int, optional
        Maximum number of rows to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Rows whose ``i`` equals ``movie``, ordered by ``similar_score``
        descending and limited to ``top_n`` rows. Empty if nothing matches.
    """
    title = str(movie)
    if data is None:
        data = recomm_test
    matches = data.loc[data['i'] == title]
    # Sort the filtered rows (not the whole frame) so that only pairs
    # belonging to the requested title can be returned.
    return matches.sort_values(by=['similar_score'], ascending=False).head(top_n)

In [None]:
# Example lookup. The title is written exactly as it is stored in column "i",
# including the surrounding and doubled quote characters.
recommdation('"Behind ""The Cove"": The Quiet Japanese Speak Out"')

Unnamed: 0.1,Unnamed: 0,i,j,similar_score,id
9870,9870,"""Escape from the """"Liberty"""" Cinema""",Nowhere Boy,0.151273,9870
2212,2212,"""Behind """"The Cove"""": The Quiet Japanese Speak...",Hot Girls Wanted,0.143513,2212
5939,5939,"""Behind """"The Cove"""": The Quiet Japanese Speak...",Virunga: Gorillas in Peril,0.138996,5939
363,363,"""Behind """"The Cove"""": The Quiet Japanese Speak...",An American in Madras,0.137852,363
5869,5869,"""Behind """"The Cove"""": The Quiet Japanese Speak...",Underdogs,0.135782,5869
