In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType

from langdetect import detect


In [52]:
spark = SparkSession.builder \
    .appName("EntityResolution") \
    .getOrCreate()

In [53]:
import pandas as pd
df = pd.read_csv('data/movie_name_data.csv')

In [54]:
movie_df = spark.read.csv("data/movie_name_data.csv", header=True, inferSchema=True, multiLine=True)
movie_df = spark.createDataFrame(df)
movie_df.show()

+---------+--------------------+--------------------+--------------------+-----+---------------+
|  imdb_id|               title|       plot_synopsis|                tags|split|synopsis_source|
+---------+--------------------+--------------------+--------------------+-----+---------------+
|tt0057603|I tre volti della...|Note: this synops...|cult, horror, got...|train|           imdb|
|tt1733125|Dungeons & Dragon...|Two thousand year...|            violence|train|           imdb|
|tt0033045|The Shop Around t...|Matuschek's, a gi...|            romantic| test|           imdb|
|tt0113862|  Mr. Holland's Opus|Glenn Holland, no...|inspiring, romant...|train|           imdb|
|tt0086250|            Scarface|In May 1980, a Cu...|cruelty, murder, ...|  val|           imdb|
|tt1315981|        A Single Man|George Falconer (...|romantic, queer, ...|  val|           imdb|
|tt0249380|           Baise-moi|Baise-moi tells t...|gothic, cruelty, ...|train|      wikipedia|
|tt0408790|          Flightpla

24/10/22 01:47:04 WARN TaskSetManager: Stage 155 contains a task of very large size (7196 KiB). The maximum recommended task size is 1000 KiB.


In [65]:
movie_df.printSchema()
print("\nTotal Number of movies = ", movie_df.count())

root
 |-- imdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- plot_synopsis: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- split: string (nullable = true)
 |-- synopsis_source: string (nullable = true)



24/10/22 01:53:38 WARN TaskSetManager: Stage 209 contains a task of very large size (7196 KiB). The maximum recommended task size is 1000 KiB.



Total Number of movies =  14828


In [56]:
def is_english(title):
    try:
        return detect(title) == "en"
    except:
        return False

is_english_udf = udf(is_english, BooleanType())

In [66]:
english_movies_df = movie_df.filter(is_english_udf(col("title")))

english_movies_df.show()
print("\nTotal Number of English movies = ", english_movies_df.count())

24/10/22 01:54:26 WARN TaskSetManager: Stage 212 contains a task of very large size (7196 KiB). The maximum recommended task size is 1000 KiB.
24/10/22 01:54:31 WARN TaskSetManager: Stage 213 contains a task of very large size (7196 KiB). The maximum recommended task size is 1000 KiB.


+---------+--------------------+--------------------+--------------------+-----+---------------+
|  imdb_id|               title|       plot_synopsis|                tags|split|synopsis_source|
+---------+--------------------+--------------------+--------------------+-----+---------------+
|tt1733125|Dungeons & Dragon...|Two thousand year...|            violence|train|           imdb|
|tt0033045|The Shop Around t...|Matuschek's, a gi...|            romantic| test|           imdb|
|tt0113862|  Mr. Holland's Opus|Glenn Holland, no...|inspiring, romant...|train|           imdb|
|tt0408790|          Flightplan|Kyle Pratt (Jodie...|mystery, suspense...|train|           imdb|
|tt0078908|           The Brood|At the Somafree I...|cult, psychedelic...|train|           imdb|
|tt0795493|   Cassandra's Dream|Brothers Terry (C...|tragedy, dramatic...|train|      wikipedia|
|tt0093389|    The Last Emperor|Arrival.\nA train...|              murder|train|           imdb|
|tt0120899|      My Life So Fa




Total Number of English movies =  8173


                                                                                

In [75]:
pandas_df = english_movies_df.toPandas()
pandas_df.to_csv('data/english_movies.csv', index=False)

24/10/22 02:26:44 WARN TaskSetManager: Stage 218 contains a task of very large size (7196 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [77]:
df2 = pd.read_csv('data/english_movies.csv')
df2

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
1,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
2,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
3,tt0408790,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"mystery, suspenseful, action, murder, flashback",train,imdb
4,tt0078908,The Brood,"At the Somafree Institute, Dr. Hal Raglan humi...","cult, psychedelic, murder, violence",train,imdb
...,...,...,...,...,...,...
8140,tt0074646,Hot Potato,"Hot Potato begins in Chang Lan, a fictional co...",blaxploitation,train,wikipedia
8141,tt0025601,One Night of Love,Opera singer Mary Barrett (Grace Moore) leaves...,romantic,val,wikipedia
8142,tt0063443,Play Dirty,During the North African Campaign in World War...,anti war,train,wikipedia
8143,tt0039464,High Wall,Steven Kenet catches his unfaithful wife in th...,murder,test,wikipedia
