In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session, or create one named "IMDBtext"
spark = SparkSession.builder.appName("IMDBtext").getOrCreate()

# Read the combined movies CSV (a single file): header row, inferred column types,
# double-quote as both quote and escape character, and multi-line records enabled.
movies_csv_path = "/content/drive/MyDrive/CS5344_FP/processed_movies_combined_final.csv"
csv_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("quote", '"')
    .option("escape", '"')
    .option("multiLine", True)
)
df_spark = csv_reader.csv(movies_csv_path)

df_spark.show()

+------+--------------------+----------+------------+----------+----------+-------+---------+------------+-------------+--------------------+--------------------+--------------------+--------------------+
|    id|               title|popularity|vote_average|vote_count|   revenue|runtime|   budget|release_year|release_month|original_language_en|         poster_path|      processed_text|     poster_features|
+------+--------------------+----------+------------+----------+----------+-------+---------+------------+-------------+--------------------+--------------------+--------------------+--------------------+
| 27205|           Inception|    83.952|       8.364|     34495| 825532764|    148|160000000|        2010|            7|                   1|https://image.tmd...|\cobb skilled thi...|-0.11434663087129...|
|157336|        Interstellar|   140.241|       8.417|     32571| 701729206|    169|165000000|        2014|           11|                   1|https://image.tmd...|adventures group .

In [3]:
from pyspark.sql.functions import lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import concat_ws, split, coalesce, lit

# Matches every character that is neither an ASCII letter nor whitespace.
# Written as a raw string: in a plain literal "\s" is an invalid escape sequence,
# which emits a SyntaxWarning on Python 3.12+ and is slated to become an error.
NON_LETTER_PATTERN = r"[^a-zA-Z\s]"

# Replace null values with empty strings first so the later steps never see nulls
df_spark = df_spark.withColumn("processed_text_filledna", coalesce(df_spark["processed_text"], lit("")))
df_spark = df_spark.withColumn("title_filledna", coalesce(df_spark["title"], lit("")))

# Remove special characters, then convert the remaining text to lowercase
df_spark = df_spark.withColumn("title_clean", lower(regexp_replace(df_spark["title_filledna"], NON_LETTER_PATTERN, "")))
df_spark = df_spark.withColumn("processed_text_clean", lower(regexp_replace(df_spark["processed_text_filledna"], NON_LETTER_PATTERN, "")))

# Tokenize the cleaned text columns (processed text first, then title,
# which fixes the order in which the token columns are appended)
tokenizer = Tokenizer(inputCol="processed_text_clean", outputCol="processed_text_tokens")
df_spark = tokenizer.transform(df_spark)

tokenizer = Tokenizer(inputCol="title_clean", outputCol="title_tokens")
df_spark = tokenizer.transform(df_spark)

df_spark.select("title_tokens", "processed_text_tokens").show(5, truncate=False)

+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title_tokens       |processed_text_tokens                                                                                                                                                                                                                                                                                                                                            

In [4]:
from pyspark.ml.feature import Word2Vec

# The SparkSession created earlier in the notebook is reused here: calling
# getOrCreate() again would only return that same running session, so the second
# builder call was redundant and has been removed.

# Train a 100-dimensional Word2Vec model on the title tokens (words seen fewer than
# 2 times are ignored). The seed is pinned explicitly so the run does not depend on
# the library's default seed.
word2vec = Word2Vec(vectorSize=100, minCount=2, seed=42, inputCol="title_tokens", outputCol="title_vec")
model = word2vec.fit(df_spark)

# Append the "title_vec" embedding column
df_spark = model.transform(df_spark)

In [5]:
# Train a 50-dimensional Word2Vec model on the processed-text tokens (words seen fewer
# than 2 times are ignored). The seed is pinned explicitly so the run does not depend
# on the library's default seed.
word2vec = Word2Vec(vectorSize=50, minCount=2, seed=42, inputCol="processed_text_tokens", outputCol="processed_text_vec")
model = word2vec.fit(df_spark)

# Append the "processed_text_vec" embedding column
df_spark = model.transform(df_spark)

In [6]:
# Preview the first 5 rows of both embedding columns without truncating the cells
embedding_preview = df_spark.select("title_vec", "processed_text_vec")
embedding_preview.show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
from pyspark.sql.functions import concat, lit, col

# Overwrite each token column with a bracketed, comma-separated string of its
# elements, i.e. the format [element1, element2] (elements are not quoted).
for token_col in ("title_tokens", "processed_text_tokens"):
    joined_tokens = concat_ws(", ", token_col)
    df_spark = df_spark.withColumn(token_col, concat(lit("["), joined_tokens, lit("]")))

# Preview the first 5 converted rows
df_spark.select("title_tokens", "processed_text_tokens").show(5, truncate=False)

+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title_tokens       |processed_text_tokens                                                                                                                                                                                                                                                                                                                                            

In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import numpy as np

# Function to convert the vector to a string
def vector_to_str(vec):
    """Render a numeric vector as a bracketed string with 3-decimal components.

    Args:
        vec: Iterable of numbers, or None.

    Returns:
        A string such as "[0.123, -4.500]"; "[]" for an empty vector; None when
        ``vec`` is None, so a null input stays null instead of raising TypeError.
    """
    if vec is None:
        return None
    return "[" + ", ".join(f"{v:.3f}" for v in vec) + "]"  # Format to 3 decimal places

# Wrap the formatter as a Spark UDF whose return type is a string
vector_to_str_udf = udf(vector_to_str, StringType())

# Add a string rendering alongside each embedding column
title_vec_as_str = vector_to_str_udf(df_spark["title_vec"])
processed_text_vec_as_str = vector_to_str_udf(df_spark["processed_text_vec"])
df_spark = (
    df_spark
    .withColumn("title_vec_str", title_vec_as_str)
    .withColumn("processed_text_vec_str", processed_text_vec_as_str)
)

# Preview the first 5 rows of the new string columns
df_spark.select("title_vec_str", "processed_text_vec_str").show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Remove the intermediate null-filled and cleaned text columns
intermediate_cols = ["processed_text_filledna", "title_filledna", "processed_text_clean", "title_clean"]
df_final = df_spark.drop(*intermediate_cols)

df_final.show(truncate=False)

+------+-------------------------------------------------+----------+------------+----------+----------+-------+---------+------------+-------------+--------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Leave out the raw vector columns; their string renderings stay in the frame
vector_cols = ("title_vec", "processed_text_vec")
df_final_save = df_final.drop(*vector_cols)

# List the columns that remain in the frame to be saved
print("Columns before saving:", df_final_save.columns)

Columns before saving: ['id', 'title', 'popularity', 'vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'release_year', 'release_month', 'original_language_en', 'poster_path', 'processed_text', 'poster_features', 'processed_text_tokens', 'title_tokens', 'title_vec_str', 'processed_text_vec_str']


In [11]:
# Coalesce to one partition so Spark emits a single part file. The path below is an
# output *directory* (Spark writes the part file inside it), not a single CSV file.
# quote/escape mirror the options used when the input CSV was read in this notebook,
# so embedded double quotes are written doubled ("") instead of with Spark's default
# backslash escape, and the output can be read back with the same reader options.
df_final_save.coalesce(1).write.csv("/content/drive/MyDrive/CS5344_FP/processed_movies/processed_movies_combined_imageNtext_final",
                               header=True, mode="overwrite", quote='"', escape='"')