In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

print("Checking Spark environment...")
print(f"Spark Version: {spark.version}")
print("✓ Spark is active and ready on Databricks Serverless!")
print("="*50)

try:
    df = spark.read.csv("/Volumes/workspace/default/netflix/netflix_clean_dataset_kshitij_.csv", 
                        header=True, 
                        inferSchema=True)
    print("✓ Dataset loaded successfully")
    print(f"Rows: {df.count()}, Columns: {len(df.columns)}")
except Exception as e:
    print(f"Error loading data: {e}")
    raise

Checking Spark environment...
Spark Version: 4.0.0
✓ Spark is active and ready on Databricks Serverless!
✓ Dataset loaded successfully
Rows: 4103, Columns: 16


In [0]:
df.printSchema()
display(df.limit(5))


root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)
 |-- duration_int: integer (nullable = true)
 |-- duration_type: string (nullable = true)
 |-- added_year: integer (nullable = true)
 |-- added_month: integer (nullable = true)



show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_int,duration_type,added_year,added_month
s579,movie,Memoirs of a Geisha,Rob Marshall,"Zhang Ziyi, Ken Watanabe, Koji Yakusho, Michelle Yeoh, Kaori Momoi, Youki Kudoh, Gong Li, Cary-Hiroyuki Tagawa, Mako Iwamatsu, Navia Nguyen, Karl Yune","France, Japan, United States","July 1, 2021",2005,PG-13,145 min,"Dramas, Romantic Movies","Sold to a geisha house as a child, a fisherman’s daughter becomes the most sought-after geisha in Kyoto, but rivalries threaten the destiny she desires.",145,minutes,2021,7
s2322,movie,George Lopez: We'll Do It For Half,Troy Miller,George Lopez,United States,"June 30, 2020",2020,TV-MA,52 min,Stand-Up Comedy,"Comedian George Lopez tackles the future and the past of Latinx culture in America, touching on immigration, his tough relatives, aging and much more.",52,minutes,2020,6
s275,tv show,Grace and Frankie,Unknown,"Jane Fonda, Lily Tomlin, Martin Sheen, Sam Waterston, June Diane Raphael, Brooklyn Decker, Ethan Embry, Baron Vaughn",United States,"August 13, 2021",2021,TV-MA,7 Seasons,"TV Comedies, TV Dramas","They’re not friends, but when their husbands leave them for each other, proper Grace and eccentric Frankie begin to bond in this Emmy-nominated series.",7,seasons,2021,8
s350,movie,Seabiscuit,Gary Ross,"Tobey Maguire, Jeff Bridges, Chris Cooper, Elizabeth Banks, Gary Stevens, William H. Macy, David McCullough, Kingston DuCoeur, Eddie Jones, Ed Lauter, Michael O'Neill, Michael Angarano, Royce D. Applegate, Annie Corley, Valerie Mahaffey",United States,"August 1, 2021",2003,PG-13,141 min,"Dramas, Sports Movies",An ungainly-looking colt becomes a winning thoroughbred in this Depression-era drama based on the true story of champion racehorse Seabiscuit.,141,minutes,2021,8
s1775,movie,Hidden in Plain Sight,Stacia Crawford,"Victoria Barabas, Gino Anthony Pesi, Jake Allyn, Deborah Van Valkenburgh, Jessica Meraz, Jack Fisher, Eve Sigall, Jerod Meagher",United States,"October 30, 2020",2019,TV-14,87 min,"Dramas, Independent Movies, Thrillers",A woman stages her own suicide but still lives in fear of her abusive ex-boyfriend tracking her down and stealing the son he never knew he had.,87,minutes,2020,10


In [0]:
string_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, StringType)]
numeric_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, (IntegerType, LongType, DoubleType, FloatType))]

print(f"String columns: {string_cols}")
print(f"Numeric columns: {numeric_cols}")

String columns: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'rating', 'duration', 'listed_in', 'description', 'duration_type']
Numeric columns: ['release_year', 'duration_int', 'added_year', 'added_month']


In [0]:

categorical_cols_to_fill = ['type', 'rating', 'duration_type', 'country', 'director', 'cast', 'listed_in']
numerical_cols_to_fill = ['release_year', 'duration_int', 'added_year', 'added_month']


In [0]:
for col_name in categorical_cols_to_fill:
    if col_name in df.columns and col_name in string_cols:
        df = df.withColumn(col_name, when(col(col_name).isNull(), "Unknown").otherwise(col(col_name)))


In [0]:
for col_name in numerical_cols_to_fill:
    if col_name in df.columns and col_name in numeric_cols:
        median_val = df.approxQuantile(col_name, [0.5], 0.01)[0] if df.filter(col(col_name).isNotNull()).count() > 0 else 0
        df = df.withColumn(col_name, when(col(col_name).isNull(), median_val).otherwise(col(col_name)))

print("✓ Null values handled")


✓ Null values handled


In [0]:
print("\n" + "="*50)
print("LABEL ENCODING")
print("="*50)

label_encode_cols = ['type', 'rating', 'duration_type']
label_encode_cols = [col_name for col_name in label_encode_cols if col_name in df.columns]

if label_encode_cols:
    for col_name in label_encode_cols:
        window = Window.orderBy(col(col_name))
        df = df.withColumn(f"{col_name}_label", dense_rank().over(window) - 1)
    
    print(f"✓ Label encoding applied to: {label_encode_cols}")
    display(df.select(label_encode_cols + [f"{c}_label" for c in label_encode_cols]).limit(10))
else:
    print("⚠ No columns found for label encoding")



LABEL ENCODING
✓ Label encoding applied to: ['type', 'rating', 'duration_type']




type,rating,duration_type,type_label,rating_label,duration_type_label
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0
movie,G,minutes,0,0,0


In [0]:

print("\n" + "="*50)
print("ONE-HOT ENCODING")
print("="*50)

onehot_cols = ['country']
onehot_cols = [col_name for col_name in onehot_cols if col_name in df.columns and col_name in string_cols]

if onehot_cols:
    for col_name in onehot_cols:
        unique_vals = [row[0] for row in df.select(col_name).distinct().limit(50).collect()]
        
        for val in unique_vals:
            safe_val = str(val).replace(" ", "_").replace("-", "_").replace(",", "")[:30]
            df = df.withColumn(
                f"{col_name}_onehot_{safe_val}", 
                when(col(col_name) == val, 1).otherwise(0)
            )
        
        print(f"✓ One-hot encoding applied to: {col_name} ({len(unique_vals)} categories)")
    
    onehot_sample_cols = [c for c in df.columns if any(x in c for x in onehot_cols) and 'onehot' in c][:5]
    display(df.select(onehot_cols + onehot_sample_cols).limit(5))
else:
    print("⚠ No columns found for one-hot encoding")



ONE-HOT ENCODING




✓ One-hot encoding applied to: country (50 categories)


country,country_onehot_South_Korea,country_onehot_France_Belgium_Luxembourg_Roma,country_onehot_Philippines_Qatar,country_onehot_France_Belgium_Italy,country_onehot_United_Kingdom_Spain
United States,0,0,0,0,0
United States,0,0,0,0,0
United States,0,0,0,0,0
"France, Japan, United States",0,0,0,0,0
United States,0,0,0,0,0


In [0]:
print("\n" + "="*50)
print("FREQUENCY ENCODING")
print("="*50)

freq_encode_cols = ['director', 'cast', 'listed_in']
freq_encode_cols = [col_name for col_name in freq_encode_cols if col_name in df.columns]

if freq_encode_cols:
    for col_name in freq_encode_cols:
        freq_df = df.groupBy(col_name).agg(count("*").alias("count"))
        total_count = df.count()
        
        freq_df = freq_df.withColumn(f"{col_name}_freq", col("count") / total_count)
        
        df = df.join(
            freq_df.select(col_name, f"{col_name}_freq"), 
            on=col_name, 
            how="left"
        )
        
        print(f"✓ Frequency encoding applied to: {col_name}")
        display(df.select(col_name, f"{col_name}_freq").distinct().orderBy(col(f"{col_name}_freq").desc()).limit(5))
else:
    print("⚠ No columns found for frequency encoding")


FREQUENCY ENCODING




✓ Frequency encoding applied to: director


director,director_freq
Unknown,0.3022178893492566
Rajiv Chilaka,0.0021935169388252
Marcus Raboy,0.0019497928345113
"Raúl Campos, Jan Suter",0.0019497928345113
Steven Spielberg,0.0019497928345113


✓ Frequency encoding applied to: cast


cast,cast_freq
Unknown,0.0957835729953692
David Attenborough,0.0019497928345113
"Vatsal Dubey, Julie Tejwani, Rupa Bhimani, Jigna Bhardwaj, Rajesh Kava, Mousam, Swapnil",0.0017060687301974
"David Spade, London Hughes, Fortune Feimster",0.0012186205215695
Iliza Shlesinger,0.0009748964172556666


✓ Frequency encoding applied to: listed_in


listed_in,listed_in_freq
Documentaries,0.0402144772117962
Stand-Up Comedy,0.0389958566902266
"Dramas, International Movies",0.037777236168657
"Comedies, Dramas, International Movies",0.0329027540823787
"Dramas, Independent Movies, International Movies",0.0272970996831586


In [0]:
print("\n" + "="*50)
print("MIN-MAX NORMALIZATION")
print("="*50)

numerical_cols = ['release_year', 'duration_int', 'added_year']
numerical_cols = [col_name for col_name in numerical_cols if col_name in df.columns]

if numerical_cols:
    for col_name in numerical_cols:
        median_val = df.approxQuantile(col_name, [0.5], 0.01)[0]
        df = df.withColumn(col_name, when(col(col_name).isNull(), median_val).otherwise(col(col_name)))
    
    for col_name in numerical_cols:
        min_max = df.agg(min(col(col_name)).alias("min"), max(col(col_name)).alias("max")).collect()[0]
        min_val = min_max["min"]
        max_val = min_max["max"]
        
        if max_val != min_val:
            df = df.withColumn(
                f"{col_name}_normalized",
                (col(col_name) - min_val) / (max_val - min_val)
            )
        else:
            df = df.withColumn(f"{col_name}_normalized", lit(0.5))
    
    print(f"✓ Normalization applied to: {numerical_cols}")
    norm_cols = [f"{c}_normalized" for c in numerical_cols]
    display(df.select(numerical_cols + norm_cols).limit(10))
else:
    print("⚠ No numerical columns found for normalization")


MIN-MAX NORMALIZATION




✓ Normalization applied to: ['release_year', 'duration_int', 'added_year']


release_year,duration_int,added_year,release_year_normalized,duration_int_normalized,added_year_normalized
2005.0,145.0,2021.0,0.8333333333333334,0.6101694915254238,1.0
2020.0,52.0,2020.0,0.9895833333333334,0.2161016949152542,0.9230769230769232
2003.0,141.0,2021.0,0.8125,0.5932203389830508,1.0
2019.0,87.0,2020.0,0.9791666666666666,0.3644067796610169,0.9230769230769232
2020.0,104.0,2020.0,0.9895833333333334,0.4364406779661017,0.9230769230769232
1991.0,105.0,2021.0,0.6875,0.4406779661016949,1.0
2005.0,87.0,2020.0,0.8333333333333334,0.3644067796610169,0.9230769230769232
1979.0,237.0,2020.0,0.5625,1.0,0.9230769230769232
1997.0,125.0,2021.0,0.75,0.5254237288135594,1.0
2021.0,83.0,2021.0,1.0,0.3474576271186441,1.0


In [0]:
print("\n" + "="*50)
print("STANDARDIZATION (Z-SCORE)")
print("="*50)

if numerical_cols:
    for col_name in numerical_cols:
        stats = df.agg(
            mean(col(col_name)).alias("mean"),
            stddev(col(col_name)).alias("stddev")
        ).collect()[0]
        
        mean_val = stats["mean"]
        stddev_val = stats["stddev"]
        
        if stddev_val and stddev_val != 0:
            df = df.withColumn(
                f"{col_name}_standardized",
                (col(col_name) - mean_val) / stddev_val
            )
        else:
            df = df.withColumn(f"{col_name}_standardized", lit(0.0))
    
    print(f"✓ Standardization applied to: {numerical_cols}")
    std_cols = [f"{c}_standardized" for c in numerical_cols]
    display(df.select(numerical_cols + std_cols).limit(10))
else:
    print("⚠ Standardization skipped")



STANDARDIZATION (Z-SCORE)




✓ Standardization applied to: ['release_year', 'duration_int', 'added_year']


release_year,duration_int,added_year,release_year_standardized,duration_int_standardized,added_year_standardized
2005.0,145.0,2021.0,-1.0342800820729392,1.4736807878731353,1.3435264790132986
2020.0,52.0,2020.0,0.6523410369005618,-0.3428511973828509,0.716180497063682
2003.0,141.0,2021.0,-1.2591628979360725,1.395550379905136,1.3435264790132986
2019.0,87.0,2020.0,0.5398996289689951,0.3407898723371438,0.716180497063682
2020.0,104.0,2020.0,0.6523410369005618,0.6728441062011413,0.716180497063682
1991.0,105.0,2021.0,-2.608459793114873,0.6923767081931411,1.3435264790132986
2005.0,87.0,2020.0,-1.0342800820729392,0.3407898723371438,0.716180497063682
1979.0,237.0,2020.0,-3.957756688293674,3.2706801711371214,0.716180497063682
1997.0,125.0,2021.0,-1.9338113455254728,1.0830287480331382,1.3435264790132986
2021.0,83.0,2021.0,0.7647824448321285,0.2626594643691444,1.3435264790132986


In [0]:
print("\n" + "="*50)
print("FINAL PROCESSED DATASET")
print("="*50)

print(f"Total Rows: {df.count()}")
print(f"Total Columns: {len(df.columns)}")
print("\nEncoded Column Summary:")
print(f"  - Label Encoded: {len([c for c in df.columns if '_label' in c])} columns")
print(f"  - One-Hot Encoded: {len([c for c in df.columns if '_onehot_' in c])} columns")
print(f"  - Frequency Encoded: {len([c for c in df.columns if '_freq' in c])} columns")
print(f"  - Normalized: {len([c for c in df.columns if '_normalized' in c])} columns")
print(f"  - Standardized: {len([c for c in df.columns if '_standardized' in c])} columns")



FINAL PROCESSED DATASET




Total Rows: 4103
Total Columns: 78

Encoded Column Summary:
  - Label Encoded: 3 columns
  - One-Hot Encoded: 50 columns
  - Frequency Encoded: 3 columns
  - Normalized: 3 columns
  - Standardized: 3 columns


In [0]:
display(df.limit(5))



listed_in,cast,director,show_id,type,title,country,date_added,release_year,rating,duration,description,duration_int,duration_type,added_year,added_month,type_label,rating_label,duration_type_label,country_onehot_South_Korea,country_onehot_France_Belgium_Luxembourg_Roma,country_onehot_Philippines_Qatar,country_onehot_France_Belgium_Italy,country_onehot_United_Kingdom_Spain,country_onehot_United_Kingdom_United_States_I,country_onehot_Italy_United_States,country_onehot_Thailand,country_onehot_United_States_United_Kingdom_F,country_onehot_Poland,country_onehot_Iceland,country_onehot_United_Kingdom_Germany_United_,country_onehot_United_States_Spain,country_onehot_China_United_States_Canada,country_onehot_Chile_Brazil,country_onehot_Lebanon,country_onehot_United_States_Colombia_Mexico,country_onehot_Italy,country_onehot_Thailand_China_United_States,country_onehot_Turkey_United_States,country_onehot_Belgium,country_onehot_United_States_Japan,country_onehot_China_Hong_Kong,country_onehot_Ireland,country_onehot_Philippines_Singapore,country_onehot_United_Kingdom_France_Belgium_,country_onehot_United_Kingdom_West_Germany,country_onehot_United_States_Philippines,country_onehot_Chile_France,country_onehot_United_States_France_Italy_Uni,country_onehot_India_United_States,country_onehot_South_Korea_United_States,country_onehot_France_United_States,country_onehot_Malaysia,country_onehot_South_Korea_Canada_United_Stat,country_onehot_Luxembourg,country_onehot_United_States_Denmark,country_onehot_Italy_Switzerland_Albania_Pola,country_onehot_United_States_Germany_Canada,country_onehot_Germany,country_onehot_Bangladesh,country_onehot_United_States_Czech_Republic,country_onehot_Hong_Kong_China,country_onehot_United_Kingdom_India,country_onehot_Pakistan,country_onehot_Mauritius_South_Africa,country_onehot_Denmark,country_onehot_United_States_India,country_onehot_United_Kingdom_Canada,country_onehot_Kenya,director_freq,cast_freq,listed_in_freq,release_year_normalized,duration_int_normalized,added_year_normalized,release_year_standardized,duration_int_standardized,added_year_standardized
"Children & Family Movies, Comedies","Elizabeth Daily, Christine Cavanaugh, Kath Soucie, Melanie Chartoff, Phil Proctor, Cree Summer, Cheryl Chase, Tara Strong, Jack Riley, Joe Alaskey, Michael Bell, Tress MacNeille, Busta Rhymes, Whoopi Goldberg, David Spade","Igor Kovalyov, Norton Virgien",s3476,movie,The Rugrats Movie,United States,"October 1, 2019",1998.0,G,81 min,"When his baby brother Dil is born, Tommy Pickles and his pals decide that he's too much responsibility and try to return him to the hospital.",81.0,minutes,2019.0,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00024372410431391665,0.00024372410431391665,0.0263222032659029,0.7604166666666666,0.3389830508474576,0.8461538461538461,-1.8213699375939063,0.2235942603851447,0.0888345151140655
"Children & Family Movies, Dramas","Richard Gere, Joan Allen, Cary-Hiroyuki Tagawa, Sarah Roemer, Jason Alexander, Erick Avari, Davenia McFadden, Robbie Sublett, Kevin DeCoste, Rob Degnan",Lasse Hallström,s934,movie,Hachi: A Dog's Tale,"United Kingdom, United States","May 1, 2021",2009.0,G,93 min,"When his master dies, a loyal pooch named Hachiko keeps a vigil for more than a decade at the train station where he once greeted his owner every day.",93.0,minutes,2021.0,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0009748964172556666,0.00024372410431391665,0.002680965147453,0.875,0.3898305084745763,1.0,-0.5845144503466722,0.4579854842891429,1.3435264790132986
Children & Family Movies,"Maurice LaMarche, Sean Astin, Keith Carradine, Charles Fleischer, Jodi Benson, Bill Fagerbakke, Charity James, Kathy Najimy, David Paymer, Jean Smart",Phil Weinstein,s6223,movie,Balto 3: Wings of Change,United States,"April 1, 2018",2004.0,G,78 min,"A pilot challenges Balto's son Kodi and his sled team to see who can deliver the mail faster, but the dogs race to the rescue when the plane vanishes.",78.0,minutes,2018.0,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0004874482086278333,0.00024372410431391665,0.0229100658055081,0.8229166666666666,0.326271186440678,0.7692307692307693,-1.1467214900045055,0.1649964544091451,-0.5385114668355511
"Children & Family Movies, Comedies","Justin Fletcher, John Sparkes, Amalia Vitale, Kate Harbour, David Holt","Richard Phelan, Will Becher",s2912,movie,A Shaun the Sheep Movie: Farmageddon,"United Kingdom, France, Belgium, Ireland, United States","February 14, 2020",2019.0,G,87 min,Shaun and the flock race to help an adorable alien find her way home after her ship crash-lands near Mossy Bottom Farm and sparks a UFO frenzy.,87.0,minutes,2020.0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00024372410431391665,0.00024372410431391665,0.0263222032659029,0.9791666666666666,0.3644067796610169,0.9230769230769232,0.5398996289689951,0.3407898723371438,0.716180497063682
Children & Family Movies,"Sonja Ball, Craig Francis, Rick Jones, Michel Perron, Holly Gauthier-Frankel, Norman Groulx, Terrence Scammell, Bruce Dinsmore, Jennifer Seguin",Bernie Denk,s3545,movie,Spookley the Square Pumpkin,United States,"September 1, 2019",2004.0,G,47 min,"When a storm terrorizes his patch, a pumpkin cast out for his shape must step up to use his smarts to save the day – and to prove himself.",47.0,minutes,2019.0,9.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0004874482086278333,0.00024372410431391665,0.0229100658055081,0.8229166666666666,0.1949152542372881,0.8461538461538461,-1.1467214900045055,-0.4405142073428502,0.0888345151140655


In [0]:
print("\n" + "="*50)
print("SAVING PROCESSED DATA")
print("="*50)

try:
    output_path = "/Volumes/workspace/default/netflix/netflix_processed"
    df.write.format("delta").mode("overwrite").save(output_path)
    print(f"✓ Data saved successfully to: {output_path}")
except Exception as e:
    print(f"⚠ Error saving data: {e}")
    print("Attempting to save to alternative location...")
    try:
        df.write.format("parquet").mode("overwrite").save("/tmp/netflix_processed")
        print("✓ Data saved to /tmp/netflix_processed")
    except Exception as e2:
        print(f"✗ Failed to save: {e2}")

print("\n" + "="*50)
print("PROCESSING COMPLETE!")
print("="*50)
print("\n✓ All encodings completed successfully using Databricks Serverless!")


SAVING PROCESSED DATA




✓ Data saved successfully to: /Volumes/workspace/default/netflix/netflix_processed

PROCESSING COMPLETE!

✓ All encodings completed successfully using Databricks Serverless!
