### Import Libraries

In [10]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count, desc, max, min, regexp_extract

### Define helper functions

In [11]:
def initialize_spark_session():
    """
    Build the Spark session used by every processing step in this notebook.

    Returns:
        SparkSession: The session obtained from ``getOrCreate()`` on a
        builder whose application name is "Netflix Dataset Analysis".
    """
    session_builder = SparkSession.builder.appName("Netflix Dataset Analysis")
    return session_builder.getOrCreate()

def load_dataset(spark, file_path):
    """
    Read a CSV file into a Spark DataFrame.

    The first row is treated as the header and column types are inferred
    from the data.

    Args:
        spark (SparkSession): The active Spark session.
        file_path (str): Location of the CSV file to read.

    Returns:
        DataFrame: The contents of the CSV file as a Spark DataFrame.
    """
    csv_reader = spark.read.options(header=True, inferSchema=True)
    return csv_reader.csv(file_path)

def exploratory_data_analysis(df):
    """
    Perform Exploratory Data Analysis (EDA) on the provided dataset.

    Prints six sections to stdout: the schema, the record count, the ten
    most frequent ``listed_in`` values, the ten directors with the most
    titles, the ten titles with the largest duration value, and the
    average release year per content type.

    Args:
        df (DataFrame): The dataset to analyze. Must contain the columns
            ``listed_in``, ``director``, ``title``, ``duration``, ``type``
            and ``release_year``.

    Returns:
        None. All results are displayed via ``print`` / ``DataFrame.show``.
    """
    # Display the schema of the dataset
    print("1. Dataset Schema:")
    df.printSchema()

    # Display the total number of records in the dataset
    print(f"\n2. Total Number of Records: {df.count()}")

    # Identify the most common genres.
    # Each distinct `listed_in` string is counted as-is, so a multi-genre
    # value such as "Dramas, International Movies" forms its own group.
    print("\n3. Top Genres by Content Count:")
    top_genres = (
        df.groupBy("listed_in")
        .agg(count("*").alias("content_count"))
        .orderBy(col("content_count").desc())
        .limit(10)
    )
    top_genres.show(truncate=False)

    # Identify directors with the highest number of titles.
    # Rows without a director are excluded; otherwise the null group is
    # counted like a real director and can take a slot in the top ten.
    print("\n4. Directors with the Most Titles:")
    popular_directors = (
        df.filter(col("director").isNotNull())
        .groupBy("director")
        .agg(count("*").alias("title_count"))
        .orderBy(col("title_count").desc())
        .limit(10)
    )
    popular_directors.show()

    # List the titles with the longest durations (movies or TV shows).
    # `duration` is a string column, so ordering on it directly compares
    # text ("99 min" would rank above "100 min"). Sort on the leading
    # integer instead. Rows whose duration does not start with digits
    # (including nulls) are dropped so the integer cast is always valid.
    # NOTE(review): the number's unit can differ between content types
    # (the `type` column is shown for that reason) -- confirm whether a
    # per-type ranking is wanted.
    print("\n5. Titles with the Longest Durations:")
    duration_value = regexp_extract(col("duration"), r"^(\d+)", 1).cast("int")
    longest_titles = (
        df.filter(col("duration").rlike(r"^\d+"))
        .select("title", "duration", "type")
        .orderBy(duration_value.desc())
        .limit(10)
    )
    longest_titles.show()

    # Calculate the average release year for each content type
    print("\n6. Average Release Year by Content Type:")
    avg_release_year = (
        df.groupBy("type")
        .agg(avg("release_year").alias("average_release_year"))
        .orderBy("type")
    )
    avg_release_year.show()


### Initialize Spark session

In [12]:
# Create the Spark session with the helper defined in this notebook
# (`initialize_spark_session`); no `create_spark_session` function exists.
spark = initialize_spark_session()

In [13]:
# Path to the Netflix titles CSV
dataset_path = 'netflix_titles.csv'  # Use the relative path to your dataset

# Load Netflix Dataset (helper defined in this notebook as `load_dataset`)
netflix_df = load_dataset(spark, dataset_path)

# Perform EDA (helper defined in this notebook as `exploratory_data_analysis`)
exploratory_data_analysis(netflix_df)

1. Basic Dataset Information:
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)


Total number of records: 8809

2. Most Frequent Genres:
+------------------------------------------------+-------------+
|listed_in                                       |content_count|
+------------------------------------------------+-------------+
|Dramas, International Movies                    |361          |
|Documentaries                                   |358          |
|Stand-Up Comedy                                 |334          |
|Comedies, Dramas, International Movies   

In [14]:
# Stop the Spark session now that the analysis is finished.
spark.stop()