In [None]:
pip install pyspark




In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this analysis.
spark = SparkSession.builder.appName("MovieRatingsAnalysis").getOrCreate()

# Input locations -- update these with the actual paths to your CSV files.
movies_file_path = "/content/movies.csv"
ratings_file_path = "/content/ratings.csv"

# Both files carry a header row. inferSchema asks Spark to derive the column
# types from the data instead of reading every column as a string.
movies_df = spark.read.csv(movies_file_path, header=True, inferSchema=True)
ratings_df = spark.read.csv(ratings_file_path, header=True, inferSchema=True)

# Show the schemas to understand the structure of the data.
movies_df.printSchema()
ratings_df.printSchema()

# Register temporary views so the DataFrames can be queried with SQL.
movies_df.createOrReplaceTempView("movies")
ratings_df.createOrReplaceTempView("ratings")

# Top 10 highest-rated movies among those with at least 10 ratings.
# The secondary sort keys (rating_count, then movieId) break ties on
# avg_rating; without them the rows that survive LIMIT 10 are not guaranteed
# to be the same from one run to the next when averages are equal.
query = """
    SELECT m.movieId, m.title, AVG(r.rating) AS avg_rating, COUNT(r.rating) AS rating_count
    FROM movies m
    JOIN ratings r ON m.movieId = r.movieId
    GROUP BY m.movieId, m.title
    HAVING COUNT(r.rating) >= 10
    ORDER BY avg_rating DESC, rating_count DESC, m.movieId ASC
    LIMIT 10
"""

# Execute the query (lazy: nothing runs until an action such as show()).
top_10_movies = spark.sql(query)

# Show the result.
top_10_movies.show()


root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+-------+--------------------+-----------------+------------+
|movieId|               title|       avg_rating|rating_count|
+-------+--------------------+-----------------+------------+
|   1041|Secrets & Lies (1...|4.590909090909091|          11|
|   3451|Guess Who's Comin...|4.545454545454546|          11|
|   1178|Paths of Glory (1...|4.541666666666667|          12|
|   1104|Streetcar Named D...|            4.475|          20|
|   2360|Celebration, The ...|4.458333333333333|          12|
|   1217|          Ran (1985)|4.433333333333334|          15|
|    318|Shawshank Redempt...|4.429022082018927|         317|
|    951|His Girl Friday (...|4.392857142857143|          14|
|   1927|All Quiet on the ...|  