In [2]:
import findspark
findspark.init("/opt/spark")

from pyspark.sql import SparkSession
import os

spark = SparkSession.builder.appName("IMDB EDA").getOrCreate()
PROJECT_ROOT = os.path.abspath("..")
DATA_PATH = os.path.join(PROJECT_ROOT, "data_validation", "data_for_batch", "fact_movie_full.parquet")
df = spark.read.parquet(DATA_PATH)
df.printSchema()


                                                                                

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- category: string (nullable = true)
 |-- primaryName: string (nullable = true)
 |-- primaryProfession: string (nullable = true)



In [4]:
df.show(5, truncate=False)
print(f"Số dòng: {df.count()}")
print(f"Số cột: {len(df.columns)}")

+----------+---------+-----------------------------+-----------------------------+---------+-------+--------------+-----------------+-------------+--------+---------+-------+--------+-----------+-----------------+
|tconst    |titleType|primaryTitle                 |originalTitle                |startYear|endYear|runtimeMinutes|genres           |averageRating|numVotes|directors|writers|category|primaryName|primaryProfession|
+----------+---------+-----------------------------+-----------------------------+---------+-------+--------------+-----------------+-------------+--------+---------+-------+--------+-----------+-----------------+
|tt0004347 |movie    |A Modern Mephisto            |A Modern Mephisto            |1914     |NULL   |NULL          |NULL             |NULL         |NULL    |NULL     |NULL   |NULL    |NULL       |NULL             |
|tt21654270|tvEpisode|Tabular Yikiliyor            |Tabular Yikiliyor            |2000     |NULL   |NULL          |Documentary      |NULL       



Số dòng: 94624158
Số cột: 15


                                                                                

In [5]:
from pyspark.sql.functions import col, sum

df.select([
    sum(col(c).isNull().cast("int")).alias(c + "_nulls")
    for c in df.columns
]).show()



+------------+---------------+------------------+-------------------+---------------+-------------+--------------------+------------+-------------------+--------------+---------------+-------------+--------------+-----------------+-----------------------+
|tconst_nulls|titleType_nulls|primaryTitle_nulls|originalTitle_nulls|startYear_nulls|endYear_nulls|runtimeMinutes_nulls|genres_nulls|averageRating_nulls|numVotes_nulls|directors_nulls|writers_nulls|category_nulls|primaryName_nulls|primaryProfession_nulls|
+------------+---------------+------------------+-------------------+---------------+-------------+--------------------+------------+-------------------+--------------+---------------+-------------+--------------+-----------------+-----------------------+
|           0|              0|                 0|                  0|        9412409|     93556048|            55201738|     3040197|           72651414|      72651414|       21018615|     24967043|       1122586|          1130602| 

                                                                                

In [6]:
from pyspark.sql.functions import col

df.select("averageRating").describe().show()

df.select("averageRating").groupBy("averageRating").count().orderBy("averageRating").show(50)


                                                                                

+-------+------------------+
|summary|     averageRating|
+-------+------------------+
|  count|          21972744|
|   mean|  6.94980985533769|
| stddev|1.3456689590406963|
|    min|               1.0|
|    max|               9.9|
+-------+------------------+





+-------------+--------+
|averageRating|   count|
+-------------+--------+
|         NULL|72651414|
|          1.0|   13146|
|          1.1|    3047|
|          1.2|    3810|
|          1.3|    4020|
|          1.4|    4762|
|          1.5|    5764|
|          1.6|    5373|
|          1.7|    5755|
|          1.8|    7271|
|          1.9|    7211|
|         10.0|   51001|
|          2.0|   10146|
|          2.1|    9268|
|          2.2|   13030|
|          2.3|   15033|
|          2.4|   16251|
|          2.5|   16604|
|          2.6|   19294|
|          2.7|   20473|
|          2.8|   25635|
|          2.9|   21374|
|          3.0|   31005|
|          3.1|   28638|
|          3.2|   39682|
|          3.3|   36667|
|          3.4|   45127|
|          3.5|   44380|
|          3.6|   55812|
|          3.7|   53298|
|          3.8|   63387|
|          3.9|   55969|
|          4.0|   75849|
|          4.1|   70899|
|          4.2|   92919|
|          4.3|   88525|
|          4.4|  107117|


                                                                                

In [16]:
df.groupBy("genres").count().orderBy("count", ascending=False).show(20, truncate=False)



+--------------------------+--------+
|genres                    |count   |
+--------------------------+--------+
|Drama                     |14503726|
|Comedy                    |7183900 |
|Drama,Romance             |6476093 |
|Talk-Show                 |3348062 |
|NULL                      |3040197 |
|Documentary               |2956820 |
|Reality-TV                |2037376 |
|Drama,Short               |1999023 |
|News                      |1931230 |
|Comedy,Drama,Romance      |1480047 |
|Comedy,Drama              |1401221 |
|Short                     |1391904 |
|Comedy,Short              |1371696 |
|Adult                     |1297615 |
|News,Talk-Show            |1275554 |
|Family                    |1213917 |
|Action,Adventure,Animation|1200868 |
|Romance                   |1171013 |
|Game-Show                 |947581  |
|Crime,Drama,Mystery       |925595  |
+--------------------------+--------+
only showing top 20 rows



                                                                                

In [17]:
df.groupBy("startYear").count().orderBy("startYear").show(100)




+---------+-------+
|startYear|  count|
+---------+-------+
|     NULL|9412409|
|     1874|      4|
|     1878|     38|
|     1881|     26|
|     1882|      6|
|     1883|      3|
|     1885|      2|
|     1887|    132|
|     1888|     19|
|     1889|      3|
|     1890|     29|
|     1891|     60|
|     1892|     28|
|     1893|     24|
|     1894|    402|
|     1895|    357|
|     1896|   1752|
|     1897|   2066|
|     1898|   2531|
|     1899|   2578|
|     1900|   2678|
|     1901|   2386|
|     1902|   2458|
|     1903|   3318|
|     1904|   2326|
|     1905|   2436|
|     1906|   2931|
|     1907|   3686|
|     1908|   7209|
|     1909|  10885|
|     1910|  14255|
|     1911|  20328|
|     1912|  30599|
|     1913|  40273|
|     1914|  45060|
|     1915|  46901|
|     1916|  42889|
|     1917|  36677|
|     1918|  29641|
|     1919|  29864|
|     1920|  32297|
|     1921|  30448|
|     1922|  25760|
|     1923|  24145|
|     1924|  25131|
|     1925|  27981|
|     1926|  29390|


                                                                                

In [19]:
spark.stop()