In [53]:
from pyspark.sql import SparkSession

In [54]:
from pyspark.sql.functions import col

In [65]:
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, StructField

In [55]:
spark = SparkSession.builder \
    .appName("Book Analysis") \
    .getOrCreate()

In [104]:
custom_schema = StructType([
    StructField("_co", IntegerType(), True),
    StructField("Title", StringType(), True, {"escape": ","}),
    StructField("description", StringType(), True, {"escape": "\"\""}),
    StructField("authors", StringType(), True, {"escape": ","}),
    StructField("publishers", StringType(), True, {"escape": ","}),
    StructField("publishedDate", StringType(), True, {"escape": ","}),
    StructField("categories", StringType(), True, {"escape": ","}),
    StructField("Impact", FloatType(), True)
    # Add more fields as needed
])

In [98]:
df = spark.read.csv("books_task.csv", header=True, schema=custom_schema)

In [111]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publishedDate: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- Impact: string (nullable = true)



In [112]:
df.filter(df["_c0"] == 2521).show()

24/04/21 09:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Title, description, authors, publisher, publishedDate, categories, Impact
 Schema: _c0, Title, description, authors, publisher, publishedDate, categories, Impact
Expected: _c0 but found: 
CSV file: file:///home/tanmay/Documents/highlevel/books_task.csv


+----+-------------+--------------------+--------------------+--------------------+-------------+----------+-----------------+
| _c0|        Title|         description|             authors|           publisher|publishedDate|categories|           Impact|
+----+-------------+--------------------+--------------------+--------------------+-------------+----------+-----------------+
|2521|Tuneful tales|As enigmatic and ...|['Bernice Love Wi...|Texas Tech Univer...|         2002|['Poetry']|805.5685400829483|
+----+-------------+--------------------+--------------------+--------------------+-------------+----------+-----------------+



In [109]:
df = spark.read.option("header", "true").option("delimiter", ",").option("quote", "\"").option("escape", "\"").csv("books_task.csv")

In [76]:
df.select('authors').distinct().show()

+--------------------+
|             authors|
+--------------------+
|    ['Donald Cline']|
|     ['Dian Layton']|
|       ['Kotoyama,']|
|   ['Joseph Kerman']|
|     ['Kay Flowers']|
|"" To a very stro...|
|['I. Ristic', 'Ia...|
|['Andrew P. Tobias']|
|['Judith Ennamora...|
|['Jamgon Kongtrul...|
|['Frank Miller', ...|
|['National Resear...|
| all of whom were...|
|      ['Max Fogiel']|
|      ['Jules Bass']|
|  ['Rebecca Harvin']|
|  ['Sylvia Abraham']|
|   ['Jacques Ellul']|
|    ['Kevin Foster']|
|['Ana Jarvis', 'R...|
+--------------------+
only showing top 20 rows



In [57]:
df = spark.read.csv("books_task.csv", sep=',', inferSchema=True, quote = '"', header=True)

In [58]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publishedDate: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- Impact: string (nullable = true)



In [59]:
df.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|               Title|         description|             authors|           publisher|       publishedDate|          categories|              Impact|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|Its Only Art If I...|                null|    ['Julie Strain']|Smithsonian Insti...|                1996|['Comics & Graphi...|   784.3039243054303|
|  1|Dr. Seuss: Americ...|"Philip Nel takes...| like that of Lew...| has changed lang...| giving us new wo...| inspiring artist...|      ['Philip Nel']|
|  2|Wonderful Worship...|This resource inc...|    ['David R. Ray']|             OUP USA|                2000|        ['Religion']|   841.7053210126119|
|  3|Whispers of the W...|Julia Thomas find...| ['Veronica Haddon']|           iUn

24/04/21 09:22:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Title, description, authors, publisher, publishedDate, categories, Impact
 Schema: _c0, Title, description, authors, publisher, publishedDate, categories, Impact
Expected: _c0 but found: 
CSV file: file:///home/tanmay/Documents/highlevel/books_task.csv


In [60]:
df.count()

138724

In [61]:
df.select('Impact').distinct().show()

+--------------------+
|              Impact|
+--------------------+
|   743.7950449946484|
|   839.3731438824466|
|   718.3721985226707|
|   906.2100605165298|
|   672.2811823572971|
| diagrammatic gui...|
|   832.7099694825474|
|          2019-08-22|
|   810.7406296343943|
|             Tolstoy|
| story board exce...|
|        spirituality|
|   802.3051376732733|
| and shows the tr...|
| the ""disastrous...|
|   869.4545794266415|
| and the radical ...|
| Mary Madigan sad...|
|   843.1325776250586|
|   552.7999873000787|
+--------------------+
only showing top 20 rows



In [12]:
df.show(1,0)

+---+------------------------------+-----------+----------------+-----------------------+-------------+---------------------------+-----------------+
|_c0|Title                         |description|authors         |publisher              |publishedDate|categories                 |Impact           |
+---+------------------------------+-----------+----------------+-----------------------+-------------+---------------------------+-----------------+
|0  |Its Only Art If Its Well Hung!|null       |['Julie Strain']|Smithsonian Institution|1996         |['Comics & Graphic Novels']|784.3039243054303|
+---+------------------------------+-----------+----------------+-----------------------+-------------+---------------------------+-----------------+
only showing top 1 row



24/04/21 08:59:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Title, description, authors, publisher, publishedDate, categories, Impact
 Schema: _c0, Title, description, authors, publisher, publishedDate, categories, Impact
Expected: _c0 but found: 
CSV file: file:///home/tanmay/Documents/highlevel/books_task.csv


In [21]:
numeric_check_df = df.filter(col("Impact").cast("float").isNotNull())

In [24]:
numeric_check_df.count()

120484

In [29]:
numeric_check_df = df.filter(col("publishedDate").cast("float").isNull())

In [30]:
numeric_check_df.count()

89856

In [31]:
numeric_check_df.select('publishedDate').distinct().show()

+--------------------+
|       publishedDate|
+--------------------+
|          2007-03-06|
|          2016-08-17|
|          1994-03-01|
|          2003-11-06|
|          1999-11-18|
|          1992-07-31|
|   'Mark S. Freed']"|
|          2013-03-14|
| it is written fo...|
|             Inc."""|
|          2015-05-01|
|    biblical imagery|
|             1999-10|
| bioentrepreneurs...|
|          2014-05-27|
|          2009-06-23|
|          2008-11-19|
| a woman who was ...|
| a place where yo...|
|          2008-12-03|
+--------------------+
only showing top 20 rows



In [33]:
regex_pattern = "^\d{4}-\d{2}-\d{2}$"

In [35]:
regex_check_df = df.filter(col("publishedDate").rlike(regex_pattern))

In [49]:
df.select('categories').distinct().toPandas().to_csv("unique_category.csv")

In [45]:
import pandas as pd

In [46]:
data = pd.read_csv("unique_publisher.csv")

In [48]:
open(unique_publisher.csv)

NameError: name 'unique_publisher' is not defined

In [50]:
def get_string_length(x):
    return len(x)

In [51]:
spark.udf.register("string_length_udf", get_string_length)

<function __main__.get_string_length(x)>

In [None]:
df_length = df.withColumn("", spark.sql("double_age_udf(Age)"))