In [7]:
# Section must be included at the beginning of each new notebook. Remember to change the app name. 
# If you're using VirtualBox, change the below to '/home/user/spark-2.1.1-bin-hadoop2.7'
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('basics').getOrCreate()

In [8]:
# Let's read in the data. If you open the dataset, you'll find that each column has a header. We specify that by stating that header=True.
# To make our lives easier, we can also use 'inferSchema' when importing CSVs. This automatically detects data types.
# If you would like to manually change data types, refer to this article: https://medium.com/@mrpowers/adding-structtype-columns-to-spark-dataframes-b44125409803
df = spark.read.csv('absenteeism3.csv',header=True,inferSchema=True)

In [9]:
# The show method allows you visualise DataFrames in a tabular format. 
df.show()

+---+------------------+----------------+---------------+-------+----------------------+-------------------------------+------------+---+----------------------+----------+--------------------+---------+---+--------------+-------------+---+------+---------------+-------------------------+------+
| ID|Reason for absence|Month of absence|Day of the week|Seasons|Transportation expense|Distance from Residence to Work|Service time|Age|Work load Average/day |Hit target|Disciplinary failure|Education|Son|Social drinker|Social smoker|Pet|Weight|Body mass index|Absenteeism time in hours|Height|
+---+------------------+----------------+---------------+-------+----------------------+-------------------------------+------------+---+----------------------+----------+--------------------+---------+---+--------------+-------------+---+------+---------------+-------------------------+------+
| 11|                26|               7|              3|      1|                   289|                        

In [10]:
# Print schema allows us to visualise the data structure at a high level. 
df.printSchema()

# We can also use head to print a specific amount of rows, so we can get a better understanding of the data points. 
# Note that we have to specify 'print' depending on the method we're using. Otherwise it may not show up!
print(df.head(1))

root
 |-- ID: integer (nullable = true)
 |-- Reason for absence: integer (nullable = true)
 |-- Month of absence: integer (nullable = true)
 |-- Day of the week: integer (nullable = true)
 |-- Seasons: integer (nullable = true)
 |-- Transportation expense: integer (nullable = true)
 |-- Distance from Residence to Work: integer (nullable = true)
 |-- Service time: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Work load Average/day : string (nullable = true)
 |-- Hit target: integer (nullable = true)
 |-- Disciplinary failure: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- Son: integer (nullable = true)
 |-- Social drinker: integer (nullable = true)
 |-- Social smoker: integer (nullable = true)
 |-- Pet: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Body mass index: integer (nullable = true)
 |-- Absenteeism time in hours: integer (nullable = true)
 |-- Height: double (nullable = true)

[Row(ID=11, Reason for absence=

In [11]:
# We can use the describe method get some general statistics on our data too. 
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+----------------------+-------------------------------+------------------+-----------------+----------------------+-----------------+--------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------------+--------------------+
|summary|                ID|Reason for absence|  Month of absence|   Day of the week|           Seasons|Transportation expense|Distance from Residence to Work|      Service time|              Age|Work load Average/day |       Hit target|Disciplinary failure|         Education|               Son|     Social drinker|      Social smoker|               Pet|            Weight|   Body mass index|Absenteeism time in hours|              Height|
+-------+------------------+------------------+------------------+------------------+------------------+--------------

In [13]:
# Let's select the columns that are integers, and use the describe method again.
# We see that the average age is 41. The average bank account balance is $1,074. 
# And they spoke to call centre reps for approx. 931 seconds on average. 
df.select('ID', 'Reason for absence', 'age','Education', 'Son', 'Social drinker', 'Social smoker', 'Pet', 'Weight', 'Body mass index', 'Absenteeism time in hours', 'Height').describe().show()

+-------+------------------+------------------+-----------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------------+--------------------+
|summary|                ID|Reason for absence|              age|         Education|               Son|     Social drinker|      Social smoker|               Pet|            Weight|   Body mass index|Absenteeism time in hours|              Height|
+-------+------------------+------------------+-----------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------------+--------------------+
|  count|               740|               740|              740|               740|               740|                740|                740|               740|               740|               740|                      740|                 740|
|   mean

In [14]:
# Let's select the balance column and assign it to a variable. 
bal_col = df.select('ID', 'Reason for absence', 'age','Education', 'Son', 'Social drinker', 'Social smoker', 'Pet', 'Weight', 'Body mass index', 'Absenteeism time in hours', 'Height')

# We can then use the show method on that variable.
bal_col.show()

+---+------------------+---+---------+---+--------------+-------------+---+------+---------------+-------------------------+------+
| ID|Reason for absence|age|Education|Son|Social drinker|Social smoker|Pet|Weight|Body mass index|Absenteeism time in hours|Height|
+---+------------------+---+---------+---+--------------+-------------+---+------+---------------+-------------------------+------+
| 11|                26| 33|        1|  2|             1|            0|  1|    90|             30|                        4|  1.72|
| 36|                 0| 50|        1|  1|             1|            0|  0|    98|             31|                        0|  1.78|
|  3|                23| 38|        1|  0|             1|            0|  0|    89|             31|                        2|   1.7|
|  7|                 7| 39|        1|  2|             1|            1|  0|    68|             24|                        4|  1.68|
| 11|                23| 33|        1|  2|             1|            0|  1| 

In [24]:
# Let's try out some additional DataFrame methods.
# How would we identify individuals with a balance above $5,000? Using filter! 
df.filter("ID=1").show(bal_col.show())

# We can also use more advanced filters. For example, let's see the jobs of people with over $2,500 in their bank account.
df.filter("ID=1").select('Reason for absence', 'Absenteeism time in hours').show(bal_col.show())

+---+------------------+---+---------+---+--------------+-------------+---+------+---------------+-------------------------+------+
| ID|Reason for absence|age|Education|Son|Social drinker|Social smoker|Pet|Weight|Body mass index|Absenteeism time in hours|Height|
+---+------------------+---+---------+---+--------------+-------------+---+------+---------------+-------------------------+------+
| 11|                26| 33|        1|  2|             1|            0|  1|    90|             30|                        4|  1.72|
| 36|                 0| 50|        1|  1|             1|            0|  0|    98|             31|                        0|  1.78|
|  3|                23| 38|        1|  0|             1|            0|  0|    89|             31|                        2|   1.7|
|  7|                 7| 39|        1|  2|             1|            1|  0|    68|             24|                        4|  1.68|
| 11|                23| 33|        1|  2|             1|            0|  1| 

Py4JError: An error occurred while calling o210.showString. Trace:
py4j.Py4JException: Method showString([null, class java.lang.Integer]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:272)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

