# Spark Mini-Exercises

In [4]:
import pandas as pd
import numpy as np
import pyspark

np.random.seed(13)

pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

1. Spark Dataframe Basics

    i. Use the starter code above to create a pandas dataframe.

    ii. Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [6]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

df = spark.createDataFrame(pandas_dataframe)

iii. Show the first 3 rows of the dataframe.



In [7]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



iv. Show the first 7 rows of the dataframe.


In [8]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



v. View a summary of the data using .describe.

In [10]:
df.describe().show()


+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



vi. Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [11]:
df.select('n', 'abool').show(5)


+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



vii. Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.



In [12]:
df.select('group', 'abool').show(5)


+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



viii. Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.



In [13]:
df.select('group', df.abool.alias('a_boolean_value')).show(3)


+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



ix. Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [15]:
from pyspark.sql.functions import col

df.select('group', col('n').alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



2. Column Manipulation

    i. Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df

    ii. Use .select to add 4 to the n column. Show the results.

In [16]:
df.select('n', df.n + 4).show()

+--------------------+------------------+
|                   n|           (n + 4)|
+--------------------+------------------+
|  -0.712390662050588|3.2876093379494122|
|   0.753766378659703| 4.753766378659703|
|-0.04450307833805...|3.9554969216619464|
| 0.45181233874578974|  4.45181233874579|
|  1.3451017084510097|5.3451017084510095|
|  0.5323378882945463| 4.532337888294546|
|  1.3501878997225267| 5.350187899722527|
|  0.8612113741693206|  4.86121137416932|
|  1.4786857374358966| 5.478685737435897|
| -1.0453771305385342| 2.954622869461466|
| -0.7889890249515489|3.2110109750484512|
|  -1.261605945319069| 2.738394054680931|
|  0.5628467852810314| 4.562846785281032|
|-0.24332625188556253|3.7566737481144377|
|  0.9137407048596775| 4.913740704859677|
| 0.31735092273633597| 4.317350922736336|
| 0.12730328020698067| 4.127303280206981|
|  2.1503829673811126| 6.150382967381113|
|  0.6062886568962988| 4.606288656896298|
|-0.02677164998644...|3.9732283500135592|
+--------------------+------------

iii. Subtract 5 from the n column and view the results.

In [17]:
df.select('n', df.n - 5).show()


+--------------------+-------------------+
|                   n|            (n - 5)|
+--------------------+-------------------+
|  -0.712390662050588| -5.712390662050588|
|   0.753766378659703| -4.246233621340297|
|-0.04450307833805...| -5.044503078338053|
| 0.45181233874578974|  -4.54818766125421|
|  1.3451017084510097|-3.6548982915489905|
|  0.5323378882945463| -4.467662111705454|
|  1.3501878997225267|-3.6498121002774733|
|  0.8612113741693206|  -4.13878862583068|
|  1.4786857374358966| -3.521314262564103|
| -1.0453771305385342| -6.045377130538534|
| -0.7889890249515489| -5.788989024951549|
|  -1.261605945319069| -6.261605945319069|
|  0.5628467852810314| -4.437153214718968|
|-0.24332625188556253| -5.243326251885563|
|  0.9137407048596775| -4.086259295140323|
| 0.31735092273633597| -4.682649077263664|
| 0.12730328020698067| -4.872696719793019|
|  2.1503829673811126|-2.8496170326188874|
|  0.6062886568962988| -4.393711343103702|
|-0.02677164998644...| -5.026771649986441|
+----------

iv. Multiply the n column by 2. View the results along with the original numbers.

In [18]:
df.select('n', df.n * 5).show()


+--------------------+--------------------+
|                   n|             (n * 5)|
+--------------------+--------------------+
|  -0.712390662050588| -3.5619533102529397|
|   0.753766378659703|  3.7688318932985148|
|-0.04450307833805...|-0.22251539169026727|
| 0.45181233874578974|   2.259061693728949|
|  1.3451017084510097|  6.7255085422550485|
|  0.5323378882945463|  2.6616894414727317|
|  1.3501878997225267|   6.750939498612634|
|  0.8612113741693206|   4.306056870846603|
|  1.4786857374358966|   7.393428687179483|
| -1.0453771305385342|  -5.226885652692671|
| -0.7889890249515489| -3.9449451247577443|
|  -1.261605945319069|  -6.308029726595345|
|  0.5628467852810314|   2.814233926405157|
|-0.24332625188556253| -1.2166312594278126|
|  0.9137407048596775|   4.568703524298387|
| 0.31735092273633597|    1.58675461368168|
| 0.12730328020698067|  0.6365164010349034|
|  2.1503829673811126|  10.751914836905563|
|  0.6062886568962988|  3.0314432844814942|
|-0.02677164998644...|-0.1338582

v. Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [19]:
df = df.select('*', (df.n * -1).alias('n2'))
df.show(4)

+--------------------+-----+-----+--------------------+
|                   n|group|abool|                  n2|
+--------------------+-----+-----+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|
|   0.753766378659703|    x|false|  -0.753766378659703|
|-0.04450307833805...|    z|false|0.044503078338053455|
| 0.45181233874578974|    y|false|-0.45181233874578974|
+--------------------+-----+-----+--------------------+
only showing top 4 rows



vi. Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

In [20]:
df = df.select('*', (df.n * df.n).alias('n3'))
df.show(5)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



vii. What happens when you run the code below?

In [21]:
df.group + df.abool


Column<'(group + abool)'>

- A column is create that adds the contents of group and abool

viii. What happens when you run the code below? What is the difference between this and the previous code sample?



In [22]:
df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(group AS DOUBLE) + abool)' due to data type mismatch: differing types in '(CAST(group AS DOUBLE) + abool)' (double and boolean).;
'Project [unresolvedalias((cast(group#1 as double) + abool#2), Some(org.apache.spark.sql.Column$$Lambda$3175/0x0000000801364040@7b873e8e))]
+- Project [n#0, group#1, abool#2, n2#241, (n#0 * n#0) AS n3#263]
   +- Project [n#0, group#1, abool#2, (n#0 * cast(-1 as double)) AS n2#241]
      +- LogicalRDD [n#0, group#1, abool#2], false


- We get an error because it's trying to add to mismatched types

ix. Try adding various other columns together. What are the results of combining the different data types?



In [24]:
df.select(df.group + df.abool + df.n)

AnalysisException: cannot resolve '(CAST(group AS DOUBLE) + abool)' due to data type mismatch: differing types in '(CAST(group AS DOUBLE) + abool)' (double and boolean).;
'Project [unresolvedalias(((cast(group#1 as double) + abool#2) + n#0), Some(org.apache.spark.sql.Column$$Lambda$3175/0x0000000801364040@7b873e8e))]
+- Project [n#0, group#1, abool#2, n2#241, (n#0 * n#0) AS n3#263]
   +- Project [n#0, group#1, abool#2, (n#0 * cast(-1 as double)) AS n2#241]
      +- LogicalRDD [n#0, group#1, abool#2], false


- We get the same error as before due to mismatched data types

3. Type casting

   i. Use the starter code above to re-create a spark dataframe.
   
   ii. Use .printSchema to view the datatypes in your dataframe.

In [25]:
df.printSchema()


root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)
 |-- n2: double (nullable = true)
 |-- n3: double (nullable = true)



iii. Use .dtypes to view the datatypes in your dataframe.



In [26]:
df.dtypes

[('n', 'double'),
 ('group', 'string'),
 ('abool', 'boolean'),
 ('n2', 'double'),
 ('n3', 'double')]

iv. What is the difference between the two code samples below?



In [27]:
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [28]:
df.select(df.abool.cast('int')).show()


+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



- One creates a Column and one is using that same column in a .select in order to view the results of the cast.



v. Use .select and .cast to convert the abool column to an integer type. View the results.



In [29]:
df.select('abool', df.abool.cast('int')).show()


+-----+-----+
|abool|abool|
+-----+-----+
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
| true|    1|
| true|    1|
|false|    0|
|false|    0|
| true|    1|
| true|    1|
|false|    0|
|false|    0|
|false|    0|
| true|    1|
|false|    0|
| true|    1|
+-----+-----+



vi. Convert the group column to a integer data type and view the results. What happens?



In [30]:
df.select('group', df.group.cast('int')).show()


+-----+-----+
|group|group|
+-----+-----+
|    z| null|
|    x| null|
|    z| null|
|    y| null|
|    z| null|
|    y| null|
|    z| null|
|    x| null|
|    z| null|
|    y| null|
|    x| null|
|    y| null|
|    y| null|
|    y| null|
|    y| null|
|    x| null|
|    z| null|
|    y| null|
|    x| null|
|    x| null|
+-----+-----+



- The values are converted to nulls

vii. Convert the n column to a integer data type and view the results. What happens?



In [32]:
df.select('n', df.n.cast('int')).show()


+--------------------+---+
|                   n|  n|
+--------------------+---+
|  -0.712390662050588|  0|
|   0.753766378659703|  0|
|-0.04450307833805...|  0|
| 0.45181233874578974|  0|
|  1.3451017084510097|  1|
|  0.5323378882945463|  0|
|  1.3501878997225267|  1|
|  0.8612113741693206|  0|
|  1.4786857374358966|  1|
| -1.0453771305385342| -1|
| -0.7889890249515489|  0|
|  -1.261605945319069| -1|
|  0.5628467852810314|  0|
|-0.24332625188556253|  0|
|  0.9137407048596775|  0|
| 0.31735092273633597|  0|
| 0.12730328020698067|  0|
|  2.1503829673811126|  2|
|  0.6062886568962988|  0|
|-0.02677164998644...|  0|
+--------------------+---+



- It truncates to the nearest integer

viii. Convert the abool column to a string data type and view the results. What happens?



In [33]:
df.select('abool', df.abool.cast('string')).show()


+-----+-----+
|abool|abool|
+-----+-----+
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
| true| true|
| true| true|
|false|false|
|false|false|
| true| true|
| true| true|
|false|false|
|false|false|
|false|false|
| true| true|
|false|false|
| true| true|
+-----+-----+



- It appears the same but it changes the type to string.

4. Built-in Functions

     i. Use the starter code above to re-create a spark dataframe.

     ii. Import the necessary functions from pyspark.sql.functions

In [34]:
from pyspark.sql.functions import *


   i. Find the highest n value.

   ii. Find the lowest n value.

   iii. Find the average n value.

In [35]:
df.select(max('n'), min('n'), mean('n')).show()

+------------------+------------------+------------------+
|            max(n)|            min(n)|            avg(n)|
+------------------+------------------+------------------+
|2.1503829673811126|-1.261605945319069|0.3664026449885217|
+------------------+------------------+------------------+



vi. Use concat to change the group column to say, e.g. "Group: x" or "Group: y"



In [37]:
df.select(concat(lit('Group: '), 'group'))


DataFrame[concat(Group: , group): string]

vii. Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"



In [38]:
df.select(concat('group', lit(': '), 'n')).show()


+--------------------+
|concat(group, : , n)|
+--------------------+
|z: -0.71239066205...|
|x: 0.753766378659703|
|z: -0.04450307833...|
|y: 0.451812338745...|
|z: 1.345101708451...|
|y: 0.532337888294...|
|z: 1.350187899722...|
|x: 0.861211374169...|
|z: 1.478685737435...|
|y: -1.04537713053...|
|x: -0.78898902495...|
|y: -1.26160594531...|
|y: 0.562846785281...|
|y: -0.24332625188...|
|y: 0.913740704859...|
|x: 0.317350922736...|
|z: 0.127303280206...|
|y: 2.150382967381...|
|x: 0.606288656896...|
|x: -0.02677164998...|
+--------------------+



5. When / Otherwise

    i. Use the starter code above to re-create a spark dataframe.

    ii. Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false"" when abool is false.

In [39]:
df.select(when(df.abool, 'It is true').otherwise('It is false')).show()


+----------------------------------------------------+
|CASE WHEN abool THEN It is true ELSE It is false END|
+----------------------------------------------------+
|                                         It is false|
|                                         It is false|
|                                         It is false|
|                                         It is false|
|                                         It is false|
|                                         It is false|
|                                         It is false|
|                                         It is false|
|                                          It is true|
|                                          It is true|
|                                         It is false|
|                                         It is false|
|                                          It is true|
|                                          It is true|
|                                         It is false|
|         

   iii. Create a column that contains 0 if n is less than 0, otherwise, the original n value.



In [40]:
df.select('n', when(df.n < 0, 0).otherwise(df.n)).show()



+--------------------+-----------------------------------+
|                   n|CASE WHEN (n < 0) THEN 0 ELSE n END|
+--------------------+-----------------------------------+
|  -0.712390662050588|                                0.0|
|   0.753766378659703|                  0.753766378659703|
|-0.04450307833805...|                                0.0|
| 0.45181233874578974|                0.45181233874578974|
|  1.3451017084510097|                 1.3451017084510097|
|  0.5323378882945463|                 0.5323378882945463|
|  1.3501878997225267|                 1.3501878997225267|
|  0.8612113741693206|                 0.8612113741693206|
|  1.4786857374358966|                 1.4786857374358966|
| -1.0453771305385342|                                0.0|
| -0.7889890249515489|                                0.0|
|  -1.261605945319069|                                0.0|
|  0.5628467852810314|                 0.5628467852810314|
|-0.24332625188556253|                                0.

6. Filter / Where

    i. Use the starter code above to re-create a spark dataframe.
    
    ii. Use .filter or .where to select just the rows where the group is y and view the results.

In [41]:
df.filter(df.group == 'y').show()


+--------------------+-----+-----+--------------------+-------------------+
|                   n|group|abool|                  n2|                 n3|
+--------------------+-----+-----+--------------------+-------------------+
| 0.45181233874578974|    y|false|-0.45181233874578974|0.20413438944294027|
|  0.5323378882945463|    y|false| -0.5323378882945463| 0.2833836273138969|
| -1.0453771305385342|    y| true|  1.0453771305385342| 1.0928133450529796|
|  -1.261605945319069|    y|false|   1.261605945319069|  1.591649561264422|
|  0.5628467852810314|    y| true| -0.5628467852810314|0.31679650370119145|
|-0.24332625188556253|    y| true| 0.24332625188556253|0.05920766485667622|
|  0.9137407048596775|    y|false| -0.9137407048596775| 0.8349220757174602|
|  2.1503829673811126|    y| true| -2.1503829673811126|  4.624146906402799|
+--------------------+-----+-----+--------------------+-------------------+



iii. Select just the columns where the abool column is false and view the results.



In [42]:
df.filter(df.abool == False).show()


+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
|  0.5323378882945463|    y|false| -0.5323378882945463|  0.2833836273138969|
|  1.3501878997225267|    z|false| -1.3501878997225267|  1.8230073645571279|
|  0.8612113741693206|    x|false| -0.8612113741693206|  0.7416850309986095|
| -0.7889890249515489|    x|false|  0.7889890249515489|  0.6225036814939958|
|  -1.261605945319069|    y|false|   1.261605945319069|   1.591649561264422|

iv. Find the columns where the group column is not y.

In [43]:
df.filter(df.group != 'y').show()


+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
|  1.3501878997225267|    z|false| -1.3501878997225267|  1.8230073645571279|
|  0.8612113741693206|    x|false| -0.8612113741693206|  0.7416850309986095|
|  1.4786857374358966|    z| true| -1.4786857374358966|  2.1865115100963415|
| -0.7889890249515489|    x|false|  0.7889890249515489|  0.6225036814939958|
| 0.31735092273633597|    x|false|-0.31735092273633597| 0.10071160816160388|
| 0.12730328020698067|    z|false|-0.12730328020698067|0.016206125151457036|

v. Find the columns where n is positive.

In [44]:
df.filter(df.n > 0).show()


+-------------------+-----+-----+--------------------+--------------------+
|                  n|group|abool|                  n2|                  n3|
+-------------------+-----+-----+--------------------+--------------------+
|  0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
| 1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
| 0.5323378882945463|    y|false| -0.5323378882945463|  0.2833836273138969|
| 1.3501878997225267|    z|false| -1.3501878997225267|  1.8230073645571279|
| 0.8612113741693206|    x|false| -0.8612113741693206|  0.7416850309986095|
| 1.4786857374358966|    z| true| -1.4786857374358966|  2.1865115100963415|
| 0.5628467852810314|    y| true| -0.5628467852810314| 0.31679650370119145|
| 0.9137407048596775|    y|false| -0.9137407048596775|  0.8349220757174602|
|0.31735092273633597|    x|false|-0.31735092273633597| 0.10071160816160388|
|0.127303280

vi. Find the columns where abool is true and the group column is z.

In [45]:
df.filter(df.abool & (df.group == 'z')).show()


+------------------+-----+-----+-------------------+------------------+
|                 n|group|abool|                 n2|                n3|
+------------------+-----+-----+-------------------+------------------+
|1.4786857374358966|    z| true|-1.4786857374358966|2.1865115100963415|
+------------------+-----+-----+-------------------+------------------+



vii. Find the columns where abool is true or the group column is z.

In [46]:
df.filter(df.abool | (df.group == 'z')).show()


+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
|  1.3501878997225267|    z|false| -1.3501878997225267|  1.8230073645571279|
|  1.4786857374358966|    z| true| -1.4786857374358966|  2.1865115100963415|
| -1.0453771305385342|    y| true|  1.0453771305385342|  1.0928133450529796|
|  0.5628467852810314|    y| true| -0.5628467852810314| 0.31679650370119145|
|-0.24332625188556253|    y| true| 0.24332625188556253| 0.05920766485667622|
| 0.12730328020698067|    z|false|-0.12730328020698067|0.016206125151457036|
|  2.1503829673811126|    y| true| -2.1503829673811126|   4.624146906402799|

viii. Find the columns where abool is false and n is less than 1

In [48]:
df.filter((df.abool==False) & (df.n < 1)).show()


+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  0.5323378882945463|    y|false| -0.5323378882945463|  0.2833836273138969|
|  0.8612113741693206|    x|false| -0.8612113741693206|  0.7416850309986095|
| -0.7889890249515489|    x|false|  0.7889890249515489|  0.6225036814939958|
|  -1.261605945319069|    y|false|   1.261605945319069|   1.591649561264422|
|  0.9137407048596775|    y|false| -0.9137407048596775|  0.8349220757174602|
| 0.31735092273633597|    x|false|-0.31735092273633597| 0.10071160816160388|

ix. Find the columns where abool is false or n is less than 1

In [49]:
df.filter((df.abool==False) | (df.n < 1)).show()

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
|  0.5323378882945463|    y|false| -0.5323378882945463|  0.2833836273138969|
|  1.3501878997225267|    z|false| -1.3501878997225267|  1.8230073645571279|
|  0.8612113741693206|    x|false| -0.8612113741693206|  0.7416850309986095|
| -1.0453771305385342|    y| true|  1.0453771305385342|  1.0928133450529796|
| -0.7889890249515489|    x|false|  0.7889890249515489|  0.6225036814939958|