In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('jvt').getOrCreate()

data = ["Project",
"Gutenberg’s",
"Alice’s",
"Adventures",
"in",
"Wonderland",
"Project",
"Gutenberg’s",
"Adventures",
"in",
"Wonderland",
"Project",
"Gutenberg’s"]


In [2]:
rdd=spark.sparkContext.parallelize(data)

rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262

In [3]:
rdd2=rdd.map(lambda x: (x,1))

print(rdd2)


PythonRDD[1] at RDD at PythonRDD.scala:53


In [4]:
for element in rdd2.collect():
    print(element)

('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)


In [95]:
 # map() transformation to apply the lambda function, when you wanted to apply the custom transformation,
# you need to convert the DataFrame to RDD and apply the map() transformation

In [90]:
data = ["Project Gutenberg’s",
        "Alice’s Adventures in Wonderland",
        "Project Gutenberg’s",
        "Adventures in Wonderland",
        "Project Gutenberg’s"]

rdd=spark.sparkContext.parallelize(data)

for element in rdd.collect():
    print(element)

Project Gutenberg’s
Alice’s Adventures in Wonderland
Project Gutenberg’s
Adventures in Wonderland
Project Gutenberg’s


In [91]:
rdd2=rdd.flatMap(lambda x: x.split(" "))

#  first, it splits each record by space in an RDD and finally flattens it. 
# Resulting RDD consists of a single word on each record

for element in rdd2.collect():
    print(element)

Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
Project
Gutenberg’s
Adventures
in
Wonderland
Project
Gutenberg’s


In [6]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('jvt').getOrCreate()

arrayData = [
        ('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
        ('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
        ('Robert',['CSharp',''],{'hair':'red','eye':''}),
        ('Washington',None,None),
        ('Jefferson',['1','2'],{})]

df = spark.createDataFrame(data=arrayData, schema = ['name','knownLanguages','properties'])

df.show()


+----------+--------------+--------------------+
|      name|knownLanguages|          properties|
+----------+--------------+--------------------+
|     James| [Java, Scala]|[eye -> brown, ha...|
|   Michael|[Spark, Java,]|[eye ->, hair -> ...|
|    Robert|    [CSharp, ]|[eye -> , hair ->...|
|Washington|          null|                null|
| Jefferson|        [1, 2]|                  []|
+----------+--------------+--------------------+



In [7]:
from pyspark.sql.functions import explode

df2 = df.select(df.name,explode(df.knownLanguages))

df2.printSchema()

df2.show()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+---------+------+
|     name|   col|
+---------+------+
|    James|  Java|
|    James| Scala|
|  Michael| Spark|
|  Michael|  Java|
|  Michael|  null|
|   Robert|CSharp|
|   Robert|      |
|Jefferson|     1|
|Jefferson|     2|
+---------+------+



In [8]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('jvt').getOrCreate()

simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+



In [9]:
simpleData2 = [("James","Sales","NY",90000,34,10000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns2= ["employee_name","department","state","salary","age","bonus"]

df2 = spark.createDataFrame(data = simpleData2, schema = columns2)

df2.printSchema()
df2.show(truncate=False)




root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [79]:
# DataFrame union() method merges two DataFrames and returns the new DataFrame with all rows
# from two Dataframes regardless of duplicate data.

unionDF = df.union(df2)
unionDF.show(truncate=False)

#Merge without Duplicates

disDF = df.union(df2).distinct()
disDF.show(truncate=False)

unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Michael      |Sales     |NY   |86000 |56 |20000|

In [11]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, asc,desc

spark = SparkSession.builder.appName('jvt').getOrCreate()

simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Raman","Finance","CA",99000,40,24000), \
    ("Scott","Finance","NY",83000,36,19000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]


In [12]:
df = spark.createDataFrame(data = simpleData, schema = columns)

df.printSchema()
df.show(truncate=False)

#sort() function to sort on one or more columns. By default, it sorts by ascending order.

df.sort("department","state").show(truncate=False)
df.sort(col("department"),col("state")).show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+

In [13]:

#orderBy() function to sort on one or more columns. By default, it orders by ascending.

df.orderBy("department","state").show(truncate=False)
df.orderBy(col("department"),col("state")).show(truncate=False)



+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|

In [None]:
df.sort(df.department.asc(),df.state.asc()).show(truncate=False)
df.sort(col("department").asc(),col("state").asc()).show(truncate=False)
df.orderBy(col("department").asc(),col("state").asc()).show(truncate=False)

df.sort(df.department.asc(),df.state.desc()).show(truncate=False)
df.sort(col("department").asc(),col("state").desc()).show(truncate=False)
df.orderBy(col("department").asc(),col("state").desc()).show(truncate=False)

#how to sort DataFrame using raw SQL syntax



In [14]:
df.createOrReplaceTempView("EMP")
spark.sql("select employee_name,department,state,salary,age,bonus from EMP ORDER BY department asc").show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [1]:
# Load the data to the disk

# Import the data into the machine's memory

# Process/analyze the data

# Build the machine learning model

# Store the prediction back to disk

In [62]:
# https://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#pyspark.sql.types.StructType

In [2]:
# Pyspark gives the data scientist an API that can be used to solve the parallel data proceedin problems. 

# Pyspark handles the complexities of multiprocessing,

# such as distributing the data, distributing code and collecting output 

# from the workers on a cluster of machines.

In [3]:
# Spark can run standalone but most often runs on top of a cluster computing framework such as Hadoop. 

In [4]:
# Spark works closely with SQL language, i.e., structured data. It allows querying the data in real time.

In [5]:
# Data scientist main's job is to analyze and build predictive models. 
# In short, a data scientist needs to know how to query data using SQL,

In [1]:
import pyspark
from pyspark import SparkContext
sc =SparkContext()

In [2]:
# Now that the SparkContext is ready, you can create a collection of data called RDD, Resilient Distributed Dataset.
# Computation in an RDD is automatically parallelized across the cluster.

In [3]:
nums= sc.parallelize([1,2,3,4])	

In [4]:
nums

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262

In [5]:
nums.take(1) 

[1]

In [6]:
squared = nums.map(lambda x: x*x).collect()

print(squared)

for num in squared:
    print('%i ' % (num))
    

[1, 4, 9, 16]
1 
4 
9 
16 


In [7]:
from pyspark.sql import Row

from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

In [8]:
# SQLContext allows connecting the engine with different data sources. It is used to initiate the functionalities of Spark SQL.

In [9]:
list_p = [('John',19),('Smith',29),('Adam',35),('Henry',50)]

In [10]:
rdd = sc.parallelize(list_p)	

In [11]:
rdd

ParallelCollectionRDD[3] at readRDDFromFile at PythonRDD.scala:262

In [12]:
rdd.map(lambda x: Row(name=x[0], age=int(x[1])))	

PythonRDD[4] at RDD at PythonRDD.scala:53

In [13]:
list_p = [('John',19),('Smith',29),('Adam',35),('Henry',50)]

rdd = sc.parallelize(list_p)

print(rdd)

ppl = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))

print(ppl)

DF_ppl = sqlContext.createDataFrame(ppl)


ParallelCollectionRDD[5] at readRDDFromFile at PythonRDD.scala:262
PythonRDD[6] at RDD at PythonRDD.scala:53


In [16]:
DF_ppl.show()

DF_ppl.printSchema()

+-----+---+
| name|age|
+-----+---+
| John| 19|
|Smith| 29|
| Adam| 35|
|Henry| 50|
+-----+---+

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [17]:
DF_ppl.show()

+-----+---+
| name|age|
+-----+---+
| John| 19|
|Smith| 29|
| Adam| 35|
|Henry| 50|
+-----+---+



In [None]:
Step 1) Basic operation with PySpark
Step 2) Data preprocessing
Step 3) Build a data processing pipeline
Step 4) Build the classifier
Step 5) Train and evaluate the model
Step 6) Tune the hyperparameter

In [27]:

df = sqlContext.read.csv(SparkFiles.get("C:/Users/jvt/PYSPARKTRAINING/adult-data.csv"), header=True, inferSchema= True)


In [28]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- wrk_hrs_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)



In [29]:
df.show(5, truncate = False)

+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+----------------+--------------+------+
|age|workclass        |fnlwgt|education |education_num|marital_status     |occupation        |relationship  |race  |sex    |capital-gain|capital-loss|wrk_hrs_per_week|native_country|income|
+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+----------------+--------------+------+
|39 | State-gov       |77516 | Bachelors|13           | Never-married     | Adm-clerical     | Not-in-family| White| Male  |2174        |0           |40              | United-States| <=50K|
|50 | Self-emp-not-inc|83311 | Bachelors|13           | Married-civ-spouse| Exec-managerial  | Husband      | White| Male  |0           |0           |13              | United-States| <=50K|
|38 | Private         |215646| HS-grad  |9        

In [30]:
df_string = sqlContext.read.csv(SparkFiles.get("C:/Users/jvt/PYSPARKTRAINING/adult-data.csv"), header=True, inferSchema=  False)

df_string.printSchema()

root
 |-- age: string (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: string (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: string (nullable = true)
 |-- capital-loss: string (nullable = true)
 |-- wrk_hrs_per_week: string (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)



In [34]:
from pyspark.sql.types import *

# Write a custom function to convert the data type of DataFrame columns

def convertColumn(df, names, newType):
    for name in names: 
        df = df.withColumn(name, df[name].cast(newType))
    return df 

# List of continuous features
CONTI_FEATURES  = ['age', 'fnlwgt','capital-gain', 'education_num', 'capital-loss', 'wrk_hrs_per_week']
# Convert the type
df_string = convertColumn(df_string, CONTI_FEATURES, FloatType())
# Check the dataset
df_string.printSchema()

root
 |-- age: float (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: float (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: float (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: float (nullable = true)
 |-- capital-loss: float (nullable = true)
 |-- wrk_hrs_per_week: float (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)



In [35]:
from pyspark.ml.feature import StringIndexer
#stringIndexer = StringIndexer(inputCol="label", outputCol="newlabel")
#model = stringIndexer.fit(df)
#df = model.transform(df)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- wrk_hrs_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)



In [36]:
df.select('age','fnlwgt').show(5)

+---+------+
|age|fnlwgt|
+---+------+
| 39| 77516|
| 50| 83311|
| 38|215646|
| 53|234721|
| 28|338409|
+---+------+
only showing top 5 rows



In [37]:
df.groupBy("education").count().sort("count",ascending=True).show()

+-------------+-----+
|    education|count|
+-------------+-----+
|    Preschool|   51|
|      1st-4th|  168|
|      5th-6th|  333|
|    Doctorate|  413|
|         12th|  433|
|          9th|  514|
|  Prof-school|  576|
|      7th-8th|  646|
|         10th|  933|
|   Assoc-acdm| 1067|
|         11th| 1175|
|    Assoc-voc| 1382|
|      Masters| 1723|
|    Bachelors| 5355|
| Some-college| 7291|
|      HS-grad|10501|
+-------------+-----+



In [38]:
df.describe().show()

+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|summary|               age|   workclass|            fnlwgt|    education|    education_num|marital_status|       occupation|relationship|               race|    sex|      capital-gain|    capital-loss|  wrk_hrs_per_week|native_country|income|
+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|  count|             32561|       32561|             32561|        32561|            32561|         32561|            32561|       32561|              32561|  32561|             32561|           32561|             32561|         32561| 32561|
|   mean| 38.58164675532

In [40]:
df.describe('capital-gain').show()

+-------+------------------+
|summary|      capital-gain|
+-------+------------------+
|  count|             32561|
|   mean|1077.6488437087312|
| stddev| 7385.292084840354|
|    min|                 0|
|    max|             99999|
+-------+------------------+



In [43]:
df.drop('education_num').columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'wrk_hrs_per_week',
 'native_country',
 'income']

In [44]:
df.filter(df.age > 40).count()

13443

In [47]:
df.groupby('education').agg({'capital-gain': 'mean'}).show()

+-------------+------------------+
|    education| avg(capital-gain)|
+-------------+------------------+
|  Prof-school|10414.416666666666|
|         10th|404.57449088960345|
|      7th-8th|233.93962848297213|
|      5th-6th|176.02102102102103|
|   Assoc-acdm| 640.3992502343018|
|    Assoc-voc| 715.0513748191028|
|      Masters| 2562.563551944283|
|         12th| 284.0877598152425|
|    Preschool| 898.3921568627451|
|          9th|342.08949416342415|
|    Bachelors| 1756.299533146592|
|    Doctorate| 4770.145278450364|
|      HS-grad|  576.800114274831|
|         11th|215.09787234042554|
| Some-college| 598.8241667809629|
|      1st-4th|           125.875|
+-------------+------------------+



In [48]:
# Data preprocessing

In [53]:
from pyspark.sql.functions import col

age_square = df.select(col("age")**2)

# 2 Apply the transformation and add it to the DataFrame

df = df.withColumn("age_square", col("age")**2)

df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- wrk_hrs_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- age_square: double (nullable = true)



In [59]:
COLUMNS = ['age', 'age_square','education', 'education_num', 
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'wrk_hrs_per_week', 'native_country']

df = df.select(COLUMNS)

df.first()

Row(age=39, age_square=1521.0, education=' Bachelors', education_num=13, occupation=' Adm-clerical', relationship=' Not-in-family', race=' White', sex=' Male', capital-gain=2174, capital-loss=0, wrk_hrs_per_week=40, native_country=' United-States')

In [60]:
df.filter(df.native_country == 'Holand-Netherlands').count()



0

In [67]:
from pyspark.sql.functions import col, asc,desc

In [71]:
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+



In [72]:
df.groupby('department').agg({'department': 'count'}).sort(asc("count(department)")).show()

+----------+-----------------+
|department|count(department)|
+----------+-----------------+
|   Finance|                1|
|     Sales|                3|
+----------+-----------------+

