In [1]:
import pyspark 
import pandas as pd 
import numpy as np

In [2]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start one named 'CDrop'.
spark = (
    SparkSession.builder
    .appName('CDrop')
    .getOrCreate()
)

In [3]:
# CSV file that is loaded into the Spark DataFrame `pf` further down.
# NOTE(review): bare file name, so where it resolves depends on the Spark /
# working-directory setup - confirm before running elsewhere.
filepath = "file2.csv"

In [4]:
# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data.
pf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(filepath)
)

In [7]:
# Print the loaded DataFrame as a text table to inspect the raw data,
# including the rows that contain nulls.
pf.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [8]:
# Print the column names, types and nullability of the loaded DataFrame.
pf.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [10]:
# drop() returns a new DataFrame without the Name column; `pf` itself is not
# modified (later cells still show Name). Only the first 3 rows are displayed.
pf.drop('Name').show(3)

+---+----------+------+
|age|Experience|Salary|
+---+----------+------+
| 31|        10| 30000|
| 30|         8| 25000|
| 29|         4| 20000|
+---+----------+------+
only showing top 3 rows



INDEXING ROWS

In [18]:
# take(3) asks Spark for just the first three rows. The previous
# collect()[0:3] pulled every row to the driver and then threw away
# all but three of them.
pf.take(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [28]:
# Fetch the first three rows once and reuse them; the previous version
# collected the entire DataFrame twice just to index a three-row slice.
first_three = pf.take(3)

# Takes the first row out of the rows returned
print(first_three[0])

# Takes the last row out of the rows returned
print(first_three[-1])

Row(Name='Krish', age=31, Experience=10, Salary=30000)
Row(Name='Sunny', age=29, Experience=4, Salary=20000)


In [41]:
# Materialise every row of `pf` as a Python list of Row objects on the driver;
# the loops in the following cells iterate over this list.
# NOTE(review): collect() loads the whole DataFrame into driver memory - fine
# for this small sample, risky for large data.
pfcollect = pf.collect()

In [48]:
# Print the Name and Salary of every collected row (nulls appear as None).
for record in pfcollect:
    name, salary = record['Name'], record['Salary']
    print('Name: ', name, ', ', 'Salary: ', salary)

Name:  Krish ,  Salary:  30000
Name:  Sudhanshu ,  Salary:  25000
Name:  Sunny ,  Salary:  20000
Name:  Paul ,  Salary:  20000
Name:  Harsha ,  Salary:  15000
Name:  Shubham ,  Salary:  18000
Name:  Mahesh ,  Salary:  40000
Name:  None ,  Salary:  38000
Name:  None ,  Salary:  None


In [52]:
# Demonstrates that comma-separated values assigned to one name form a tuple,
# not a formatted string: the type printed for every row is `tuple`.
for row in pfcollect:
    NameExp = ('Name: ', row['Name'], ', ', 'Salary: ', row['Salary'])
    print(type(NameExp))

<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>


In [58]:
# Show the untouched DataFrame again as a baseline for the null-handling
# cells that follow.
pf.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [59]:
# With default arguments, every row containing at least one null is removed.
pf.dropna().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [79]:
# Keep only rows that have at least 3 non-null values.
# `how` is deliberately omitted: when `thresh` is given it overrides `how`,
# so the previous how="any" had no effect and only suggested that any null
# would drop a row (the row with a null Name is in fact kept).
pf.na.drop(thresh=3).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     null| 34|        10| 38000|
+---------+---+----------+------+



In [87]:
# Show only the rows whose Experience value is missing.
pf.where(pf['Experience'].isNull()).show()

+------+----+----------+------+
|  Name| age|Experience|Salary|
+------+----+----------+------+
|Mahesh|null|      null| 40000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [144]:
from pyspark.ml.feature import Imputer 

# Median-impute missing values in the numeric columns. Each result is written
# to a new "<column>_inputed" column; the suffix spelling is kept as is because
# later cells rename these exact column names.
numeric_cols = ['age', 'Experience', 'Salary']
imputer = Imputer(
    strategy='median',
    inputCols=numeric_cols,
    outputCols=["{}_inputed".format(name) for name in numeric_cols],
)

In [145]:
# Fit the imputer on `pf`, then apply the fitted model to the same DataFrame
# to add the imputed output columns.
imputer_model = imputer.fit(pf)
new_pf = imputer_model.transform(pf)

In [146]:
# Display the original columns side by side with their imputed counterparts.
new_pf.show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_inputed|Experience_inputed|Salary_inputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|null|      null| 40000|         29|                 4|         40000|
|     null|  34|        10| 38000|         34|                10|         38000|
|     null|  36|      null|  null|         36|                 4|         20000|
+---------+----+----------+------+-----------+------------------+--------------+

In [147]:
# Drop the original numeric columns (which still contain nulls), leaving Name
# plus the three imputed columns.
cleanPf = new_pf.drop('age', 'Experience', 'Salary')

In [148]:
# Display the DataFrame after the original numeric columns were dropped.
cleanPf.show()

+---------+-----------+------------------+--------------+
|     Name|age_inputed|Experience_inputed|Salary_inputed|
+---------+-----------+------------------+--------------+
|    Krish|         31|                10|         30000|
|Sudhanshu|         30|                 8|         25000|
|    Sunny|         29|                 4|         20000|
|     Paul|         24|                 3|         20000|
|   Harsha|         21|                 1|         15000|
|  Shubham|         23|                 2|         18000|
|   Mahesh|         29|                 4|         40000|
|     null|         34|                10|         38000|
|     null|         36|                 4|         20000|
+---------+-----------+------------------+--------------+



In [158]:
# Give the imputed columns back their original names.
cleanPf = (
    cleanPf
    .withColumnRenamed('age_inputed', 'age')
    .withColumnRenamed('Experience_inputed', 'Experience')
    .withColumnRenamed('Salary_inputed', 'Salary')
)

In [160]:
# Display the final DataFrame: original column names, numeric nulls imputed.
cleanPf.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|   Mahesh| 29|         4| 40000|
|     null| 34|        10| 38000|
|     null| 36|         4| 20000|
+---------+---+----------+------+

