In [92]:
import pyspark
import sys 
from pyspark import SparkContext
from pyspark import SparkConf

## Using Filter and withColumn on a Dataframe

In [93]:
import random

from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Small in-memory sample: (id, name) pairs.
people_rows = [
    (1, "Alice"),
    (2, "Bob"),
    (3, "Carol"),
    (4, "Aarons"),
    (5, "Ave"),
]
df = spark.createDataFrame(people_rows, ["id", "name"])
df.show()

# Keep only the rows whose name begins with a capital "A".
df1 = df.where(df.name.startswith('A'))
df1.show()

# The offset is drawn once in Python (unseeded), so every row gets the
# same value subtracted and the "age" column can differ between runs.
age_offset = random.randint(2, 6)
df2 = df.withColumn("age", df.id * 10 - age_offset)
df2.show()



+---+------+
| id|  name|
+---+------+
|  1| Alice|
|  2|   Bob|
|  3| Carol|
|  4|Aarons|
|  5|   Ave|
+---+------+

+---+------+
| id|  name|
+---+------+
|  1| Alice|
|  4|Aarons|
|  5|   Ave|
+---+------+

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Alice|  7|
|  2|   Bob| 17|
|  3| Carol| 27|
|  4|Aarons| 37|
|  5|   Ave| 47|
+---+------+---+



## Count on a Dataframe

In [106]:
# Reuse the active session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Point `df` at the frame that carries the "age" column and display it.
df = df2
df.show()

# count() is a Spark action; run it once and report the number of rows.
row_count = df.count()
print(f'Count = {row_count}')

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Alice|  7|
|  2|   Bob| 17|
|  3| Carol| 27|
|  4|Aarons| 37|
|  5|   Ave| 47|
+---+------+---+

Count = 5


## Sum and Average on a Dataframe

In [95]:
from pyspark.sql import functions as f

# Reuse the active session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

df2.show()

# Aggregate the "age" column by name in a single Spark job.
# - Selecting by name avoids depending on the position of "age" among the
#   numeric columns (a bare groupBy().sum() also sums "id").
# - Results are stored under non-built-in names so Python's built-in
#   `sum` is not rebound for the rest of the kernel session.
age_stats = df2.agg(
    f.sum("age").alias("age_sum"),
    f.avg("age").alias("age_avg"),
).first()

age_sum = age_stats["age_sum"]

print(f'Sum of Ages: {age_sum}')

avg = age_stats["age_avg"]

print(f'Average of Ages: {avg}')


+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Alice|  7|
|  2|   Bob| 17|
|  3| Carol| 27|
|  4|Aarons| 37|
|  5|   Ave| 47|
+---+------+---+

Sum of Ages: 135
Average of Ages: 27.0


## Write PySpark Dataframe to CSV File

In [96]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

df_final = df2

df_final.show()

# NOTE(review): absolute, machine-specific path; consider making it
# configurable. Spark writes a directory of part files at this location,
# not a single file named output.csv.
output_path = "/home/lplab/Desktop/210962021/Lab2/output.csv"

# repartition(1) funnels all rows into one partition so a single part file
# is produced. mode="overwrite" makes the cell re-runnable: with the default
# save mode the write raises once the output path already exists.
df_final.repartition(1).write.csv(output_path, header=True, mode="overwrite")

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Alice|  7|
|  2|   Bob| 17|
|  3| Carol| 27|
|  4|Aarons| 37|
|  5|   Ave| 47|
+---+------+---+



## Implement Word Count Program

In [103]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Reuse the active session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Each line of the text file becomes one row in a single "value" column.
lines = spark.read.text("wordcount.txt")

# Tokenise on runs of whitespace rather than a single space, then drop the
# empty tokens that blank lines or leading/trailing whitespace produce, so
# the empty string is never counted as a word.
words = (
    lines
    .withColumn('word', f.explode(f.split(f.col('value'), r'\s+')))
    .filter(f.col('word') != '')
)

# One row per distinct word with its number of occurrences.
word_counts = words.groupBy('word').agg(f.count('word').alias('count'))

# Show every row (not just the default first 20) without truncating values.
word_counts.show(word_counts.count(), truncate=False)


+----------+-----+
|word      |count|
+----------+-----+
|Processing|1    |
|example   |5    |
|count     |4    |
|data      |1    |
|Hello     |1    |
|word      |1    |
|sample    |1    |
|PySpark   |4    |
|world     |3    |
|text      |2    |
|Word      |1    |
+----------+-----+

