In [1]:
import os
import sys
from pyspark import SparkContext
# Set both PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON to the interpreter that is
# running this kernel (sys.executable).
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [2]:
from pyspark.sql import SparkSession

# Get the existing session or create a new one named 'df-project', and keep a
# handle on its SparkContext for the RDD examples further down.
spark = (
    SparkSession.builder
    .appName('df-project')
    .getOrCreate()
)
sc = spark.sparkContext

# Show both objects, one per line.
print(spark, sc, sep='\n')

<pyspark.sql.session.SparkSession object at 0x00000209E10F9540>
<SparkContext master=local[*] appName=df-project>


In [3]:
# Print the session object again; the recorded output shows the same object
# address as the cell that created it.
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000209E10F9540>


In [16]:
# Version of Spark this session is running on (displayed as the cell result).
spark.version

'3.4.1'

In [13]:
# spark.range(start, end, step) builds a DataFrame with a single `id` column.
# `end` is exclusive, so start=1, end=10, step=2 yields 1, 3, 5, 7, 9.
df = spark.range(1, 10, 2)
df.show()

+---+
| id|
+---+
|  1|
|  3|
|  5|
|  7|
|  9|
+---+



In [14]:
# With only an end value the range starts at 0, so this DataFrame holds the
# ids 0 through 19 (20 itself is excluded).
df = spark.range(0, 20)
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [16]:
# Number of partitions when numPartitions is not passed to range()
df = spark.range(1, 20, 2)
df.show()
print(df.rdd.getNumPartitions())

# Number of partitions when numPartitions is passed explicitly
df = spark.range(1, 20, 2, numPartitions=6)
df.show()
print(df.rdd.getNumPartitions())

+---+
| id|
+---+
|  1|
|  3|
|  5|
|  7|
|  9|
| 11|
| 13|
| 15|
| 17|
| 19|
+---+

4
+---+
| id|
+---+
|  1|
|  3|
|  5|
|  7|
|  9|
| 11|
| 13|
| 15|
| 17|
| 19|
+---+

6


In [20]:
lst = [
    ("Ram", 23),
    ("sham", 34),
]

# 1) List of tuples with no schema: the columns come out as _1, _2.
print('Creating DF without the schema')
df = spark.createDataFrame(lst)
df.show()

# 2) List of tuples plus a list of column names: the names are applied and the
#    column types are inferred from the data.
print('Creating DF with the schema')
schema = ['Name', 'Age']
df = spark.createDataFrame(lst, schema=schema)
df.printSchema()
df.show()

# 3) Schema given as a DDL string: names and types are both stated explicitly
#    (Age is `int` here, instead of the inferred `long` above).
df = spark.createDataFrame(lst, schema='Name string, Age int')
df.printSchema()
df.show()

Creating DF without the schema
+----+---+
|  _1| _2|
+----+---+
| Ram| 23|
|sham| 34|
+----+---+

Creating DF with the schema
root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)

+----+---+
|Name|Age|
+----+---+
| Ram| 23|
|sham| 34|
+----+---+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)

+----+---+
|Name|Age|
+----+---+
| Ram| 23|
|sham| 34|
+----+---+



In [13]:
# Creating a DataFrame from a list of dicts; the dict keys are used as the
# column names.
# The list is called `records` rather than `dict` so that the built-in `dict`
# type is not shadowed for the rest of the kernel session.
records = [{"name": "Ram", 'age': 34}, {"name": "Sham", 'age': 35}]
df = spark.createDataFrame(records)
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+----+
|age|name|
+---+----+
| 34| Ram|
| 35|Sham|
+---+----+



In [23]:
# Creating a DataFrame from an RDD of tuples.
# No schema is supplied, so the columns are named _1 and _2.
lst = [
    ("Ram", 23),
    ("sham", 34),
]
rdd = sc.parallelize(lst)
df = spark.createDataFrame(data=rdd)
df.printSchema()
df.show()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+----+---+
|  _1| _2|
+----+---+
| Ram| 23|
|sham| 34|
+----+---+



In [24]:
# Creating a DataFrame from an RDD of Row objects.
# The Row field names (name, age) become the column names.
from pyspark.sql import Row

rdd = sc.parallelize([Row(name='ram', age=23) for _ in range(3)])
df = spark.createDataFrame(data=rdd)
df.show()
df.printSchema()

+----+---+
|name|age|
+----+---+
| ram| 23|
| ram| 23|
| ram| 23|
+----+---+

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [28]:
# Creating a Spark DataFrame from a pandas DataFrame
import pandas as pd

# Nested list: one inner list per row
lst = [
    ['ram', 23],
    ['sham', 45],
]
data_frame = pd.DataFrame(lst, columns=['Name', 'Age'])

# The pandas column names carry over to the Spark DataFrame.
df = spark.createDataFrame(data=data_frame)
df.printSchema()
df.show()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)

+----+---+
|Name|Age|
+----+---+
| ram| 23|
|sham| 45|
+----+---+



In [36]:
# Running SQL queries against a DataFrame registered as a view
lst = [('ram', 34), ('sham', 45)]
df = spark.createDataFrame(lst, schema=['Name', 'age'])

# Temporary view: queried by its bare name
df.createOrReplaceTempView("emp")
spark.sql(""" select * from emp where Name='ram' """).show()

# Global temporary view: queried through the `global_temp` database prefix
df.createOrReplaceGlobalTempView('emp_global')
spark.sql(""" select * from global_temp.emp_global where Name='ram' """).show()

+----+---+
|Name|age|
+----+---+
| ram| 34|
+----+---+

+----+---+
|Name|age|
+----+---+
| ram| 34|
+----+---+



In [37]:
# spark.table() returns a registered view as a DataFrame, with no SQL needed
lst = [('ram', 34), ('sham', 45)]
df = spark.createDataFrame(lst, schema=['Name', 'age'])
df.createOrReplaceTempView("emp")

# Read the "emp" view back and display every row
df1 = spark.table("emp")
df1.show()

+----+---+
|Name|age|
+----+---+
| ram| 34|
|sham| 45|
+----+---+

