In [1]:
# Spark DataFrames: creating, inspecting, and converting them

In [8]:
from pyspark.sql import SparkSession

# Entry point for the DataFrame API, created under the application name "Test1".
spark = (
    SparkSession.builder
    .appName("Test1")
    .getOrCreate()
)

# SparkContext taken from the session; used below for the RDD API (sc.textFile).
sc = spark.sparkContext

In [None]:
# Load the iris CSV: first row is the header, column types are inferred,
# and fields are comma-separated.
df1 = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load("iris/iris.csv")
)

df1.show()

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [7]:
# Printing a DataFrame shows only its column names and types, not its rows;
# use df1.show() to see the data itself.
print(df1)

DataFrame[Sepal_Length: double, Sepal_Width: double, Petal_Length: double, Petal_Width: double, Species: string]


In [None]:
# Text file -> RDD -> DataFrame.
data = sc.textFile("students.txt")

# Split each line on commas, then flatten it into one tuple per record:
#   x[0][:4] -> id     (first four characters of the first field)
#   x[0][4:] -> name   (remainder of the first field)
#   x[1]     -> gender
#   x[2]     -> ":"-separated marks, unpacked into one field per subject
data = data.map(lambda line: line.split(","))
data = data.map(lambda x: (x[0][:4], x[0][4:], x[1], *x[2].split(":")))

# To peek at the parsed records, end a cell with data.take(n) rather than
# calling collect(), which pulls the entire RDD back to the driver.

# Convert to a DataFrame with explicit column names.
# This can also be done using .toDF().
# spark.createDataFrame(data) on its own creates the frame with default columns.
df2 = spark.createDataFrame(
    data,
    ["id", "name", "gender", "Maths", "physics", "english", "java"],
)

df2.show()

+----+------+------+-----+-------+-------+----+
|  id|  name|gender|Maths|physics|english|java|
+----+------+------+-----+-------+-------+----+
|8955| Tarun|  Male|   78|     50|     45|  25|
|8871|   Ali|  Male|  100|    100|     99|  98|
|8892|  Rafi|  Male|  100|     91|    100|  99|
|8912|  Uday|  Male|  100|     97|     99|  98|
|8910|Random|Female|   80|     40|     60|  54|
+----+------+------+-----+-------+-------+----+



In [None]:
# A better option is the col() function: it refers to a column as an
# expression, which is what lets us call .cast() on it.
from pyspark.sql.functions import col

# Keep id/name/gender as they are and cast the four subject columns to int.
df2 = df2.select(
    *[col(name) for name in ("id", "name", "gender")],
    *[col(name).cast("int") for name in ("Maths", "physics", "english", "java")],
)

# Rule of thumb:
#   - plain selection (e.g. df.name)        --> col() is not needed
#   - calculations / filtering / any logic  --> use col()

# The subject columns now carry proper integer types.
df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Maths: integer (nullable = true)
 |-- physics: integer (nullable = true)
 |-- english: integer (nullable = true)
 |-- java: integer (nullable = true)



In [14]:
# DataFrame -> RDD: turn every Row into a plain tuple and look at the first ten.
df1.rdd.map(lambda row: tuple(row)).take(10)

[(5.1, 3.5, 1.4, 0.2, 'setosa'),
 (4.9, 3.0, 1.4, 0.2, 'setosa'),
 (4.7, 3.2, 1.3, 0.2, 'setosa'),
 (4.6, 3.1, 1.5, 0.2, 'setosa'),
 (5.0, 3.6, 1.4, 0.2, 'setosa'),
 (5.4, 3.9, 1.7, 0.4, 'setosa'),
 (4.6, 3.4, 1.4, 0.3, 'setosa'),
 (5.0, 3.4, 1.5, 0.2, 'setosa'),
 (4.4, 2.9, 1.4, 0.2, 'setosa'),
 (4.9, 3.1, 1.5, 0.1, 'setosa')]

In [None]:
# Only the last expression of a cell is displayed automatically, so the
# dtypes are printed explicitly instead of being silently discarded.
print(df2.dtypes)  # (column name, type) pairs

df2.columns  # all column names

['id', 'name', 'gender', 'Maths', 'physics', 'english', 'java']

In [16]:
# Project just the id and name columns.
df2.select(["id", "name"]).show()

# Rename a column while selecting it: "Maths" is shown as "maths".
df2.select(df2["Maths"].alias("maths")).show()

+----+------+
|  id|  name|
+----+------+
|8955| Tarun|
|8871|   Ali|
|8892|  Rafi|
|8912|  Uday|
|8910|Random|
+----+------+

+-----+
|maths|
+-----+
|   78|
|  100|
|  100|
|  100|
|   80|
+-----+



In [19]:
# A StructType describes the schema of a row as a list of StructFields,
# each given here as (column name, data type, nullable flag).
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

empSchema = StructType(
    [
        StructField("CustomerId", IntegerType(), True),
        StructField("CustomerName", StringType(), True),
        StructField("CustomerLocation", StringType(), True),
    ]
)

print(empSchema)

StructType([StructField('CustomerId', IntegerType(), True), StructField('CustomerName', StringType(), True), StructField('CustomerLocation', StringType(), True)])


In [None]:
# Read a CSV using an explicitly defined schema.
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Same three nullable columns as before, built one field at a time.
empSchema = (
    StructType()
    .add("CustomerId", IntegerType(), True)
    .add("CustomerName", StringType(), True)
    .add("CustomerLocation", StringType(), True)
)

# The file has a header row; column types come from empSchema.
df = spark.read.csv("Cust_address.csv", schema=empSchema, header=True)

df.show()
df.dtypes

# df.drop("CustomerLocation") # drop column

+----------+------------+--------------------+
|CustomerId|CustomerName|    CustomerLocation|
+----------+------------+--------------------+
|      1002|        Aman|     1 Anthes Avenue|
|      1003|       Harsh|   87985 Linden Pass|
|      1004|       Ayush| 56 La Follette Pass|
|      1005|       Aditi|  8 Briar Crest Pass|
|      1006|      Anjali|    035 Iowa Terrace|
|      1007|     Shubham|    3925 Clove Drive|
|      1008|     Anushka|    9 Straubel Drive|
|      1009|       Rohit|   816 Northland Way|
|      1010|     Saurabh|    10165 Gerald Way|
|      1011|      Muskan|83 Merchant Junction|
|      1012|       Rahul|1249 Summerview Pass|
|      1013|     Utkarsh|   5 Bowman Junction|
|      1014|     Vaibhav|      3 Chinook Park|
|      1015|        Amit|   90535 Bonner Lane|
|      1016|      Saumya| 056 Straubel Avenue|
|      1017|     Rishabh| 70 Wayridge Parkway|
|      1018|      Shruti|     609 Truax Alley|
|      1019|    Himanshu|948 Marquette Circle|
|      1020| 

[('CustomerId', 'int'),
 ('CustomerName', 'string'),
 ('CustomerLocation', 'string')]