# Spark Dataframes and Operations Code

## Create Dataframe Operations

In [None]:
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row, SparkSession

# Obtain the active SparkSession (creating one if none exists) so this first
# cell runs on its own instead of relying on a pre-defined `spark` global.
spark = SparkSession.builder.getOrCreate()

# Build a DataFrame from Row objects. No schema is passed, so Spark infers the
# column types from the Python values (int, float, str, date, datetime).
data_df = spark.createDataFrame([
    Row(col_1=100, col_2=200., col_3='string_test_1', col_4=date(2023, 1, 1), col_5=datetime(2023, 1, 1, 12, 0)),
    Row(col_1=200, col_2=300., col_3='string_test_2', col_4=date(2023, 2, 1), col_5=datetime(2023, 1, 2, 12, 0)),
    Row(col_1=400, col_2=500., col_3='string_test_3', col_4=date(2023, 3, 1), col_5=datetime(2023, 1, 3, 12, 0))
])


In [None]:
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row

# DDL-style schema string: column names and Spark SQL types are declared
# explicitly instead of being inferred from the data.
explicit_schema = ' col_1 long, col_2 double, col_3 string, col_4 date, col_5 timestamp'

# Sample records, one Row per record.
typed_rows = [
    Row(col_1=100, col_2=200., col_3='string_test_1', col_4=date(2023, 1, 1), col_5=datetime(2023, 1, 1, 12, 0)),
    Row(col_1=200, col_2=300., col_3='string_test_2', col_4=date(2023, 2, 1), col_5=datetime(2023, 1, 2, 12, 0)),
    Row(col_1=400, col_2=500., col_3='string_test_3', col_4=date(2023, 3, 1), col_5=datetime(2023, 1, 3, 12, 0)),
]

data_df = spark.createDataFrame(typed_rows, schema=explicit_schema)


In [None]:
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row

# Sample data laid out column by column (column name -> list of values).
column_values = {
    'col_1': [100, 200, 400],
    'col_2': [200., 300., 500.],
    'col_3': ['string_test_1', 'string_test_2', 'string_test_3'],
    'col_4': [date(2023, 1, 1), date(2023, 2, 1), date(2023, 3, 1)],
    'col_5': [datetime(2023, 1, 1, 12, 0), datetime(2023, 1, 2, 12, 0), datetime(2023, 1, 3, 12, 0)],
}
pandas_df = pd.DataFrame(column_values)

# Hand the pandas DataFrame to Spark to obtain a Spark DataFrame.
df = spark.createDataFrame(pandas_df)


In [None]:
from datetime import datetime, date
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Column names only; the column types are left for Spark to infer.
column_names = ['col_1', 'col_2', 'col_3', 'col_4', 'col_5']

# Sample records as plain tuples, so no sparkContext is needed.
data = [
    (100, 200., 'string_test_1', date(2023, 1, 1), datetime(2023, 1, 1, 12, 0)),
    (200, 300., 'string_test_2', date(2023, 2, 1), datetime(2023, 1, 2, 12, 0)),
    (300, 400., 'string_test_3', date(2023, 3, 1), datetime(2023, 1, 3, 12, 0)),
]

# Create the DataFrame directly from the list of tuples.
data_df = spark.createDataFrame(data, schema=column_names)

# Display the DataFrame.
data_df.show()

## How to View the Dataframes

In [None]:
# Print the DataFrame as a text table (show() prints the first 20 rows by default).
data_df.show()

In [None]:
# Print only the first 2 rows.
data_df.show(2)

In [None]:
# Print the schema (column names and types) as a tree.
data_df.printSchema()

In [None]:
# Print the first row vertically, one line per column; handy for wide rows.
data_df.show(1, vertical=True)

In [None]:
# List of column names; as the cell's last expression it is echoed by the notebook.
data_df.columns

In [None]:
# Number of rows in the DataFrame.
data_df.count()

In [None]:
# Summary statistics for the three selected columns, computed by describe().
summary_df = data_df.select('col_1', 'col_2', 'col_3').describe()
summary_df.show()

## Collecting the data

In [None]:
# Return every row to the driver as a list of Row objects.
# NOTE: this pulls the whole dataset into driver memory; use only on small DataFrames.
data_df.collect()

In [None]:
# Return the first row as a one-element list of Row objects.
data_df.take(1)

In [None]:
# Return the last row as a one-element list of Row objects.
data_df.tail(1)

In [None]:
# Return the first row as a one-element list (head(n) with an explicit n returns a list).
data_df.head(1)

## Converting a PySpark DataFrame to a Pandas DataFrame

In [None]:
# Convert to a pandas DataFrame.
# NOTE: like collect(), this loads all rows into driver memory.
data_df.toPandas()

## How to do Data Manipulation - Rows and Columns

In [None]:
from pyspark.sql import Column

# Select one column, referenced through bracket access on the DataFrame,
# and print the resulting single-column DataFrame.
col_3_only = data_df.select(data_df["col_3"])
col_3_only.show()


In [None]:
from pyspark.sql import functions as F

# Add a new column "col_6" holding the constant string "A" in every row.
constant_a = F.lit("A")
data_df = data_df.withColumn("col_6", constant_a)

data_df.show()


In [None]:
# Remove the timestamp column "col_5"; drop() returns a new DataFrame, so rebind data_df.
data_df = data_df.drop("col_5")
data_df.show()


In [None]:
# Replace col_2 with col_2 / 100 for display only; the result is not
# assigned back, so data_df keeps its original values.
scaled_col_2 = F.col("col_2") / 100
data_df.withColumn("col_2", scaled_col_2).show()

In [None]:
# Rename "col_3" to "string_col"; later cells refer to the new name.
data_df = data_df.withColumnRenamed("col_3", "string_col")
data_df.show()


In [None]:
# Show the distinct values present in col_6.
data_df.select("col_6").distinct().show()

In [None]:
# Count how many distinct values col_6 holds and label the result "Total_Unique".
unique_count = F.countDistinct("col_6").alias("Total_Unique")
data_df.select(unique_count).show()

In [None]:
from pyspark.sql.functions import upper

# Derive an upper-cased copy of string_col as a new column (display only;
# data_df is not reassigned).
uppercased = upper(data_df["string_col"])
data_df.withColumn('upper_string_col', uppercased).show()


In [None]:
# Keep only the rows where col_1 equals 100.
col_1_is_100 = data_df["col_1"] == 100
data_df.filter(col_1_is_100).show()

In [None]:
# Combine two conditions with & (logical AND): both must hold for a row to be kept.
col_1_is_100 = data_df["col_1"] == 100
col_6_is_a = data_df["col_6"] == 'A'
data_df.filter(col_1_is_100 & col_6_is_a).show()


In [None]:
# Combine two conditions with | (logical OR): a row is kept if either holds.
col_1_is_100 = data_df["col_1"] == 100
col_2_is_300 = data_df["col_2"] == 300.00
data_df.filter(col_1_is_100 | col_2_is_300).show()


In [None]:
# Keep the rows whose col_1 is one of the allowed values.
# The variable is deliberately not called `list`: that would shadow the
# built-in and break any later `list(...)` call in the same session.
allowed_values = [100, 200]
data_df.filter(data_df.col_1.isin(allowed_values)).show()


In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, BooleanType, DateType, IntegerType

# Cast col_4 to string and col_1 to integer. withColumn() returns a new
# DataFrame, so the result is kept in data_df_2 and data_df is unchanged.
data_df_2 = (
    data_df
    .withColumn("col_4", col("col_4").cast(StringType()))
    .withColumn("col_1", col("col_1").cast(IntegerType()))
)
data_df_2.printSchema()
# Show the cast result; previously this displayed the uncast data_df.
data_df_2.show()



In [None]:
# Cast using SQL expressions: each string is "cast(<column> as <type>) <alias>".
# Only the two listed columns are kept in the result.
cast_expressions = [
    "cast(col_4 as date) col_4",
    "cast(col_1 as long) col_1",
]
data_df_3 = data_df_2.selectExpr(*cast_expressions)
data_df_3.printSchema()


In [None]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
data_df_3.createOrReplaceTempView("CastExample")

# Cast through SQL function-style casts on the view's columns.
cast_query = "SELECT DOUBLE(col_1), DATE(col_4) from CastExample"
data_df_4 = spark.sql(cast_query)

data_df_4.printSchema()
data_df_4.show(truncate=False)


In [None]:
# (Employee, Department, Salary) records. Two records have no department
# (None) and the ("Michael", "Field-eng", 4500) record appears twice, which
# gives the null-handling and de-duplication cells something to work on.
salary_rows = [
    ("John", "Field-eng", 3500),
    ("Michael", "Field-eng", 4500),
    ("Robert", None, 4000),
    ("Maria", "Finance", 3500),
    ("John", "Sales", 3000),
    ("Kelly", "Finance", 3500),
    ("Kate", "Finance", 3000),
    ("Martin", None, 3500),
    ("Kiran", "Sales", 2200),
    ("Michael", "Field-eng", 4500),
]
columns = ["Employee", "Department", "Salary"]

# Column names are supplied; column types are inferred by Spark.
salary_data = spark.createDataFrame(salary_rows, columns)

salary_data.printSchema()
salary_data.show()


In [None]:
# Show the data with rows containing a null dropped (here: the rows whose Department is None).
# salary_data itself is not modified.
salary_data.dropna().show()

In [None]:
# dropDuplicates() returns a new DataFrame without exact duplicate rows.
# show() only prints and returns None, so keep the DataFrame in the variable
# and display it in a separate statement.
new_salary_data = salary_data.dropDuplicates()
new_salary_data.show()

## Using Aggregates in a DataFrame

In [None]:
from pyspark.sql.functions import countDistinct, avg

# Average of the Salary column across all rows.
average_salary = avg('Salary')
salary_data.select(average_salary).show()


In [None]:
# Aggregate via a {column: function} dict: count the Salary values.
salary_data.agg({'Salary':'count'}).show()

In [None]:
# Number of distinct Salary values, labelled "Distinct Salary" in the output.
distinct_salary_count = countDistinct("Salary").alias("Distinct Salary")
salary_data.select(distinct_salary_count).show()

In [None]:
# Highest Salary value.
salary_data.agg({'Salary':'max'}).show() 

In [None]:
# Total of all Salary values.
salary_data.agg({'Salary':'sum'}).show()

In [None]:
# Sort rows by Salary (orderBy sorts ascending by default).
salary_data.orderBy("Salary").show()

In [None]:
# Sort rows by Salary from highest to lowest.
salary_descending = salary_data["Salary"].desc()
salary_data.orderBy(salary_descending).show()