In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Obtain a Spark session for this cell
spark = (
    SparkSession.builder
    .appName("DataFrame Operations")
    .getOrCreate()
)

# Sample data: people (id, name, age) and a city for each id
df1 = spark.createDataFrame(
    [(1, 'John', 25), (2, 'Alice', 30), (3, 'Bob', 28)],
    schema=['id', 'name', 'age'],
)
df2 = spark.createDataFrame(
    [(1, 'New York'), (2, 'Los Angeles'), (3, 'Chicago')],
    schema=['id', 'city'],
)

# Join: combine the two frames on their shared 'id' column
joined_df = df1.join(df2, on='id')

# GroupBy and aggregation: mean age per name, exposed as 'average_age'
grouped_df = (
    df1
    .groupBy('name')
    .agg(avg('age').alias('average_age'))
)

# Dropping: same frame without the 'age' column
df1_drop = df1.drop('age')

# Sorting: rows ordered by ascending 'age'
sorted_df1 = df1.orderBy('age')

# Print each label followed by the corresponding result table
for label, frame in (
    ("Join", joined_df),
    ("Group By and Aggregations", grouped_df),
    ("Drop", df1_drop),
    ("Sort", sorted_df1),
):
    print(label)
    frame.show()




Join
+---+-----+---+-----------+
| id| name|age|       city|
+---+-----+---+-----------+
|  1| John| 25|   New York|
|  2|Alice| 30|Los Angeles|
|  3|  Bob| 28|    Chicago|
+---+-----+---+-----------+

Group By and Aggregations
+-----+-----------+
| name|average_age|
+-----+-----------+
| John|       25.0|
|Alice|       30.0|
|  Bob|       28.0|
+-----+-----------+

Drop
+---+-----+
| id| name|
+---+-----+
|  1| John|
|  2|Alice|
|  3|  Bob|
+---+-----+

Sort
+---+-----+---+
| id| name|age|
+---+-----+---+
|  1| John| 25|
|  3|  Bob| 28|
|  2|Alice| 30|
+---+-----+---+



In [0]:
import pandas as pd

# Base table: one row per person with age and city
data = dict(
    Name=['John', 'Alice', 'Bob'],
    Age=[25, 30, 28],
    City=['New York', 'Los Angeles', 'Chicago'],
)
df = pd.DataFrame(data)

# Second table keyed by the same names, carrying a salary per person
data2 = dict(
    Name=['John', 'Alice', 'Bob'],
    Salary=[50000, 60000, 55000],
)
df2 = pd.DataFrame(data2)

# Joining: match rows of both tables on the 'Name' column
merged_df = df.merge(df2, on='Name')

# GroupBy and aggregation: mean of 'Age' for each 'Name'
grouped_df = df.groupby('Name')[['Age']].mean()

# Dropping: copy of the base table without the 'Age' column
df_dropped = df.drop(columns='Age')

# Sorting: base table ordered by ascending 'Age'
sorted_df = df.sort_values('Age')

# Print each label followed by the corresponding result
for label, frame in (
    ("Merged Function Result - ", merged_df),
    ("Group BY and Aggregation Function Result -", grouped_df),
    ("Drop Function Result - ", df_dropped),
    ("Sort Function Result - ", sorted_df),
):
    print(label)
    print(frame)


Merged Function Result - 
    Name  Age         City  Salary
0   John   25     New York   50000
1  Alice   30  Los Angeles   60000
2    Bob   28      Chicago   55000
Group BY and Aggregation Function Result -
        Age
Name       
Alice  30.0
Bob    28.0
John   25.0
Drop Function Result - 
    Name         City
0   John     New York
1  Alice  Los Angeles
2    Bob      Chicago
Sort Function Result - 
    Name  Age         City
0   John   25     New York
2    Bob   28      Chicago
1  Alice   30  Los Angeles


In [0]:
from pyspark.sql import SparkSession
# Obtain a Spark session for this cell
spark = (
    SparkSession.builder
    .appName("SparkSQL Joins")
    .getOrCreate()
)

# Sample data: people (id, name, age) and a city for each id
df1 = spark.createDataFrame(
    [(1, 'John', 25), (2, 'Alice', 30), (3, 'Bob', 28)],
    schema=['id', 'name', 'age'],
)
df2 = spark.createDataFrame(
    [(1, 'New York'), (2, 'Los Angeles'), (3, 'Chicago')],
    schema=['id', 'city'],
)

# Expose both frames to Spark SQL under temporary view names
for view_name, frame in (("df1_view", df1), ("df2_view", df2)):
    frame.createOrReplaceTempView(view_name)

# SQL join: every column of df1_view plus the city of the row with the same id
joined_df = spark.sql(
    "SELECT df1_view.*, df2_view.city "
    "FROM df1_view "
    "JOIN df2_view ON df1_view.id = df2_view.id"
)

# Show the joined rows
joined_df.show()




+---+-----+---+-----------+
| id| name|age|       city|
+---+-----+---+-----------+
|  1| John| 25|   New York|
|  2|Alice| 30|Los Angeles|
|  3|  Bob| 28|    Chicago|
+---+-----+---+-----------+



In [0]:
import pandas as pd

# Two-column sample table: a name and an age per row
data = dict(
    Name=['John', 'Alice', 'Bob'],
    Age=[25, 30, 28],
)
df = pd.DataFrame(data)

# Define a custom function
def add_suffix(name: str, suffix: str = "_suffix") -> str:
    """Return ``name`` with ``suffix`` appended.

    Args:
        name: The string to extend.
        suffix: Text appended to ``name``. Defaults to ``"_suffix"``, so
            single-argument calls (e.g. via ``Series.apply``) behave exactly
            as before the parameter existed.

    Returns:
        The concatenation ``name + suffix``.

    Raises:
        TypeError: If ``name`` cannot be concatenated with a ``str`` using
            ``+`` (for example ``None`` or a number).
    """
    return name + suffix

# Run add_suffix over every entry of the 'Name' column, then store the result back
suffixed_names = df['Name'].apply(add_suffix)
df['Name'] = suffixed_names

# Show the updated table
print(df)




           Name  Age
0   John_suffix   25
1  Alice_suffix   30
2    Bob_suffix   28
