In [0]:
# List all available magic commands.
%lsmagic

In [0]:
# List the contents of dbfs:/Volumes.
# Same listing as the `%fs ls dbfs:/Volumes` magic, written as the explicit
# dbutils call so the cell is plain Python.
display(dbutils.fs.ls("dbfs:/Volumes"))

## Create a DataFrame from a list of tuples

In [0]:
# Build a small Spark DataFrame from in-memory rows: one (name, age) tuple per person.
data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)

# Register the DataFrame as the temporary view "temp_table" and read it back through SQL.
df.createOrReplaceTempView("temp_table")
sql_result = spark.sql("select * from temp_table")
display(sql_result)

# Render the DataFrame itself.
display(df)

# Print the column names and types.
df.printSchema()

# Project only the "Name" column.
names_only = df.select(df["Name"])
display(names_only)

# Keep only the rows whose Age is above 30.
over_thirty = df.where(df["Age"] > 30)
display(over_thirty)

# Count how many rows share each Age value.
age_counts = df.groupBy("Age").count()
display(age_counts)

### Some other ways to create DataFrames

In [0]:
# Other ways to obtain a Spark DataFrame.
#
# Each result is bound to its own name. Reassigning `df` here would replace the
# Name/Age DataFrame built earlier, and the cells that follow still read
# df.Age / the 'Age' column from it.
import pandas as pd

# From a pandas DataFrame
people = {'Name': ['Alice', 'Bob', 'Cathy'],
          'Age': [34, 45, 29]}
pdf = pd.DataFrame(people)
display(pdf)
spark_df = spark.createDataFrame(pdf)
display(spark_df)

# NOTE: the path strings and the table name below are placeholders.
# Replace them with real locations before running this cell.

# From a CSV file (first line is the header; column types are inferred)
csv_df = spark.read.csv("path of the csv file", header=True, inferSchema=True)

# From a JSON file
json_df = spark.read.json("path of the json file")

# From a Parquet file
parquet_df = spark.read.parquet("path of the parquet file")

# From a text file
text_df = spark.read.text("path of the text file")

# From an existing Hive table
table_df = spark.sql("select * from table_name")

# Create a temporary view from that DataFrame
table_df.createOrReplaceTempView("my_view")

## Some aggregate operations, both as functions and as SQL expressions

In [0]:
from pyspark.sql import functions as F

# Each entry pairs the two equivalent spellings of one aggregate over 'Age':
#   1. a Column built with pyspark.sql.functions, passed to df.agg(...)
#   2. the same aggregate as a SQL expression string, passed to df.selectExpr(...)
# The second element is None where only the function form is demonstrated.
age_aggregates = [
    # average
    (F.avg('Age').alias('Average_Age'), 'avg(Age) as Average_Age'),
    # sum
    (F.sum('Age').alias('Total_Age'), 'sum(Age) as Total_Age'),
    # minimum
    (F.min('Age').alias('Min_Age'), 'min(Age)as min_age'),
    # maximum
    (F.max('Age').alias('Max_Age'), 'max(Age) as max_age'),
    # standard deviation
    (F.stddev('Age').alias('Stddev_Age'), 'stddev(Age) as stddev_age'),
    # variance
    (F.variance('Age').alias('Variance_Age'), 'variance(Age) as variance_age'),
    # count of non-null Age values (not the raw row count)
    (F.count('Age').alias('Count_Age'), 'count(Age) as age_count'),
    # approximate number of distinct ages
    (F.approx_count_distinct('Age').alias('Approx_Count_Distinct_Age'), None),
    # first Age encountered (depends on row order)
    (F.first('Age').alias('First_Age'), 'first(Age) as first_age'),
    # last Age encountered (depends on row order)
    (F.last('Age').alias('Last_Age'), 'last(Age) as last_age'),
]

# Show the function form first, then the SQL-expression form, for every aggregate.
for column_form, sql_form in age_aggregates:
    df.agg(column_form).show()
    if sql_form is not None:
        df.selectExpr(sql_form).show()



### A few more operations

In [0]:
from pyspark.sql.functions import when, lit

# Derive a "Senior" flag: "yes" when Age is above 40, "no" otherwise.
senior_flag = when(df["Age"] > 40, "yes").otherwise("no")
df = df.withColumn("Senior", senior_flag)
display(df)

# Order the rows from oldest to youngest.
df_sorted = df.sort(df["Age"].desc())
display(df_sorted)

# Remove the "Senior" column again.
df = df.drop("Senior")
display(df)

## Some file-handling commands

In [0]:
# Walk through the dbutils.fs file operations inside one dedicated scratch
# directory. Every step acts on a file an earlier step created, and the final
# recursive delete removes only this directory; recursively deleting
# "file:/tmp/" itself would remove everything under /tmp, not just these files.
scratch_dir = "file:/tmp/dbutils_fs_demo"
empty_file = f"{scratch_dir}/empty.txt"
hello_file = f"{scratch_dir}/hello.txt"
greeting_file = f"{scratch_dir}/greeting.txt"
greeting_copy = f"{scratch_dir}/greeting_copy.txt"

# Create the scratch directory (and any missing parents)
dbutils.fs.mkdirs(scratch_dir)

# Create (touch) a new empty file; the third argument allows overwriting
dbutils.fs.put(empty_file, "", True)

# Write text to a file
dbutils.fs.put(hello_file, "Hello, Databricks!", True)

# List files in a directory
display(dbutils.fs.ls(scratch_dir))

# Read the contents of a file
print(dbutils.fs.head(hello_file))

# Move (rename) a file
dbutils.fs.mv(hello_file, greeting_file)

# Copy a file
dbutils.fs.cp(greeting_file, greeting_copy)

# Remove (delete) a file
dbutils.fs.rm(greeting_copy)

# Remove the scratch directory and its contents recursively
dbutils.fs.rm(scratch_dir, True)

## Some basic magic and shell commands

In [0]:
# Reference list of notebook magic and shell commands.
# Every command below is commented out, so running this cell as written does
# nothing; uncomment a single line to try it.

# List all available magic commands
#%lsmagic

# Display the current working directory
#%pwd

# List files in the current directory
#%ls

# Measure the execution time of a Python code snippet
#%timeit df.filter(df.Age > 30).count()

# Run a shell command
#!echo "Hello, Databricks!"

# Display the history of commands executed
#%history

In [0]:
# List the contents of dbfs:/databricks-datasets/COVID/.
# Same listing as the `%fs ls dbfs:/databricks-datasets/COVID/` magic, written
# as the explicit dbutils call so the cell is plain Python.
display(dbutils.fs.ls("dbfs:/databricks-datasets/COVID/"))