### Create a dataframe

In [0]:
import datetime
from pyspark.sql import Row

# Column names, listed in the same order as the positional values in each
# record tuple below.
attributes = [
    "user_id",
    "name",
    "age",
    "gender",
    "city",
    "occupation",
    "education",
    "weight_in_kg",
    "time_of_birth",
    "date_of_joining",
]

# Each tuple holds one user's values; zipping it with `attributes` yields a
# dict whose keys follow the column order above.
users = [
    dict(zip(attributes, values))
    for values in (
        (1, "John", 25, "Male", "New York", "Engineer", "Bachelor's", 70.5,
         datetime.datetime(1995, 5, 10, 8, 30), datetime.date(2020, 1, 1)),
        (2, "Jane", 30, "Female", "San Francisco", "Doctor", "Master's", 65.2,
         datetime.datetime(1990, 8, 15, 12, 0), datetime.date(2018, 6, 15)),
        (3, "Mike", 35, "Male", "Chicago", "Teacher", "PhD", 80.7,
         datetime.datetime(1985, 3, 20, 10, 45), datetime.date(2015, 3, 1)),
        (4, "Emily", 28, "Female", "Los Angeles", "Lawyer", "Bachelor's", 60.9,
         datetime.datetime(1992, 11, 5, 9, 15), datetime.date(2019, 9, 10)),
        (5, "David", 32, "Male", "Seattle", "Software Engineer", "Master's", 75.3,
         datetime.datetime(1988, 7, 25, 14, 30), datetime.date(2017, 4, 5)),
    )
]

# Turn every user dict into a Row by passing its items as keyword arguments.
users_as_spark_rows = [Row(**record) for record in users]

# `spark` is not defined in this cell; it is presumably the SparkSession
# supplied by the notebook runtime. No explicit schema is passed, so the
# column types are left for Spark to work out from the Row values.
df = spark.createDataFrame(users_as_spark_rows)

# Print the DataFrame contents as a text table.
df.show()

+-------+-----+---+------+-------------+-----------------+----------+------------+-------------------+---------------+
|user_id| name|age|gender|         city|       occupation| education|weight_in_kg|      time_of_birth|date_of_joining|
+-------+-----+---+------+-------------+-----------------+----------+------------+-------------------+---------------+
|      1| John| 25|  Male|     New York|         Engineer|Bachelor's|        70.5|1995-05-10 08:30:00|     2020-01-01|
|      2| Jane| 30|Female|San Francisco|           Doctor|  Master's|        65.2|1990-08-15 12:00:00|     2018-06-15|
|      3| Mike| 35|  Male|      Chicago|          Teacher|       PhD|        80.7|1985-03-20 10:45:00|     2015-03-01|
|      4|Emily| 28|Female|  Los Angeles|           Lawyer|Bachelor's|        60.9|1992-11-05 09:15:00|     2019-09-10|
|      5|David| 32|  Male|      Seattle|Software Engineer|  Master's|        75.3|1988-07-25 14:30:00|     2017-04-05|
+-------+-----+---+------+-------------+--------

### printSchema

In [0]:
# Print the DataFrame's schema as a tree: one line per column showing its
# name, data type and nullability.
df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- city: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- education: string (nullable = true)
 |-- weight_in_kg: double (nullable = true)
 |-- time_of_birth: timestamp (nullable = true)
 |-- date_of_joining: date (nullable = true)



### Show the data type of each column in the DataFrame

In [0]:
# `dtypes` yields a list of (column name, type string) pairs. Being the last
# expression in the cell, its value is rendered as the cell output.
# Note: the integer columns are reported as 'bigint' here, whereas
# printSchema() labels the same columns 'long'.
df.dtypes

[('user_id', 'bigint'),
 ('name', 'string'),
 ('age', 'bigint'),
 ('gender', 'string'),
 ('city', 'string'),
 ('occupation', 'string'),
 ('education', 'string'),
 ('weight_in_kg', 'double'),
 ('time_of_birth', 'timestamp'),
 ('date_of_joining', 'date')]

In [0]:
# Show the built-in documentation for DataFrame.withColumn (signature,
# parameters, return value and usage notes).
help(df.withColumn)

Help on method withColumn in module pyspark.sql.dataframe:

withColumn(colName: str, col: pyspark.sql.column.Column) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by adding a column or replacing the
    existing column that has the same name.
    
    The column expression must be an expression over this :class:`DataFrame`; attempting to add
    a column from some other :class:`DataFrame` will raise an error.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    colName : str
        string, name of the new column.
    col : :class:`Column`
        a :class:`Column` expression for the new column.
    
    Returns
    -------
    :class:`DataFrame`
        DataFrame with new or replaced column.
    
    Notes
    -----
    This method introduces a projection internally. Therefore, calling it multiple
    times, for instance, via loops in order to 

### col

In [0]:
from pyspark.sql.functions import col
# Show the documentation for col(), which returns a Column for the given
# column name.
help(col)

Help on function col in module pyspark.sql.functions.builtin:

col(col: str) -> pyspark.sql.column.Column
    Returns a :class:`~pyspark.sql.Column` based on the given column name.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    col : str
        the name for the column
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        the corresponding column instance.
    
    Examples
    --------
    >>> col('x')
    Column<'x'>
    >>> column('x')
    Column<'x'>



In [0]:
# Keep four columns, then add `weight` as a copy of `weight_in_kg`
# (referenced through col()) and print the result.
(
    df.select('user_id', 'name', 'age', 'weight_in_kg')
    .withColumn('weight', col('weight_in_kg'))
    .show()
)

+-------+-----+---+------------+------+
|user_id| name|age|weight_in_kg|weight|
+-------+-----+---+------------+------+
|      1| John| 25|        70.5|  70.5|
|      2| Jane| 30|        65.2|  65.2|
|      3| Mike| 35|        80.7|  80.7|
|      4|Emily| 28|        60.9|  60.9|
|      5|David| 32|        75.3|  75.3|
+-------+-----+---+------------+------+



### withColumnRenamed

In [0]:
# Rename two columns one at a time by chaining withColumnRenamed calls.
(
    df.select('user_id', 'name', 'age')
    .withColumnRenamed('name', 'user_name')
    .withColumnRenamed('age', 'user_age')
    .show()
)

+-------+---------+--------+
|user_id|user_name|user_age|
+-------+---------+--------+
|      1|     John|      25|
|      2|     Jane|      30|
|      3|     Mike|      35|
|      4|    Emily|      28|
|      5|    David|      32|
+-------+---------+--------+



### withColumnsRenamed

In [0]:
# Rename several columns in a single call by passing an
# {existing name: new name} mapping to withColumnsRenamed.
(
    df.select('user_id', 'name', 'age')
    .withColumnsRenamed(
        {
            'name': 'user_name',
            'age': 'user_age',
        }
    )
    .show()
)

+-------+---------+--------+
|user_id|user_name|user_age|
+-------+---------+--------+
|      1|     John|      25|
|      2|     Jane|      30|
|      3|     Mike|      35|
|      4|    Emily|      28|
|      5|    David|      32|
+-------+---------+--------+



### lit and concat

In [0]:
from pyspark.sql.functions import lit, col, concat

# Add a constant column: lit(7) supplies the same literal value for every row.
(
    df.withColumn('literal_value', lit(7))
    .show()
)

# Combine two existing columns into one string column, separated by a
# literal ', ' (a plain string passed to concat is treated as a column name,
# so the separator has to be wrapped in lit()).
(
    df.withColumn(
        'name_and_occupation',
        concat('name', lit(', '), 'occupation'),
    )
    .show()
)

+-------+-----+---+------+-------------+-----------------+----------+------------+-------------------+---------------+-------------+
|user_id| name|age|gender|         city|       occupation| education|weight_in_kg|      time_of_birth|date_of_joining|literal_value|
+-------+-----+---+------+-------------+-----------------+----------+------------+-------------------+---------------+-------------+
|      1| John| 25|  Male|     New York|         Engineer|Bachelor's|        70.5|1995-05-10 08:30:00|     2020-01-01|            7|
|      2| Jane| 30|Female|San Francisco|           Doctor|  Master's|        65.2|1990-08-15 12:00:00|     2018-06-15|            7|
|      3| Mike| 35|  Male|      Chicago|          Teacher|       PhD|        80.7|1985-03-20 10:45:00|     2015-03-01|            7|
|      4|Emily| 28|Female|  Los Angeles|           Lawyer|Bachelor's|        60.9|1992-11-05 09:15:00|     2019-09-10|            7|
|      5|David| 32|  Male|      Seattle|Software Engineer|  Master's|