## Type of variable `spark`

In [0]:
# `spark` is not assigned anywhere in the visible cells; presumably the
# notebook runtime pre-defines it — confirm before running on a fresh kernel.
type(spark)

pyspark.sql.session.SparkSession

In [0]:
# Print the signature and docstring of the DataFrame factory method that the
# following cells call.
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
    or a :class:`numpy.ndarray`.
    
    .. versionadded:: 2.0.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    data : :class:`RDD` or iterable
        an RDD of any kind of SQL data representation (:class:`Row`,
        :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
        :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
    schema : :class:`pyspark.sql.types.DataType`, str or list, op

In [0]:
# Build a single-column DataFrame from a plain list; the schema is given as a
# type-name string.
df = spark.createDataFrame([1, 2, 3], schema='int')

In [0]:
# Print the DataFrame's rows as a text table.
df.show()

+-----+
|value|
+-----+
|    1|
|    2|
|    3|
+-----+



In [0]:
from pyspark.sql.types import StringType

# Same input values as the 'int' example, but the schema is passed as a
# DataType instance instead of a type-name string.
df = spark.createDataFrame([1, 2, 3], schema=StringType())

# NOTE(review): show() does not print column types; use df.printSchema() or
# df.dtypes to confirm the resulting column type.
df.show()

+-----+
|value|
+-----+
|    1|
|    2|
|    3|
+-----+



## Creating a dataframe using a list of tuples

In [0]:
# Each tuple is one record: (user_id, user_name).
users = [
    (1, 'Tanmay'),
    (2, 'Roger'),
    (3, 'David'),
    (4, 'Emily'),
]

# The schema string names and types the columns, in the same order as the
# tuple elements.
df = spark.createDataFrame(users, schema='user_id int, user_name string')
df.show()

+-------+---------+
|user_id|user_name|
+-------+---------+
|      1|   Tanmay|
|      2|    Roger|
|      3|    David|
|      4|    Emily|
+-------+---------+



In [0]:
# Collect once and reuse the result. Every collect() call runs the query again
# and ships all rows to the driver, so the original two calls did the work
# twice and inspected an element from a different result list than the one
# displayed.
rows = df.collect()

# Show the collected rows together with the type of a single element.
rows, type(rows[1])

([Row(user_id=1, user_name='Tanmay'),
  Row(user_id=2, user_name='Roger'),
  Row(user_id=3, user_name='David'),
  Row(user_id=4, user_name='Emily')],
 pyspark.sql.types.Row)

In [0]:
from pyspark.sql import Row

In [0]:
# Print the Row class documentation (how rows are created and how their fields
# are accessed).
help(Row)

Help on class Row in module pyspark.sql.types:

class Row(builtins.tuple)
 |  Row(*args: Optional[str], **kwargs: Optional[Any]) -> 'Row'
 |  
 |  A row in :class:`DataFrame`.
 |  The fields in it can be accessed:
 |  
 |  * like attributes (``row.key``)
 |  * like dictionary values (``row[key]``)
 |  
 |  ``key in row`` will search through row keys.
 |  
 |  Row can be used to create a row object by using named arguments.
 |  It is not allowed to omit a named argument to represent that the value is
 |  None or missing. This should be explicitly set to None in this case.
 |  
 |  .. versionchanged:: 3.0.0
 |      Rows created from named arguments no longer have
 |      field names sorted alphabetically and will be ordered in the position as
 |      entered.
 |  
 |  Examples
 |  --------
 |  >>> from pyspark.sql import Row
 |  >>> row = Row(name="Alice", age=11)
 |  >>> row
 |  Row(name='Alice', age=11)
 |  >>> row['name'], row['age']
 |  ('Alice', 11)
 |  >>> row.name, row.age
 |  ('A

In [0]:
# Build a Row from named arguments; the keyword names become the field names.
row1 = Row(name='Tanmay', age=31)

In [0]:
# The same field can be read dictionary-style or attribute-style.
row1['name'], row1.name

('Tanmay', 'Tanmay')

## Creating a dataframe using a list of lists

In [0]:
from pyspark.sql import Row

# Each inner list is one record: [user_id, user_name].
users = [
    [1, 'Tanmay'],
    [2, 'Roger'],
    [3, 'Scott'],
    [4, 'Phil'],
    [5, 'Sarah'],
]

# Unpack every inner list into the positional arguments of a Row.
users_as_Row = [Row(*record) for record in users]

# The schema string supplies the column names and types.
df = spark.createDataFrame(users_as_Row, schema='user_id int, user_name string')

df.show()

+-------+---------+
|user_id|user_name|
+-------+---------+
|      1|   Tanmay|
|      2|    Roger|
|      3|    Scott|
|      4|     Phil|
|      5|    Sarah|
+-------+---------+

