In [1]:
import numpy as np
import pandas as pd
import pyspark
import urllib

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *

In [2]:
# Obtain the Spark session used by every later cell; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName('example')
    .getOrCreate()
)

In [None]:
# `import urllib` alone does not load the `request` submodule, so the call
# below would only work if some other library had already imported it.
# Importing the submodule explicitly makes this cell work on a fresh kernel.
import urllib.request

# UCI iris data set: comma-separated, no header row.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Save the file next to the notebook; the trailing `;` hides the
# (filename, headers) tuple that urlretrieve returns.
urllib.request.urlretrieve(URL, "iris.csv");

In [None]:
# Column labels to attach to the headerless CSV, in file order.
columns = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']

# inferSchema=True makes Spark read the file twice, but in exchange the
# numerical columns are detected as numbers.
data = (
    spark.read
    .csv('iris.csv', header=False, inferSchema=True)
    .toDF(*columns)  # replace the auto-generated column names with ours
)
data.printSchema()

In [None]:
# Display the schema attribute of `data` as the cell's result
# (the previous cell printed the same schema as a tree via printSchema()).
data.schema

## Create DataFrame directly

In [3]:
# Sample records, one tuple per person, with fields in the order given
# by `columns` below.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Column names for the Spark DataFrame, matching the tuple positions.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# Only names are supplied as the schema, so Spark works out the column
# types from the values themselves.
df = spark.createDataFrame(data, columns)

## Create DataFrame from Pandas

In [4]:
# Build a pandas DataFrame from the same list of tuples. No column names are
# passed, so the columns get the default integer labels 0..5 (visible in the
# rendered table below).
pandas_df = pd.DataFrame(data)
pandas_df

Unnamed: 0,0,1,2,3,4,5
0,James,,Smith,1991-04-01,M,3000
1,Michael,Rose,,2000-05-19,M,4000
2,Robert,,Williams,1978-09-05,M,4000
3,Maria,Anne,Jones,1967-12-01,F,4000
4,Jen,Mary,Brown,1980-02-17,F,-1


In [5]:
# Convert the pandas DataFrame into a Spark DataFrame and print its rows.
# The integer column labels from pandas show up as the Spark column names
# in the printed table.
new_df = spark.createDataFrame(pandas_df)
new_df.show()

+-------+----+--------+----------+---+----+
|      0|   1|       2|         3|  4|   5|
+-------+----+--------+----------+---+----+
|  James|    |   Smith|1991-04-01|  M|3000|
|Michael|Rose|        |2000-05-19|  M|4000|
| Robert|    |Williams|1978-09-05|  M|4000|
|  Maria|Anne|   Jones|1967-12-01|  F|4000|
|    Jen|Mary|   Brown|1980-02-17|  F|  -1|
+-------+----+--------+----------+---+----+



## Exercises

In [None]:
# Exercise: walk over the rows of the DataFrame and print each one.

# head(5) returns up to the first five rows as a plain Python list.
first_rows = df.head(5)
for row in first_rows:
    print(row)

In [None]:
# print out summary statistics for each column:

In [7]:
# Print each column of `df` with its type and nullability as a tree.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

