# Install Java, Spark, and Findspark
This installs Apache Spark 2.4.4 (pre-built for Hadoop 2.7), Java 8, and [Findspark](https://github.com/minrk/findspark), a library that makes it easy for Python to find Spark.

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [4]:
# Install py4j from PyPI (no version pin, so the installed version depends on when this runs).
# NOTE(review): presumably meant for PySpark's Python-to-JVM bridge; Spark distributions
# usually bundle their own py4j copy — confirm whether this cell is actually needed.
!pip install py4j

Collecting py4j
[?25l  Downloading https://files.pythonhosted.org/packages/04/de/2d314a921ef4c20b283e1de94e0780273678caac901564df06b948e4ba9b/py4j-0.10.8.1-py2.py3-none-any.whl (196kB)
[K     |█▊                              | 10kB 18.1MB/s eta 0:00:01[K     |███▍                            | 20kB 2.2MB/s eta 0:00:01[K     |█████                           | 30kB 3.2MB/s eta 0:00:01[K     |██████▊                         | 40kB 2.1MB/s eta 0:00:01[K     |████████▍                       | 51kB 2.6MB/s eta 0:00:01[K     |██████████                      | 61kB 3.1MB/s eta 0:00:01[K     |███████████▊                    | 71kB 3.6MB/s eta 0:00:01[K     |█████████████▍                  | 81kB 4.0MB/s eta 0:00:01[K     |███████████████                 | 92kB 4.5MB/s eta 0:00:01[K     |████████████████▊               | 102kB 3.4MB/s eta 0:00:01[K     |██████████████████▍             | 112kB 3.4MB/s eta 0:00:01[K     |████████████████████            | 122kB 3.4MB/s eta 0

# Set Environment Variables
Set the locations where Spark and Java are installed.

In [0]:
import os

# Tell child processes where the Java 8 runtime and the Spark distribution live.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.4.4-bin-hadoop2.7",
)

# Start a SparkSession
This will start a local Spark session.

In [0]:
import findspark

# Call init() with no argument so findspark picks up the SPARK_HOME environment
# variable set earlier in this notebook. Passing the bare directory name instead
# would depend on the current working directory and would overwrite SPARK_HOME
# with that relative path.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using every available core ("local[*]"); getOrCreate()
# reuses an already running session, so re-running this cell is harmless.
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Use Spark!
That's all there is to it - you're ready to use Spark!

In [7]:
# Smoke test: build a 1000-row frame with a single "hello" column, then preview three rows.
df = spark.createDataFrame(
    [{"hello": "world"} for _ in range(1000)]
)
df.show(3)



+-----+
|hello|
+-----+
|world|
|world|
|world|
+-----+
only showing top 3 rows



## Let’s manipulate some of the predefined Google Colab sample data


In [9]:
# List the entries of the sample_data directory (path is relative to the current working directory).
os.listdir('./sample_data')

['README.md',
 'anscombe.json',
 'california_housing_train.csv',
 'mnist_test.csv',
 'mnist_train_small.csv',
 'california_housing_test.csv']

In [0]:
# Path, relative to the working directory, of the California housing training CSV.
file_loc = './sample_data/california_housing_train.csv'

In [0]:
# Load the CSV into a Spark DataFrame: the first row supplies the column names
# and Spark samples the data to infer each column's type.
df_spark = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv(file_loc)
)

In [12]:
# Print the Python type of df_spark to confirm what the CSV reader returned.
print(type(df_spark))

<class 'pyspark.sql.dataframe.DataFrame'>


In [13]:
# Print the schema as a tree: one line per column with its type and nullability.
df_spark.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [14]:
# Display the first 20 rows (the default row count for show(), spelled out here).
df_spark.show(n=20)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|              20.0|     1454.0|         326.0|     624.0|     262.0|        1.925|           65500.0|
|  -114.58|   33.63|    