# spark environment and starting the notebook


    tuhinm$ export SPARK_HOME=/usr/local/Cellar/apache-spark/2.2.0/libexec
    
    tuhinm$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook
    
    tuhinm$ pyspark

### open browser with the following:
   http://localhost:8889/?token=31785c783bbad64c474133b9332cb8277ed60ac188b89f74


In [3]:
# Configure the necessary Spark environment
import os
import sys

# Spark home
spark_home = os.environ.get("SPARK_HOME")

# If Spark V1.4.x is detected, then add ' pyspark-shell' to
# the end of the 'PYSPARK_SUBMIT_ARGS' environment variable
spark_release_file = spark_home + "/RELEASE"
if os.path.exists(spark_release_file) and "Spark 1.4" in open(spark_release_file).read():
    pyspark_submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "")
    if not "pyspark-shell" in pyspark_submit_args: pyspark_submit_args += " pyspark-shell"
    os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

# Add the spark python sub-directory to the path
sys.path.insert(0, spark_home + "/python")

# Add the py4j to the path.
# You may need to change the version number to match your install
sys.path.insert(0, os.path.join(spark_home, "python/lib/py4j-0.8.2.1-src.zip"))

# Initialize PySpark to predefine the SparkContext variable 'sc'
execfile(os.path.join(spark_home, "python/pyspark/shell.py"))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.0
      /_/

Using Python version 2.7.13 (default, Dec 20 2016 23:05:08)
SparkSession available as 'spark'.


In [4]:
#https://github.com/6chaoran/DataStory/blob/master/Titanic-Spark/pyspark-script.py
train_path='/Users/tuhinm/Downloads/DataStory/Titanic-Spark/train.csv'
test_path='/Users/tuhinm/Downloads/DataStory/Titanic-Spark/test.csv'

# Load csv file as RDD
#train_rdd = sc.textFile(train_path)
#test_rdd = sc.textFile(test_path)

train_df = sqlContext.read.format('com.databricks.spark.csv') \
     .options(header='true', inferschema='true').load(train_path)
test_df = sqlContext.read.format('com.databricks.spark.csv') \
     .options(header='true', inferschema='true').load(test_path)

In [5]:
## Add Survived column to test
## And append train/test data
from pyspark.sql.functions import lit, col

In [6]:
## Add Survived column to test
## And append train/test data
from pyspark.sql.functions import lit, col
train_df = train_df.withColumn('Mark',lit('train'))
test_df = (test_df.withColumn('Survived',lit(0))
				  .withColumn('Mark',lit('test')))

test_df = test_df[train_df.columns]
df = train_df.unionAll(test_df)

In [7]:
df.toPandas().head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Mark
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


In [8]:
## Data Cleaning/Manipulation
## Convert Age, SibSp, Parch, Fare to Numeric
df = (df.withColumn('Age',df['Age'].cast("double"))
			.withColumn('SibSp',df['SibSp'].cast("double"))
			.withColumn('Parch',df['Parch'].cast("double"))
			.withColumn('Fare',df['Fare'].cast("double"))
			.withColumn('Survived',df['Survived'].cast("double"))
			)

df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: double (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: double (nullable = true)
 |-- Parch: double (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Mark: string (nullable = false)



In [9]:
## Impute missing Age and Fare
numVars = ['Survived','Age','SibSp','Parch','Fare']
def countNull(df,var):
	return df.where(df[var].isNull()).count()

missing = {var: countNull(df,var) for var in numVars}
missing

{'Age': 263, 'Fare': 1, 'Parch': 0, 'SibSp': 0, 'Survived': 0}

In [10]:
age_mean = df.groupBy().mean('Age').first()[0]
fare_mean = df.groupBy().mean('Fare').first()[0]
df = df.na.fill({'Age':age_mean,'Fare':fare_mean})

In [11]:
# Impute missing Age and Fare
numVars = ['Survived','Age','SibSp','Parch','Fare']
def countNull(df,var):
	return df.where(df[var].isNull()).count()

missing = {var: countNull(df,var) for var in numVars}
missing

{'Age': 0, 'Fare': 0, 'Parch': 0, 'SibSp': 0, 'Survived': 0}

In [12]:
df.toPandas().head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Mark
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,train
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,train
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S,train
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S,train


In [13]:
# Feature Enginnering
## 1. Extract Title from Name
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType

## created user defined function to extract title
getTitle = udf(lambda name: name.split(',')[1].split('.')[0].strip(),StringType())
df = df.withColumn('Title', getTitle(df['Name']))

## created userd defined function to extract Cabin
getCabin = udf(lambda name: 1.0 if name!=None else 0.0,DoubleType())
df = df.withColumn('hasCabin', getCabin(df['Cabin']))

In [14]:
df.toPandas().head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Mark,Title,hasCabin
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S,train,Mr,0.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,train,Mrs,1.0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,train,Miss,0.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S,train,Mrs,1.0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S,train,Mr,0.0


In [15]:
##  hadling categorical variable using StringIndexer
from pyspark.ml.feature import StringIndexer
si = StringIndexer(inputCol = 'Sex', outputCol = 'Sex_indexed')
df = si.fit(df).transform(df).drop('Sex').withColumnRenamed('Sex_indexed','Sex')

si = StringIndexer(inputCol = 'Title', outputCol = 'Title_indexed')
df = si.fit(df).transform(df).drop('Title').withColumnRenamed('Title_indexed','Title')

In [16]:
df.toPandas().head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Mark,hasCabin,Sex,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1.0,0.0,A/5 21171,7.25,,S,train,0.0,0.0,0.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1.0,0.0,PC 17599,71.2833,C85,C,train,1.0,1.0,2.0
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,train,0.0,1.0,1.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1.0,0.0,113803,53.1,C123,S,train,1.0,1.0,2.0
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0.0,0.0,373450,8.05,,S,train,0.0,0.0,0.0


In [17]:
numVars = ['Survived','Age','SibSp','Parch','Fare','Embarked']
missing = {var: countNull(df,var) for var in numVars}
missing

{'Age': 0, 'Embarked': 2, 'Fare': 0, 'Parch': 0, 'SibSp': 0, 'Survived': 0}

In [18]:
df.count()

1309

In [19]:
df=df.na.drop(subset=["Embarked"])

In [20]:
df.count()

1307

In [21]:
si = StringIndexer(inputCol = 'Embarked', outputCol = 'Embarked_indexed')
#df = si.fit(df).transform(df).drop('Embarked').withColumnRenamed('Embarked_indexed','Embarked')
df = si.fit(df).transform(df).drop('Embarked').withColumnRenamed('Embarked_indexed','Embarked')

In [22]:
df.toPandas().head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Mark,hasCabin,Sex,Title,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1.0,0.0,A/5 21171,7.25,,train,0.0,0.0,0.0,0.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1.0,0.0,PC 17599,71.2833,C85,train,1.0,1.0,2.0,1.0
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0.0,0.0,STON/O2. 3101282,7.925,,train,0.0,1.0,1.0,0.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1.0,0.0,113803,53.1,C123,train,1.0,1.0,2.0,0.0
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0.0,0.0,373450,8.05,,train,0.0,0.0,0.0,0.0


In [23]:
df=df.select('Survived','Pclass','Age','SibSp','Parch','Fare','hasCabin','Sex','Title','Mark')

In [24]:
df.toPandas().head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,hasCabin,Sex,Title,Mark
0,0.0,3,22.0,1.0,0.0,7.25,0.0,0.0,0.0,train
1,1.0,1,38.0,1.0,0.0,71.2833,1.0,1.0,2.0,train
2,1.0,3,26.0,0.0,0.0,7.925,0.0,1.0,1.0,train
3,1.0,1,35.0,1.0,0.0,53.1,1.0,1.0,2.0,train
4,0.0,3,35.0,0.0,0.0,8.05,0.0,0.0,0.0,train


In [25]:


# split back train/test data
train = df.where(df.Mark =='train').drop('Mark')
test = df.where(df.Mark =='test').drop('Mark')

# random split further to get train/validate
train,validate = train.randomSplit([0.7,0.3],seed =121)

print 'Train Data Number of Row: '+ str(train.count())
print 'Validate Data Number of Row: '+ str(validate.count())
print 'Test Data Number of Row: '+ str(test.count())


Train Data Number of Row: 636
Validate Data Number of Row: 253
Test Data Number of Row: 418


In [26]:
train.toPandas().head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,hasCabin,Sex,Title
0,0.0,1,2.0,1.0,2.0,151.55,1.0,1.0,1.0
1,0.0,1,18.0,1.0,0.0,108.9,1.0,0.0,0.0
2,0.0,1,19.0,1.0,0.0,53.1,1.0,0.0,0.0
3,0.0,1,21.0,0.0,1.0,77.2875,1.0,0.0,0.0
4,0.0,1,24.0,0.0,0.0,79.2,1.0,0.0,0.0


In [27]:
train.count()

636

In [28]:
train.printSchema()

root
 |-- Survived: double (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Age: double (nullable = false)
 |-- SibSp: double (nullable = true)
 |-- Parch: double (nullable = true)
 |-- Fare: double (nullable = false)
 |-- hasCabin: double (nullable = true)
 |-- Sex: double (nullable = true)
 |-- Title: double (nullable = true)



### Convert to Labeledpoint Rdd

In [29]:
import pyspark.mllib.regression as reg
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=['Pclass','Age','SibSp','Parch','Fare','hasCabin','Sex','Title'],
    outputCol="features")

train_l = assembler.transform(train)


In [30]:
train_l.toPandas().head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,hasCabin,Sex,Title,features
0,0.0,1,2.0,1.0,2.0,151.55,1.0,1.0,1.0,"[1.0, 2.0, 1.0, 2.0, 151.55, 1.0, 1.0, 1.0]"
1,0.0,1,18.0,1.0,0.0,108.9,1.0,0.0,0.0,"[1.0, 18.0, 1.0, 0.0, 108.9, 1.0, 0.0, 0.0]"
2,0.0,1,19.0,1.0,0.0,53.1,1.0,0.0,0.0,"[1.0, 19.0, 1.0, 0.0, 53.1, 1.0, 0.0, 0.0]"
3,0.0,1,21.0,0.0,1.0,77.2875,1.0,0.0,0.0,"[1.0, 21.0, 0.0, 1.0, 77.2875, 1.0, 0.0, 0.0]"
4,0.0,1,24.0,0.0,0.0,79.2,1.0,0.0,0.0,"(1.0, 24.0, 0.0, 0.0, 79.2, 1.0, 0.0, 0.0)"


In [31]:
train_l=train_l.select(col("Survived").alias("label"), col("features"))

In [32]:
train_l.toPandas().head(10)

Unnamed: 0,label,features
0,0.0,"[1.0, 2.0, 1.0, 2.0, 151.55, 1.0, 1.0, 1.0]"
1,0.0,"[1.0, 18.0, 1.0, 0.0, 108.9, 1.0, 0.0, 0.0]"
2,0.0,"[1.0, 19.0, 1.0, 0.0, 53.1, 1.0, 0.0, 0.0]"
3,0.0,"[1.0, 21.0, 0.0, 1.0, 77.2875, 1.0, 0.0, 0.0]"
4,0.0,"(1.0, 24.0, 0.0, 0.0, 79.2, 1.0, 0.0, 0.0)"
5,0.0,"[1.0, 24.0, 0.0, 1.0, 247.5208, 1.0, 0.0, 0.0]"
6,0.0,"[1.0, 25.0, 1.0, 2.0, 151.55, 1.0, 1.0, 2.0]"
7,0.0,"(1.0, 28.0, 1.0, 0.0, 82.1708, 0.0, 0.0, 0.0)"
8,0.0,"(1.0, 29.0, 0.0, 0.0, 30.0, 1.0, 0.0, 0.0)"
9,0.0,"[1.0, 29.0, 1.0, 0.0, 66.6, 1.0, 0.0, 0.0]"


In [None]:
dt = DecisionTreeClassifier(maxDepth=2, labelCol="label")
model = dt.fit(df)

In [35]:
print(model.toDebugString())

TypeError: 'unicode' object is not callable