In [24]:
# Section must be included at the beginning of each new notebook. Remember to change the app name.
# If you're using VirtualBox, change the below to '/home/user/spark-2.1.1-bin-hadoop2.7'
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import *
spark = SparkSession.builder.appName('logistic_regression_adv').getOrCreate()

# If you're getting an error with numpy, please type 'sudo pip3 install numpy --user' into the console.
# If you're getting an error with another package, type 'sudo pip3 install PACKAGENAME --user'. 
# Replace PACKAGENAME with the relevant package (such as pandas, etc).
from pyspark.ml.classification import LogisticRegression

# Import data and print schema - columns is another way to view the data's features.
df = spark.read.csv('Datasets/IVS_Country.csv', header=True, inferSchema=True)
df.printSchema()
print(df.columns)

root
 |-- Pkey: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Year ending: string (nullable = true)
 |-- Airport of departure: string (nullable = true)
 |-- Purpose of visit: string (nullable = true)
 |-- Country of permanent residence: string (nullable = true)
 |-- Total visitor spend: double (nullable = true)
 |-- Total visitors: integer (nullable = true)

['Pkey', 'Year', 'Quarter', 'Year ending', 'Airport of departure', 'Purpose of visit', 'Country of permanent residence', 'Total visitor spend', 'Total visitors']


In [25]:
# Import pandas.
import pandas as pd

# Take the first five rows of data, and visualise.
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,Pkey,Year,Quarter,Year ending,Airport of departure,Purpose of visit,Country of permanent residence,Total visitor spend,Total visitors
0,1,1997,4,YEDec 1997,Auckland,Business,Africa and Middle East,4266524.0,1684
1,2,1997,4,YEDec 1997,Auckland,Business,Australia,132588500.0,67277
2,3,1997,4,YEDec 1997,Auckland,Business,Canada,15589690.0,3596
3,4,1997,4,YEDec 1997,Auckland,Business,China,9490118.0,4510
4,5,1997,4,YEDec 1997,Auckland,Business,Germany,5481106.0,1889


In [26]:
# To visualise the first five columns, simply add transpose. 
pd.DataFrame(df.take(5), columns=df.columns).transpose()

Unnamed: 0,0,1,2,3,4
Pkey,1,2,3,4,5
Year,1997,1997,1997,1997,1997
Quarter,4,4,4,4,4
Year ending,YEDec 1997,YEDec 1997,YEDec 1997,YEDec 1997,YEDec 1997
Airport of departure,Auckland,Auckland,Auckland,Auckland,Auckland
Purpose of visit,Business,Business,Business,Business,Business
Country of permanent residence,Africa and Middle East,Australia,Canada,China,Germany
Total visitor spend,4.26652e+06,1.32589e+08,1.55897e+07,9.49012e+06,5.48111e+06
Total visitors,1684,67277,3596,4510,1889


In [27]:
# Oops! Forgot to assign the result to a variable. Let's try that again. Now you can see that records with null values have been removed.
df = df.na.drop()
df.show()
print("Total data points:", df.count())

+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|Pkey|Year|Quarter|Year ending|Airport of departure|  Purpose of visit|Country of permanent residence|Total visitor spend|Total visitors|
+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|   1|1997|      4| YEDec 1997|            Auckland|          Business|          Africa and Middle...|         4266524.07|          1684|
|   2|1997|      4| YEDec 1997|            Auckland|          Business|                     Australia|      1.325885058E8|         67277|
|   3|1997|      4| YEDec 1997|            Auckland|          Business|                        Canada|      1.558969421E7|          3596|
|   4|1997|      4| YEDec 1997|            Auckland|          Business|                         China|         9490117.52|          4510|
|   5|1997|      4| YEDec 1997|   

In [28]:
# We can use group by and count to find out how many data points we have for each class in our predictor. 
df.groupby('year').count().toPandas()

Unnamed: 0,year,count
0,2003,530
1,2007,568
2,2018,155
3,2015,603
4,2006,565
5,2013,587
6,1997,116
7,2014,605
8,2004,554
9,1998,472


In [29]:
# Using a for loop to find all columns that belong to the integer data type. 
numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']

# Selecting the numeric features, generating summary statistics, and converting to a Pandas DataFrame.
df.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Pkey,11292,8609.705897980872,4917.03995600964,1,17056
Year,11292,2007.9382748848743,5.880289393162935,1997,2018
Quarter,11292,2.4991144172865747,1.1292752985276613,1,4
Total visitors,11292,15576.741232731138,33229.303734004156,1,314584


In [30]:
# Using a for loop to find all columns that belong to the integer data type. 
numeric_features = [t[0] for t in df.dtypes if t[1] == 'double']

# Selecting the numeric features, generating summary statistics, and converting to a Pandas DataFrame.
df.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Total visitor spend,11292,5.09246119696264E7,1.0193863986366412E8,2.4,1.235333092E9


In [31]:
# Import the relevant packages.
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

# First create a string indexer which converts every string into a number
# A number will be assigned to every category in the column.
yearending_indexer = StringIndexer(inputCol='Year ending',outputCol='yearendingIndex')
airport_indexer = StringIndexer(inputCol='Airport of departure',outputCol='airportIndex')
purpose_indexer = StringIndexer(inputCol='Purpose of visit',outputCol='purposeIndex')
country_indexer = StringIndexer(inputCol='Country of permanent residence',outputCol='label')


In [32]:
# Now we can one hot encode these numbers. This converts the various outputs into a single vector.
# Multiple columns are collapsed into one. 
# This makes it easier to process when you have multiple classes.
yearending_encoder = OneHotEncoder(inputCol='yearendingIndex',outputCol='yearendingVec')
airport_encoder = OneHotEncoder(inputCol='airportIndex',outputCol='airportVec')
purpose_encoder = OneHotEncoder(inputCol='purposeIndex',outputCol='purposeVec')


In [33]:
# And finally, using vector assembler to turn all of these columns into one column (named features).
assembler = VectorAssembler(inputCols=['yearendingVec', 'airportVec', 'purposeVec', 'Pkey', 'Year', 'Quarter', 'Total visitor spend', 'Total visitors'], outputCol="features")

In [34]:
from pyspark.ml import Pipeline

# Then go through our steps. It's essentially sequential to the above.
pipeline = Pipeline(stages=[yearending_indexer, airport_indexer, purpose_indexer, country_indexer, yearending_encoder, airport_encoder, purpose_encoder, assembler])

print("Total data points:", df.count())

# Now that we've got a number of steps, let's apply it to the DataFrame.
pipeline_model = pipeline.fit(df)

# Incorporate results into a new DataFrame.
pipe_df = pipeline_model.transform(df)

# Remove all variables other than features and label. 
pipe_df = pipe_df.select('label', 'features')




Total data points: 11292


In [50]:

from pyspark.ml.classification import LogisticRegression

# Split our data. Note that the new DataFrame is being used.
trainlogic_data, testlogic_data = pipe_df.randomSplit([0.7,0.3])
print("Training Dataset Count: " + str(trainlogic_data.count()))
print("Test Dataset Count: " + str(testlogic_data.count()))

# Instantiate the model.
lr_model = LogisticRegression(featuresCol='features',labelCol='label')

# Fit the model.
lr_model = lr_model.fit(trainlogic_data)

# And evaluate the model using the test data.
results = lr_model.transform(testlogic_data)


Training Dataset Count: 7969
Test Dataset Count: 3323


In [51]:

import matplotlib.pyplot as plt
import numpy as np

# Visualising the coefficients. Sort from lowest to highest.
#beta = np.sort(lr_model.coefficients)
print("Multinomial coefficients: " + str(lr_model.coefficientMatrix))
print("Multinomial intercepts: " + str(lr_model.interceptVector))

Multinomial coefficients: DenseMatrix([[ 1.34605841e-02, -1.87112935e-01,  6.98234417e-02, ...,
               4.33177731e-02, -3.33821117e-09,  1.19897710e-04],
             [-2.20190559e-02,  3.86794607e-02,  8.21975429e-02, ...,
               2.73056507e-02,  1.14097540e-08,  1.04926053e-05],
             [-2.04581281e-03, -3.76980286e-02,  2.32455909e-02, ...,
               1.31732736e-02,  9.31531801e-09, -4.90631556e-06],
             ...,
             [-3.67164582e-02, -1.32387847e-01, -5.82550505e-02, ...,
              -3.84096080e-02, -4.49851632e-09,  4.25495444e-06],
             [ 3.10455903e-02, -7.36891422e-02, -1.42915820e+00, ...,
              -1.73032350e-02, -7.89655312e-09, -1.70149376e-05],
             [ 3.13052764e-02, -3.10705873e-02,  6.54147132e-02, ...,
               3.45376590e-02, -1.24958025e-08, -3.58112604e-05]])
Multinomial intercepts: [0.19877070728797747,0.181431810929206,0.16880080042183698,0.15044611478986847,0.15466439102376311,0.08388369412746