In [1]:
import os

import findspark

# Locate the Spark installation. Honour SPARK_HOME when it is set (and non-empty)
# so the notebook is not tied to a single machine; otherwise fall back to the
# install path this notebook was originally written against.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'
findspark.init(SPARK_HOME)

# findspark.init() adds pyspark to sys.path, so these imports must come after it.
import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session if one is running; otherwise create one named 'Clustering'.
spark = SparkSession.builder.appName('Clustering').getOrCreate()

In [2]:
from pyspark.ml.clustering import KMeans


In [3]:
# Load the merged dataset: the first line supplies the column names and the
# column types are inferred from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("My datasets/Mergedata1_bdas.csv")

In [4]:
# Peek at the first record to confirm the file loaded with the expected columns.
print(df.first())


Row(Country='HUN', Sex='MEN', Year=2013, Parttime_rate=-1.261207052, Selfemp_rate=-0.413181977, GDP Value=-0.463960339, Export=1, Import=0, Cost=-0.566249253)


In [5]:
# Print each column's name, inferred type and nullability as a tree.
df.printSchema()


root
 |-- Country: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Parttime_rate: double (nullable = true)
 |-- Selfemp_rate: double (nullable = true)
 |-- GDP Value: double (nullable = true)
 |-- Export: integer (nullable = true)
 |-- Import: integer (nullable = true)
 |-- Cost: double (nullable = true)



In [6]:
import pandas as pd
# Summary statistics (count, mean, stddev, min, max) for every column, flipped so
# that each original column becomes one row of the displayed table.
summary_stats = df.describe()
summary_stats.toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Country,318,,,AUS,TUR
Sex,318,,,MEN,WOMEN
Year,318,2014.4968553459119,1.1141431690277472,2013,2016
Parttime_rate,318,0.007965305657232694,0.941164064527062,-1.348200942,4.040367459
Selfemp_rate,318,-0.036324366216981124,0.9591783853040566,-1.355954323,3.507917323
GDP Value,318,0.004000497820754719,1.0119695213257136,-0.517832601,5.510229845
Export,318,0.5157232704402516,0.5005403504434763,0,1
Import,318,0.48427672955974843,0.5005403504434764,0,1
Cost,318,-0.011574737371069163,0.9785401675065469,-0.738233521,4.813713154


In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
# Columns that are packed into the single vector column named 'features'.
feature_columns = ['Parttime_rate', 'Selfemp_rate', 'GDP Value', 'Export', 'Import', 'Cost']
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [9]:
# Run the assembler over the loaded data; the result carries an extra vector column.
vector_output = vector_assembler.transform(df)

# The schema now ends with the added 'features' column.
vector_output.printSchema()

# Show the first row: its 'features' value is a DenseVector holding the six input columns.
vector_output.take(1)

root
 |-- Country: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Parttime_rate: double (nullable = true)
 |-- Selfemp_rate: double (nullable = true)
 |-- GDP Value: double (nullable = true)
 |-- Export: integer (nullable = true)
 |-- Import: integer (nullable = true)
 |-- Cost: double (nullable = true)
 |-- features: vector (nullable = true)



[Row(Country='HUN', Sex='MEN', Year=2013, Parttime_rate=-1.261207052, Selfemp_rate=-0.413181977, GDP Value=-0.463960339, Export=1, Import=0, Cost=-0.566249253, features=DenseVector([-1.2612, -0.4132, -0.464, 1.0, 0.0, -0.5662]))]