CMD : pyspark --master local[2]

In [1]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
       .master("local") \
       .appName("Athulspark") \
       .config("spark.some.config.option", "some-value") \
       .getOrCreate()

In [3]:
df = spark.read.csv("athul_test.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- rotationRate: double (nullable = true)
 |-- userAcceleration: double (nullable = true)
 |-- act: double (nullable = true)
 |-- id: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- height: double (nullable = true)
 |-- age: double (nullable = true)
 |-- gender: double (nullable = true)
 |-- trial: double (nullable = true)



In [4]:
df.show(5)

+---+--------------------+--------------------+---+---+------+------+----+------+-----+
|_c0|        rotationRate|    userAcceleration|act| id|weight|height| age|gender|trial|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+
|  0|0.010253424306055027|0.006959199379238966|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  1|0.010920351047470954|0.010672920359489243|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  2|0.008376644793710666|0.007009658764875...|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  3|0.006554577255628314|0.014892331247994722|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  4|0.007723848846268292|0.013001225519157802|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+
only showing top 5 rows



In [5]:
df.count()

830895

In [6]:
df.describe().show()

+-------+------------------+--------------------+--------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+
|summary|               _c0|        rotationRate|    userAcceleration|               act|                id|            weight|           height|               age|            gender|             trial|
+-------+------------------+--------------------+--------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+
|  count|            830895|              830895|              830895|            830895|            830895|            830895|           830895|            830895|            830895|            830895|
|   mean|          415447.0|  1.0972969464544526|  0.3986288622062738|1.9931940858953296|11.527124365894608| 71.96852069154346|173.9261386817829|  28.7999386204033|0.5720674694155098| 5.77

In [7]:
df.select("age", "rotationRate","userAcceleration","act").show(5)

+----+--------------------+--------------------+---+
| age|        rotationRate|    userAcceleration|act|
+----+--------------------+--------------------+---+
|46.0|0.010253424306055027|0.006959199379238966|0.0|
|46.0|0.010920351047470954|0.010672920359489243|0.0|
|46.0|0.008376644793710666|0.007009658764875...|0.0|
|46.0|0.006554577255628314|0.014892331247994722|0.0|
|46.0|0.007723848846268292|0.013001225519157802|0.0|
+----+--------------------+--------------------+---+
only showing top 5 rows



In [8]:
df.select("act").distinct().show()

+---+
|act|
+---+
|0.0|
|1.0|
|4.0|
|3.0|
|2.0|
|5.0|
+---+



In [9]:
from pyspark.sql import functions as F

df.groupBy("act").agg(F.sum("gender")).show()

+---+-----------+
|act|sum(gender)|
+---+-----------+
|0.0|   133900.0|
|1.0|   127188.0|
|4.0|    94140.0|
|3.0|    32112.0|
|2.0|    27707.0|
|5.0|    60281.0|
+---+-----------+



In [10]:
df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).show()

+---+------------+----------------+---+---+------+------+---+------+-----+
|_c0|rotationRate|userAcceleration|act| id|weight|height|age|gender|trial|
+---+------------+----------------+---+---+------+------+---+------+-----+
|  0|           0|               0|  0|  0|     0|     0|  0|     0|    0|
+---+------------+----------------+---+---+------+------+---+------+-----+



No Missing values

In [11]:
# df.write.csv("training_data.csv)

In [12]:
df.rdd.getNumPartitions()

2

## MLlib

In [13]:
numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
df.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
_c0,830895,415447.0,239858.87029668092,0,830894


In [14]:
df = df.select( "age", "gender", "weight", "height", "rotationRate", "userAcceleration", "act")

cols = df.columns
df.printSchema()

root
 |-- age: double (nullable = true)
 |-- gender: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- height: double (nullable = true)
 |-- rotationRate: double (nullable = true)
 |-- userAcceleration: double (nullable = true)
 |-- act: double (nullable = true)



In [15]:
train, test = df.randomSplit([0.7, 0.3], seed = 2018)

print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 581828
Test Dataset Count: 249067


In [16]:
from pyspark.ml.feature import VectorAssembler

inputCols = train.columns[0: len (train.columns) - 1]

vecAssembler = VectorAssembler(\
                               inputCols = inputCols, \
                               outputCol = "features") \
                              .setHandleInvalid("skip")

In [17]:
vecTrain = vecAssembler.transform(train)
vecTrain.select ("features").show(5, False)

+---------------------------------------------------------------+
|features                                                       |
+---------------------------------------------------------------+
|[23.0,0.0,48.0,164.0,4.632547895057319E-4,0.028051240346908014]|
|[23.0,0.0,48.0,164.0,4.632547895057319E-4,0.030473101023689726]|
|[23.0,0.0,48.0,164.0,4.771215777975253E-4,0.043320736778129705]|
|[23.0,0.0,48.0,164.0,4.822841486095101E-4,0.012493412704301415]|
|[23.0,0.0,48.0,164.0,5.073795423546361E-4,0.01697099693594929] |
+---------------------------------------------------------------+
only showing top 5 rows



### Logistic Regression

In [18]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10)
lrModel = lr.fit(vecTrain)

IllegalArgumentException: label does not exist. Available: age, gender, weight, height, rotationRate, userAcceleration, act, features