# Fall / Non Fall Classification

In [215]:
import sys, os
from pyspark.sql import SparkSession, types, functions
from pyspark import SparkConf, SparkContext
import sys
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, SQLTransformer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import ensemble
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np


spark = SparkSession.builder.appName('Big Data 2 Project').getOrCreate()
sc = spark.sparkContext
#assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
#assert spark.version >= '2.2'  # make sure we have Spark 2.2+
data=pd.read_csv('Dataset.csv',header='infer')
df=data.drop('Unnamed: 0',1)

In [117]:
train=df.iloc[0:480,:]
test=df.iloc[480:,:]
print("(Rows, Columns) in training dataset",train.shape)
print("Number of Features in training dataset:-",train.shape[1])
print("Number of Features in testing dataset:-",test.shape[1])

(Rows, Columns) in training dataset (480, 135)
Number of Features in training dataset:- 135
Number of Features in testing dataset:- 135


In [118]:
#Separating training data Features and Predictions into different dataframes
X_train=train.drop('class_label',axis=1)
Y_train=train['class_label']

#Separating testing data Features and Predictions into different dataframes
X_test=test.drop('class_label',axis=1)
Y_test=test['class_label']

In [119]:
#Performing Feature Selection on Training Dataset
print("Number of Features in training dataset(before feature selection by Variance Threshold):-",X_train.shape[1])
X_train_temp = X_train.copy(deep=True)  # Make a deep copy of the Training Data dataframe
selector = VarianceThreshold(0.12)
selector.fit(X_train_temp)
X_res = X_train_temp.loc[:, selector.get_support(indices=False)]
X_train=X_res
print("Number of Features in training dataset(after feature selection by Variance Threshold):-",X_train.shape[1])

Number of Features in training dataset(before feature selection by Variance Threshold):- 134
Number of Features in training dataset(after feature selection by Variance Threshold):- 126


In [120]:
traini = pd.concat([X_train, Y_train], axis=1, join='inner')

# Find most important features relative to target i.e finding correlation of every individual feature i.e independent variable with dependent variable and then sorting them and using the features that have maximum correlation
print("Find most important features relative to target through correlation")
corr = traini.corr()
corr.sort_values(["class_label"], ascending = False, inplace = True)

#Selecting top-20 features
top_features=corr.class_label[1:20].to_frame()

Find most important features relative to target through correlation


In [121]:
features=[]
columns=top_features.index
for col in columns:
    features.append(col)
print(features)

['var_waistAccelerationX', 'mean_waistAccelerationY', 'var_headAccelerationY', 'var_rankleMagneticFieldX', 'mean_waistAngularVelocityZ', 'var_sternumAccelerationY', 'var_waistAccelerationY', 'var_rthighMagneticFieldX', 'var_sternumAccelerationX', 'mean_headAccelerationY', 'var_waistAngularVelocityZ', 'var_rthighAccelerationZ', 'var_sternumAngularVelocityZ', 'mean_sternumAccelerationY', 'mean_rthighAccelerationZ', 'var_rankleMagneticFieldZ', 'var_waistMagneticFieldX', 'var_lthighMagneticFieldX', 'mean_waistAngularVelocityX']


In [122]:
#Selecting Features for training dataset
X_train=X_train[features]
print(X_train.shape[1])
print(X_train.columns)

#Selecting Same Features for testing dataset
X_test=X_test[X_train.columns]
print(X_test.shape[1])
print(X_test.columns)

19
Index(['var_waistAccelerationX', 'mean_waistAccelerationY',
       'var_headAccelerationY', 'var_rankleMagneticFieldX',
       'mean_waistAngularVelocityZ', 'var_sternumAccelerationY',
       'var_waistAccelerationY', 'var_rthighMagneticFieldX',
       'var_sternumAccelerationX', 'mean_headAccelerationY',
       'var_waistAngularVelocityZ', 'var_rthighAccelerationZ',
       'var_sternumAngularVelocityZ', 'mean_sternumAccelerationY',
       'mean_rthighAccelerationZ', 'var_rankleMagneticFieldZ',
       'var_waistMagneticFieldX', 'var_lthighMagneticFieldX',
       'mean_waistAngularVelocityX'],
      dtype='object')
19
Index(['var_waistAccelerationX', 'mean_waistAccelerationY',
       'var_headAccelerationY', 'var_rankleMagneticFieldX',
       'mean_waistAngularVelocityZ', 'var_sternumAccelerationY',
       'var_waistAccelerationY', 'var_rthighMagneticFieldX',
       'var_sternumAccelerationX', 'mean_headAccelerationY',
       'var_waistAngularVelocityZ', 'var_rthighAccelerationZ',
  

In [123]:
X_train['class_label']=Y_train
X_test['class_label']=Y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [193]:
from pyspark.ml.classification import GBTClassifier,RandomForestClassifier
df_train=spark.createDataFrame(X_train)
df_test=spark.createDataFrame(X_test)

In [194]:
prediction_Col_Name = "prediction"
vecAssembler = VectorAssembler(inputCols=df_train.schema.names[:-1],
                                       outputCol="features")

In [195]:
#nb=NaiveBayes(featuresCol="features", labelCol="class_label",predictionCol=prediction_Col_Name)
gbt=GBTClassifier(featuresCol='features',labelCol='class_label',maxDepth=20)
rf=RandomForestClassifier(featuresCol='features',labelCol='class_label')

In [196]:
RF_Classifier = [vecAssembler, rf]

models = [('RF Classifier', Pipeline(stages=RF_Classifier))]

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="class_label")

In [197]:
for label, pipeline in models:
    model = pipeline.fit(df_train)
    pred = model.transform(df_test)

In [198]:
pred_cols=pred.select('prediction','class_label')

In [208]:
from pyspark.sql import types, functions
pred_cols.createOrReplaceTempView('metric')

In [211]:
tp=spark.sql('''select count(*) from metric where class_label=1 and prediction=1''').head()[0]
fp=spark.sql('''select count(*) from metric where class_label=0 and prediction=1''').head()[0]
tn=spark.sql('''select count(*) from metric where class_label=0 and prediction=0''').head()[0]
fn=spark.sql('''select count(*) from metric where class_label=1 and prediction=0''').head()[0]

In [212]:
sensitivity=tp/(tp+fn)
specificity=tn/(tn+fp)

In [213]:
sensitivity

0.9710144927536232

In [214]:
specificity

0.9215686274509803