In [1]:
import pandas as pd
import numpy as np
import re


class DataLoader(object):
    def fit(self, dataset):
        self.dataset = dataset.copy()

    # apply regex
    def get_title(self, name):
        pattern = ' ([A-Za-z]+)\.'
        title_search = re.search(pattern, name)
        # If the title exists, extract and return it.
        if title_search:
            return title_search.group(1)
        return ""

    def load_data(self):
        
        #replace values
        falling=[]
        for i in self.dataset['ACTIVITY']:
            if i == 3:
                falling.append('1')
               
            else: 
                falling.append('0')
        self.dataset['FALLING'] = falling

        #outliers
        Q1 = self.dataset.quantile(0.25)
        Q3 = self.dataset.quantile(0.75)
        IQR = Q3 - Q1
        self.dataset = self.dataset[~((self.dataset < (Q1 - 1.5 * IQR)) |(self.dataset > (Q3 + 1.5 * IQR))).any(axis=1)]
        
        #drop columns
        self.dataset = self.dataset.drop(['SL','EEG','ACTIVITY'], axis=1)

        return self.dataset

In [2]:
dl = DataLoader()

In [19]:
db = pd.read_csv('data/train.csv')
dl.fit(db)

In [20]:
df = dl.load_data()

In [21]:
X, y = df.drop(['FALLING'], axis=1), df['FALLING']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

In [10]:
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

In [11]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)

# Print confusion matrix and accuracy score
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)
lr_class_report = classification_report(y_test, lr_predict) 
print(lr_conf_matrix)
print('Accuracy Score :', '%.2f' %lr_acc_score)
print('Classification Report :')
print(lr_class_report)

[[2837   14]
 [ 770    6]]
Accuracy Score : 0.78
Classification Report :
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      2851
           1       0.30      0.01      0.02       776

    accuracy                           0.78      3627
   macro avg       0.54      0.50      0.45      3627
weighted avg       0.68      0.78      0.69      3627



In [12]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
dt_predict = dt.predict(X_test)

# Print confusion matrix and accuracy score
dt_conf_matrix = confusion_matrix(y_test, dt_predict)
dt_acc_score = accuracy_score(y_test, dt_predict)
dt_class_report = classification_report(y_test, dt_predict) 
print(dt_conf_matrix)
print('Accuracy Score :', '%.2f' %dt_acc_score)
print('Classification Report :')
print(dt_class_report)

[[2478  373]
 [ 321  455]]
Accuracy Score : 0.81
Classification Report :
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2851
           1       0.55      0.59      0.57       776

    accuracy                           0.81      3627
   macro avg       0.72      0.73      0.72      3627
weighted avg       0.81      0.81      0.81      3627



In [13]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn_predict = knn.predict(X_test)

# Print confusion matrix and accuracy score
knn_conf_matrix = confusion_matrix(y_test, knn_predict)
knn_acc_score = accuracy_score(y_test, knn_predict)
knn_class_report = classification_report(y_test, knn_predict) 
print(knn_conf_matrix)
print('Accuracy Score :', '%.2f' %knn_acc_score)
print('Classification Report :')
print(knn_class_report)

[[2527  324]
 [ 442  334]]
Accuracy Score : 0.79
Classification Report :
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      2851
           1       0.51      0.43      0.47       776

    accuracy                           0.79      3627
   macro avg       0.68      0.66      0.67      3627
weighted avg       0.78      0.79      0.78      3627



In [14]:
nb = GaussianNB()
nb.fit(X_train,y_train)
nb_predict = nb.predict(X_test)

# Print confusion matrix and accuracy score
nb_conf_matrix = confusion_matrix(y_test, nb_predict)
nb_acc_score = accuracy_score(y_test, nb_predict)
nb_class_report = classification_report(y_test, nb_predict) 
print(nb_conf_matrix)
print('Accuracy Score :', '%.2f' %nb_acc_score)
print('Classification Report :')
print(nb_class_report)

[[2850    1]
 [ 776    0]]
Accuracy Score : 0.79
Classification Report :
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      2851
           1       0.00      0.00      0.00       776

    accuracy                           0.79      3627
   macro avg       0.39      0.50      0.44      3627
weighted avg       0.62      0.79      0.69      3627



In [15]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
rf_predict = rf.predict(X_test)

# Print confusion matrix and accuracy score
rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_acc_score = accuracy_score(y_test, rf_predict)
rf_class_report = classification_report(y_test, rf_predict)
print(rf_conf_matrix)
print('Accuracy Score :','%.2f' %rf_acc_score)
print('Classification Report :')
print(rf_class_report)

[[2620  231]
 [ 342  434]]
Accuracy Score : 0.84
Classification Report :
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      2851
           1       0.65      0.56      0.60       776

    accuracy                           0.84      3627
   macro avg       0.77      0.74      0.75      3627
weighted avg       0.83      0.84      0.84      3627



In [16]:
svc = SVC()
svc.fit(X_train,y_train)
svc_predict = svc.predict(X_test)

# Print confusion matrix and accuracy score
svc_conf_matrix = confusion_matrix(y_test, svc_predict)
svc_acc_score = accuracy_score(y_test, svc_predict)
svc_class_report = classification_report(y_test, svc_predict)
print(svc_conf_matrix)
print('Accuracy Score :','%.2f' %svc_acc_score)
print('Classification Report :')
print(svc_class_report)

[[2851    0]
 [ 776    0]]
Accuracy Score : 0.79
Classification Report :
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      2851
           1       0.00      0.00      0.00       776

    accuracy                           0.79      3627
   macro avg       0.39      0.50      0.44      3627
weighted avg       0.62      0.79      0.69      3627



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
import pickle
import json
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from utils.dataloader import DataLoader 
from settings.constants import TRAIN_CSV


with open('settings/specifications.json') as f:
    specifications = json.load(f)

raw_train = pd.read_csv(TRAIN_CSV)
x_columns = specifications['description']['X']
y_column = specifications['description']['y']

x_raw = raw_train[x_columns]

loader = DataLoader()
loader.fit(raw_train)
X = loader.load_data().drop('FALLING')
y = loader.load_data().FALLING

print(X.shape,y.shape)

model = RandomForestClassifier()
model.fit(X, y)
with open('models/RFC.pickle', 'wb')as f:
    pickle.dump(model, f)

(12088, 4) (13924,)


ValueError: Found input variables with inconsistent numbers of samples: [12088, 13924]