## Project Intro: Intel Lab Data

This notebook contains information about data collected from 54 sensors deployed in the Intel Berkeley Research lab between February 28th and April 5th, 2004.

`Mica2Dot` sensors with weather boards collected timestamped topology information, along with humidity, temperature, light and voltage values once every 31 seconds.

Source: http://db.csail.mit.edu/labdata/labdata.html

## Import Libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

## Read from Parquet file

In [3]:
# Load the sensor readings from the Parquet dataset directory.
sensor_signals = pd.read_parquet('output.parquet/')
# Keep only the first 10,000 rows to make the rest of the notebook quick to run.
# Take an explicit copy: a bare slice may be a view of the full frame, and
# assigning new columns to it afterwards triggers pandas' SettingWithCopyWarning.
sensor_signals = sensor_signals.iloc[:10000].copy()
sensor_signals.head()

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


Unnamed: 0,temperature,humidity,light,voltage,datetime,x_coord,y_coord,year,month,mote_id
1,19.9884,37.0933,45.08,2.69964,2004-02-28 00:59:16,21.5,23.0,2004,2,1.0
2,19.3024,38.4629,45.08,2.68742,2004-02-28 01:03:16,21.5,23.0,2004,2,1.0
3,19.1652,38.8039,45.08,2.68742,2004-02-28 01:06:16,21.5,23.0,2004,2,1.0
4,19.175,38.8379,45.08,2.69964,2004-02-28 01:06:46,21.5,23.0,2004,2,1.0
5,19.1456,38.9401,45.08,2.68742,2004-02-28 01:08:45,21.5,23.0,2004,2,1.0


In [4]:
# Split the timestamp into day / hour / minute features, then discard the raw
# timestamp and the sensor coordinates.
# The vectorised `.dt` accessor replaces a Python-level lambda per row, and
# `assign(...).drop(...)` builds a new frame instead of mutating a slice in place.
# NOTE(review): `.dt` requires 'datetime' to have a datetime64 dtype — the
# displayed values look like timestamps, but confirm against the Parquet schema.
sensor_signals = sensor_signals.assign(
    day=sensor_signals['datetime'].dt.day,
    hour=sensor_signals['datetime'].dt.hour,
    minute=sensor_signals['datetime'].dt.minute,
).drop(columns=['datetime', 'x_coord', 'y_coord'])
sensor_signals.head()

Unnamed: 0,temperature,humidity,light,voltage,year,month,mote_id,day,hour,minute
1,19.9884,37.0933,45.08,2.69964,2004,2,1.0,28,0,59
2,19.3024,38.4629,45.08,2.68742,2004,2,1.0,28,1,3
3,19.1652,38.8039,45.08,2.68742,2004,2,1.0,28,1,6
4,19.175,38.8379,45.08,2.69964,2004,2,1.0,28,1,6
5,19.1456,38.9401,45.08,2.68742,2004,2,1.0,28,1,8


## Machine Learning

In [5]:
# Target: the id of the mote that produced each reading.
Y = sensor_signals['mote_id']
# Features: every remaining column.
X = sensor_signals.drop(columns='mote_id')

In [6]:
# Standardise the four continuous sensor readings; the calendar columns
# (year, month, day, hour, minute) are left unscaled.
# NOTE(review): the scaler is fitted on all of X, before the train/test split
# performed in a later cell, so test-set statistics leak into the training
# features. Consider splitting first and fitting the scaler on the training
# rows only.
numeric_cols = ['temperature', 'humidity', 'light', 'voltage']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
X.head()

Unnamed: 0,temperature,humidity,light,voltage,year,month,day,hour,minute
1,0.384065,-1.141499,-0.664148,0.779469,2004,2,28,0,59
2,0.002411,-0.747044,-0.664148,0.534649,2004,2,28,1,3
3,-0.07392,-0.648834,-0.664148,0.534649,2004,2,28,1,6
4,-0.068468,-0.639041,-0.664148,0.779469,2004,2,28,1,6
5,-0.084824,-0.609607,-0.664148,0.534649,2004,2,28,1,8


In [7]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=122,
    test_size=0.33,
)

In [8]:
# L2-penalised logistic regression whose regularisation strength is chosen by
# 10-fold cross-validation on accuracy; n_jobs=-1 runs the folds in parallel.
model = LogisticRegressionCV(
    penalty='l2',
    solver='lbfgs',
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.4s finished


LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=-1, penalty='l2',
                     random_state=None, refit=True, scoring='accuracy',
                     solver='lbfgs', tol=0.0001, verbose=1)

In [9]:
# Predict the mote id for every held-out row.
Y_pred = model.predict(X_test)

# Compute both evaluation figures first, then report them together.
count_misclassified = Y_test.ne(Y_pred).sum()
accuracy = metrics.accuracy_score(Y_test, Y_pred)

print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 123
Accuracy: 0.96


In [11]:
# Persist the fitted classifier to disk.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling fails; the original left the handle open.
# NOTE(review): the model was trained on standardised features, but the fitted
# `scaler` is not saved here — whoever loads this file will need the same
# scaler to prepare inputs.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)