<a href="https://colab.research.google.com/github/zietho/machine-learning/blob/master/Machine_Learning_00.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
import re

# Data Set 1: sentiment140

Abstract: This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment. <sup>1</sup>

<sup>1</sup> https://www.kaggle.com/kazanova/sentiment140

In [3]:
df_sentiment = pd.read_csv('datasets/sentiment/training.1600000.processed.noemoticon.csv', encoding='latin_1', header=None)

In [4]:
df_sentiment.head(2)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...


In [5]:
def preprocess(df):

    '''
    weekday extraction 1min
    '''
    def funcapply(x):
        return x[0:3]

    df['weekday'] = df['date'].apply(lambda x: funcapply(x))

    '''
    parse time to pandas datetime takes 5 minutes
    from datetime import datetime
    d = datetime.strptime('Thu Apr 23 13:38:19 +0000 2009','%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d %H:%M:%S');
    '''
    df['date'] = pd.to_datetime(df['date'])
    
    return df
  
df_sentiment = preprocess(df_sentiment)

KeyError: 'date'

In [6]:
df_sentiment.shape

(1600000, 6)

In [7]:
df_sentiment.head(10)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [8]:
df_sentiment.columns = ['label', 'id', 'timestamp', 'query_type', 'username', 'sentiment']

In [9]:
df_sentiment['label'].value_counts()

4    800000
0    800000
Name: label, dtype: int64

In [10]:
df_sentiment['query_type'].value_counts()

NO_QUERY    1600000
Name: query_type, dtype: int64

# Data Set 1: OPPORTUNITY Activity Recognition Dataset

Abstract: "The OPPORTUNITY Dataset for Human Activity Recognition from Wearable, Object, and Ambient Sensors is a dataset devised to benchmark human activity recognition algorithms (classification, automatic data segmentation, sensor fusion, feature extraction, etc). <sup>1</sup>

<sup>1</sup> https://archive.ics.uci.edu/ml/datasets/OPPORTUNITY+Activity+Recognition#

# activity of daily living (ADL)
Read in activity of daily living (ADL) for all users

In [97]:
# read in column names
opportunity_cols = dict()
opportunity_cols_txt = open("datasets/opportunity/column_names.txt", "r")

for line in opportunity_cols_txt:

    if re.search('Column',line) != None:
        col_id = re.search('Column: (\S*) ',line).group(1)
        opportunity_cols[col_id] = dict()
        opportunity_cols[col_id]['name'] = re.search('Column: \S* (\S*)',line).group(1)

        if re.search('Column: \S* \S* \S*',line) != None:
            opportunity_cols[col_id]['sensor'] = re.search('Column: \S* \S* (\S*)',line).group(1)
            opportunity_cols[col_id]['sensor_axis'] = re.search(' (\S*);',line).group(1)
            opportunity_cols[col_id]['value_type'] = re.search('value = (.*),',line).group(1)
            opportunity_cols[col_id]['unit'] = re.search('unit =(.*)$',line).group(1)
            
opportunity_cols

{'1': {'name': 'MILLISEC'},
 '2': {'name': 'Accelerometer',
  'sensor': 'RKN^',
  'sensor_axis': 'accX',
  'value_type': 'round(original_value)',
  'unit': ' milli g'},
 '3': {'name': 'Accelerometer',
  'sensor': 'RKN^',
  'sensor_axis': 'accY',
  'value_type': 'round(original_value)',
  'unit': ' milli g'},
 '4': {'name': 'Accelerometer',
  'sensor': 'RKN^',
  'sensor_axis': 'accZ',
  'value_type': 'round(original_value)',
  'unit': ' milli g'},
 '5': {'name': 'Accelerometer',
  'sensor': 'HIP',
  'sensor_axis': 'accX',
  'value_type': 'round(original_value)',
  'unit': ' milli g'},
 '6': {'name': 'Accelerometer',
  'sensor': 'HIP',
  'sensor_axis': 'accY',
  'value_type': 'round(original_value)',
  'unit': ' milli g'},
 '7': {'name': 'Accelerometer',
  'sensor': 'HIP',
  'sensor_axis': 'accZ',
  'value_type': 'round(original_value)',
  'unit': ' milli g'},
 '8': {'name': 'Accelerometer',
  'sensor': 'LUA^',
  'sensor_axis': 'accX',
  'value_type': 'round(original_value)',
  'unit': '

In [124]:
col_names = list()
for key, column in opportunity_cols.items():
    col_names.append(key+"_"+column.get('name'))
col_names.insert(0,'0_User')
col_names

['0_User',
 '1_MILLISEC',
 '2_Accelerometer',
 '3_Accelerometer',
 '4_Accelerometer',
 '5_Accelerometer',
 '6_Accelerometer',
 '7_Accelerometer',
 '8_Accelerometer',
 '9_Accelerometer',
 '10_Accelerometer',
 '11_Accelerometer',
 '12_Accelerometer',
 '13_Accelerometer',
 '14_Accelerometer',
 '15_Accelerometer',
 '16_Accelerometer',
 '17_Accelerometer',
 '18_Accelerometer',
 '19_Accelerometer',
 '20_Accelerometer',
 '21_Accelerometer',
 '22_Accelerometer',
 '23_Accelerometer',
 '24_Accelerometer',
 '25_Accelerometer',
 '26_Accelerometer',
 '27_Accelerometer',
 '28_Accelerometer',
 '29_Accelerometer',
 '30_Accelerometer',
 '31_Accelerometer',
 '32_Accelerometer',
 '33_Accelerometer',
 '34_Accelerometer',
 '35_Accelerometer',
 '36_Accelerometer',
 '37_Accelerometer',
 '38_InertialMeasurementUnit',
 '39_InertialMeasurementUnit',
 '40_InertialMeasurementUnit',
 '41_InertialMeasurementUnit',
 '42_InertialMeasurementUnit',
 '43_InertialMeasurementUnit',
 '44_InertialMeasurementUnit',
 '45_Iner

In [110]:
adl_filename_mask = 'S{}-ADL{}.dat'
drill_filename_mask = 'S{}-Drill.dat'
df_opportunity_adl = pd.DataFrame()
df_opportunity_drill = pd.DataFrame()

for user_idx in range(1,5):
    for run in range (1,6):
        path = 'datasets/opportunity/'+adl_filename_mask.format(user_idx, run)
        df_partial_adl = pd.read_csv(path, header=None, sep='\s')
        df_partial_adl.insert(0,'User',user_idx)
        df_opportunity_adl = df_opportunity_adl.append(df_partial_adl) 
    
    path = 'datasets/opportunity/'+adl_filename_mask.format(user_idx, run)
    df_partial_drill = pd.read_csv(path, header=None, sep='\s')
    df_partial_drill.insert(0,'User',user_idx)
    df_opportunity_drill = df_opportunity_drill.append(df_partial_adl)

print(df_opportunity_adl.shape)

#df_adl = pd.read_csv('drive/My Drive/University/Data Science/Machine Learning/datasets/opportunity/S1-ADL1.dat', encoding='latin_1', header=None, sep='\s')

  if __name__ == '__main__':
  


(644635, 251)


In [125]:
df_opportunity_adl.columns = col_names
#df_opportunity_drill.shape

In [0]:
df_adl.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,0,87.0,975.0,-287.0,11.0,1001.0,163.0,95.0,975.0,152.0,...,5789.0,2907.0,1447.0,0,0,0,0,0,0,0
1,33,124.0,978.0,-389.0,-7.0,1014.0,199.0,124.0,968.0,123.0,...,5789.0,2908.0,1443.0,0,0,0,0,0,0,0
2,67,102.0,996.0,-440.0,-49.0,1024.0,193.0,127.0,1001.0,113.0,...,5789.0,2910.0,1440.0,0,0,0,0,0,0,0
3,100,59.0,861.0,-384.0,-9.0,1023.0,202.0,110.0,1007.0,106.0,...,5789.0,2912.0,1440.0,0,0,0,0,0,0,0
4,133,119.0,946.0,-426.0,-22.0,1026.0,188.0,98.0,1001.0,92.0,...,5791.0,2915.0,1442.0,0,0,0,0,0,0,0
5,167,99.0,972.0,-365.0,-3.0,1027.0,162.0,72.0,994.0,136.0,...,5790.0,2917.0,1445.0,0,0,0,0,0,0,0
6,200,116.0,960.0,-344.0,30.0,978.0,175.0,41.0,994.0,123.0,...,5786.0,2917.0,1450.0,0,0,0,0,0,0,0
7,233,171.0,945.0,-330.0,-4.0,995.0,201.0,55.0,975.0,123.0,...,5783.0,2919.0,1456.0,0,0,0,0,0,0,0
8,267,145.0,971.0,-355.0,-11.0,1031.0,222.0,65.0,1009.0,123.0,...,5781.0,2921.0,1464.0,0,0,0,0,0,0,0
9,300,106.0,950.0,-356.0,12.0,1027.0,172.0,62.0,1092.0,87.0,...,5779.0,2925.0,1473.0,0,0,0,0,0,0,0
