In [None]:
# Notebook magic: an autosave interval of 0 switches periodic autosave off.
%autosave 0

In [None]:
import pandas as pd
import numpy as np
import os as os
import matplotlib.pyplot as plt

import re
import nltk

# Data Set 1: sentiment140

Abstract: This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 4 = positive) and can be used to detect sentiment. <sup>1</sup>

<sup>1</sup> https://www.kaggle.com/kazanova/sentiment140

## Load raw sentiment140.csv

In [None]:
# Column labels are supplied explicitly through `names=`; the file is decoded
# as latin_1.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
dataset_path = 'datasets/sentiment/part-1-training.1600000.processed.noemoticon.csv.csv'

df_raw = pd.read_csv(
    dataset_path,
    names=DATASET_COLUMNS,
    encoding='latin_1',
)
df_raw.head()

### Preprocess: extract the weekday from the date into a separate column

In [None]:
def export_to_csv(df, file_name):
    """Write ``df`` to ``file_name`` as a UTF-8 CSV without the index column.

    The destination path is printed before the file is written.
    """
    print('write file to: ', file_name)
    df.to_csv(path_or_buf=file_name, index=False, encoding='utf-8')

def clean_date(df):
    """Add a ``weekday`` column and parse ``date`` into pandas datetimes.

    The weekday is the first three characters of the raw ``date`` string
    (e.g. ``'Thu'`` from ``'Thu Apr 23 13:38:19 +0000 2009'``), so it has to be
    extracted before the column is converted to datetimes.

    The work is done on a copy: the caller's frame is left untouched, which
    keeps the function re-runnable on the same raw input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``date``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an added ``weekday`` column and with ``date``
        converted by ``pd.to_datetime``.
    """
    df_dated = df.copy()

    # Vectorised string slice instead of a Python-level lambda per row.
    # (Original author's note: weekday extraction took about 1 minute.)
    df_dated['weekday'] = df_dated['date'].str[0:3]

    # Original author's note: parsing to pandas datetime takes about 5 minutes.
    # Standard-library equivalent for a single value:
    #   from datetime import datetime
    #   datetime.strptime('Thu Apr 23 13:38:19 +0000 2009',
    #                     '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d %H:%M:%S')
    df_dated['date'] = pd.to_datetime(df_dated['date'])
    return df_dated

def preprocess_tweet(tweet):
    """Normalise the text of a single tweet.

    Steps, in order:
    1. lower-case the text,
    2. replace every ``www.…`` or ``http(s)://…`` token with the literal ``URL``
       (done after lower-casing, so the placeholder stays upper-case),
    3. collapse every run of whitespace into one space,
    4. strip the leading ``#`` from hashtags, keeping the topic word.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The normalised text.
    """
    # Patterns are raw strings: '\.' and '\s' in a normal string literal are
    # invalid escape sequences and trigger warnings on current Python versions.
    tweet = tweet.lower()
    # convert all urls to string "URL"
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # convert all @username to "AT_USER" (currently disabled)
    #tweet = re.sub(r'@[^\s]+','AT_USER', tweet)
    # correct all multiple white spaces to a single white space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet

def preprocess(df):
    """Run the full cleaning pipeline and return the enriched frame.

    A progress message is printed before each stage:
    - ``clean_date`` handles the date column,
    - ``preprocess_tweet`` is applied to every entry of ``text``,
    - ``text_wc`` receives the number of ``nltk.word_tokenize`` tokens per text,
    - ``text_len`` receives the character length per text.
    """
    # split and clean date
    print("clean date...")
    cleaned = clean_date(df)

    # clean tweet text
    print("clean text...")
    cleaned['text'] = cleaned['text'].apply(preprocess_tweet)

    # add col text wordcount
    print("add wc...")
    word_counts = [len(nltk.word_tokenize(tweet)) for tweet in cleaned['text']]
    cleaned['text_wc'] = word_counts

    # add col text string length
    print("add len...")
    char_counts = [len(tweet) for tweet in cleaned['text']]
    cleaned['text_len'] = char_counts

    return cleaned
    
#df_clean = preprocess(df_raw)
#dataset_path = 'datasets/sentiment/sentiment140_date_clean.csv'
#export_to_csv(df_clean, dataset_path)

## Read Clean sentiment140

In [None]:
# Read the cleaned export (column names come from its first row) and convert
# the `date` column to datetimes.
dataset_path = 'datasets/sentiment/sentiment140_date_clean.csv'
df_clean = pd.read_csv(dataset_path, header=0, encoding='utf-8')
df_clean = df_clean.assign(date=pd.to_datetime(df_clean['date']))
df_clean.head()

## Column Data Types 

In [None]:
# Display the dtype of every column of the cleaned frame.
df_clean.dtypes

## Shape and Size

In [None]:
# Report the row and column counts of the cleaned frame.
n_rows, n_cols = df_clean.shape
print("Dataset has ", n_rows, "rows and ", n_cols, "columns.")

## Sentiment Distribution

In [None]:
# Count tweets per sentiment label. Naming the index and the value column
# explicitly keeps the result independent of how a given pandas version labels
# the output of value_counts().reset_index().
target_cnt = (
    df_clean['target']
    .value_counts()
    .rename_axis('SENTIMENT')
    .reset_index(name='COUNT')
)
# Translate the numeric labels into readable ones.
target_cnt['SENTIMENT'] = target_cnt['SENTIMENT'].map({4: 'pos', 2: 'neu', 0: 'neg'})

plt.figure(figsize=(10, 5))
plt.bar(target_cnt['SENTIMENT'], target_cnt['COUNT'])
plt.title("Sentiment label distribution")
plt.xlabel("sentiment")
plt.ylabel("count")

# Last expression of the cell: show the count table below the chart.
target_cnt

## Distribution by days

In [None]:
# Tweets per day of the month: group on the day component of `date` and count
# the non-null `ids` in every group.
day_of_month = df_clean['date'].dt.day
ids_per_day = df_clean.groupby(by=day_of_month)['ids'].count()

df_time_plot = pd.DataFrame({"day": ids_per_day.index, "count": ids_per_day})

# Note: this changes the default figure size for all later plots as well.
plt.rcParams["figure.figsize"] = (20, 10)
ax = df_time_plot.plot.bar(x='day', y='count', rot=0)

## Distribution by Weekdays

In [None]:
# Tweets per weekday: count the non-null `ids` in every weekday group.
ids_per_weekday = df_clean.groupby(by=df_clean['weekday'])['ids'].count()
df_weekday_plot = pd.DataFrame({"day": ids_per_weekday.index, "count": ids_per_weekday})

# An ordered categorical makes the sort follow the calendar week instead of
# the alphabet.
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
df_weekday_plot['day'] = pd.Categorical(
    df_weekday_plot['day'],
    categories=weekday_order,
    ordered=True,
)
df_weekday_plot = df_weekday_plot.sort_values('day')

# Note: this changes the default figure size for all later plots as well.
plt.rcParams["figure.figsize"] = (15, 8)
ax = df_weekday_plot.plot.bar(x='day', y='count', rot=0)

## distribution of text string length

In [None]:
# Histogram of the character length of the tweets.
plt.figure(figsize=(15, 7))
plt.hist(df_clean.text_len, bins=400)
plt.title("distribution of string length")
plt.xlabel("length")
# The frequency belongs on the y axis; a second xlabel call would overwrite
# the "length" label set above.
plt.ylabel("count")
plt.show()

# Box plot of the same column; outliers are drawn as blue diamonds.
plt.figure(figsize=(15, 7))
flier_style = dict(markerfacecolor='b', marker='D')
plt.boxplot(df_clean.text_len, flierprops=flier_style)
plt.title("distribution of string length")
plt.show()

## distribution of text wordcount

In [None]:
# Histogram of the number of words per tweet.
plt.figure(figsize=(15, 7))
plt.hist(df_clean.text_wc, bins=300)
plt.xlabel("word count")
# The frequency belongs on the y axis; a second xlabel call would overwrite
# the x-axis label set above.
plt.ylabel("count")
plt.title("distribution of word count")
plt.show()

# Box plot of the same column; outliers are drawn as blue diamonds.
plt.figure(figsize=(15, 7))
flier_style = dict(markerfacecolor='b', marker='D')
plt.boxplot(df_clean.text_wc, flierprops=flier_style)
plt.title("distribution of word count")
plt.show()

# Data Set 2: OPPORTUNITY Activity Recognition Dataset

Abstract: "The OPPORTUNITY Dataset for Human Activity Recognition from Wearable, Object, and Ambient Sensors is a dataset devised to benchmark human activity recognition algorithms (classification, automatic data segmentation, sensor fusion, feature extraction, etc.)." <sup>1</sup>

<sup>1</sup> https://archive.ics.uci.edu/ml/datasets/OPPORTUNITY+Activity+Recognition#

# Activities of daily living (ADL)
Read in the activities-of-daily-living (ADL) recordings for all users.

In [None]:
# Read in column names.
# Result: {column id: {'name': ..., and for lines that carry a third token
# after "Column:": 'sensor', 'sensor_axis', 'value_type', 'unit'}}
opportunity_cols = dict()

# "Column: <id> <name>" - id and name of a column description line.
column_header_re = re.compile(r'Column: (\S*) (\S*)')
# "Column: <id> <name> <sensor>" - a third token is only present on some lines.
column_sensor_re = re.compile(r'Column: \S* \S* (\S*)')

# The context manager closes the file even if a line fails to parse.
with open("datasets/opportunity/column_names.txt", "r") as opportunity_cols_txt:
    for line in opportunity_cols_txt:
        header = column_header_re.search(line)
        if header is None:
            # Not a "Column: <id> <name>" line - nothing to record.
            continue

        col_id = header.group(1)
        opportunity_cols[col_id] = dict()
        opportunity_cols[col_id]['name'] = header.group(2)

        sensor = column_sensor_re.search(line)
        if sensor is not None:
            opportunity_cols[col_id]['sensor'] = sensor.group(1)
            # Token directly in front of the ';'.
            opportunity_cols[col_id]['sensor_axis'] = re.search(r' (\S*);', line).group(1)
            # Text between "value = " and the last ',' on the line.
            opportunity_cols[col_id]['value_type'] = re.search(r'value = (.*),', line).group(1)
            # Everything after "unit =" up to the end of the line.
            opportunity_cols[col_id]['unit'] = re.search(r'unit =(.*)$', line).group(1)
            
#opportunity_cols

In [None]:
# One "<id>_<name>" label per parsed column, preceded by the label for the
# user column that sits in front of the sensor columns.
col_names = ['0_User'] + [
    col_id + "_" + col_meta.get('name')
    for col_id, col_meta in opportunity_cols.items()
]
#col_names

In [None]:
adl_filename_mask = 'S{}-ADL{}.dat'
drill_filename_mask = 'S{}-Drill.dat'
opportunity_dir = 'datasets/opportunity/'


def _read_opportunity_file(path, user_idx):
    """Read one header-less .dat file and put a 'User' column in front.

    The separator is the single-whitespace regex the notebook has always used;
    it is now a raw string, and the python engine is requested explicitly
    because pandas falls back to it for such regex separators anyway.
    """
    df_part = pd.read_csv(path, header=None, sep=r'\s', engine='python')
    df_part.insert(0, 'User', user_idx)
    return df_part


# Collect the per-file frames in lists and concatenate once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every call.
adl_parts = []
drill_parts = []

for user_idx in range(1, 5):
    for run in range(1, 6):
        path = opportunity_dir + adl_filename_mask.format(user_idx, run)
        adl_parts.append(_read_opportunity_file(path, user_idx))

    # One drill recording per user: it is read through the drill mask and
    # collected in the drill list (previously the last ADL run was re-read and
    # the ADL frame was appended here).
    path = opportunity_dir + drill_filename_mask.format(user_idx)
    drill_parts.append(_read_opportunity_file(path, user_idx))

df_opportunity_adl = pd.concat(adl_parts)
df_opportunity_drill = pd.concat(drill_parts)

print(df_opportunity_adl.shape)

#df_adl = pd.read_csv('drive/My Drive/University/Data Science/Machine Learning/datasets/opportunity/S1-ADL1.dat', encoding='latin_1', header=None, sep='\s')

In [None]:
#export_to_csv(df_opportunity_adl, "datasets/opportunity/opportunity_adl.csv")
#export_to_csv(df_opportunity_drill, "datasets/opportunity/opportunity_drill.csv")

In [None]:
# Load the two OPPORTUNITY CSV files with the default read_csv settings.
df_opportunity_adl, df_opportunity_drill = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/opportunity/opportunity_adl.csv',
        'datasets/opportunity/opportunity_drill.csv',
    )
)

In [None]:
# (rows, columns) of the ADL frame.
df_opportunity_adl.shape

In [None]:
# First ten rows of the ADL frame with the column labels as read from the CSV.
df_opportunity_adl.head(10)

In [None]:
# Replace the column labels with the names in `col_names`; pandas requires one
# name per existing column. Assumes `col_names` is in file column order - TODO confirm.
df_opportunity_adl.columns = col_names

In [None]:
# First ten rows of the ADL frame again, to inspect the current column labels.
df_opportunity_adl.head(10)

## I am a pretty Markdown cell