In [2]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

In [3]:
regression_example = \
r'C:\Users\Brian\Desktop\udemy\aws_machine_learning\Data\RegressionExamples\BikeTrain\train.csv'

binary_class_example = \
r'C:\Users\Brian\Desktop\udemy\aws_machine_learning\Data\ClassExamples\DiabetesData\pima-indians-diabetes.data.txt'

multi_class_example = \
r'C:\Users\Brian\Desktop\udemy\aws_machine_learning\ClassExamples\Iris\iris.data.csv'

data_type_example = \
r'C:\Users\Brian\Desktop\udemy\aws_machine_learning\ClassExamples\TwitterAWS\aml_training_dataset.csv'

## Labeled Data 
Labeled Data - contains the features and target attribute with correct answer
## Training Set
Part of labeled data that is used for training the model. 60-70% of the labeled data is used for training
## Evaluation Set
30-40% of the labeled data is reserved for checking prediction quality with known correct answer
## Training Example
Training Example or a row. Contains one complete observation of features and the correct answer
## Features
Known by different names: Columns, Features, variables, attributes. These are values that define a particular example. Values for some of the features could be missing or invalid in real-world datasets.  So, some cleaning may have to be done before feeding to Machine Learning  

1. Input Feature - Variables that are provided as input to the model
2. Target Attribute - Variable that model needs to predict



<h2>AWS ML Data Types</h2>
<h5>Data from training set needs to be mapped to one of these data types</h5>
<h4>1. Binary.  Can contain only two state 1/0.</h4>
  <ul>
  <li>Positive Values: 1,y,yes,t,true</li>
  <li>Negative Values: 0,n,no,f,false</li>
  <li>Values are case-insensitive and AWS ML converts the above values to 1 or 0</li>
  </ul>
<h4>2. Categorical.  Qualitative attribute that describes something about an observation.</h4>
<h5>Example</h5>
   <ul>
   <li>Day of the week: Sunday, Monday, Tuesday,...</li>
   <li>Size: XL,L,M,S </li>
   <li>Month: 1,2,3,4,...12</li>
   <li>Season: 1,2,3,4</li>
   </ul>
<h4>3. Numeric. Measurements, Counts are represented as numeric types</h4>
   <ul>
   <li>Discrete: 20 cars, 30 trees, 2 ships</li>
   <li>Continous: 98.7 degree F, 32.5 KMPH, 2.6 liters </li>   
   </ul>
<h4>4. Text.  String of words. AWS ML automatically tokenizes at white space boundary</h4>
<h5>Example</h5>
   <ul>
   <li>product description, comment, reviews</li>
   <li>Tokenized: 'Wildlife Photography for beginners' => {‘Wildlife’, ‘Photography’, ‘for’, ‘beginners’}</li>   
   </ul>   

<h1>Algorithms</h1>
<h2>Linear Regression</h2>
Predict a numeric value as output based on given features
<p>Examples: 
What is the market value of a car? 
What is the current value of a House?
For a product, how many units can we sell?</p>

<h5>Concrete Example</h5>
Kaggle Bike Rentals - Predict number of bike rentals every hour.  Total should include both casual
    rentals and registered users rentals

Input Columns/Features = ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
                          'atemp', 'humidity', 'windspeed']

Output Column/Target Attribute = 'count'<br>
count = casual + registered <br>
Option 1: Predict casual and registered counts separately and then sum it up <br>
Option 2: Predict count directly

In [4]:
# read the bike train csv file
df = pd.read_csv(regression_example)

In [5]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [6]:
df.corr()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
season,1.0,0.029368,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,0.096758,0.164011,0.163439
holiday,0.029368,1.0,-0.250491,-0.007074,0.000295,-0.005215,0.001929,0.008409,0.043799,-0.020956,-0.005393
workingday,-0.008126,-0.250491,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,-0.319111,0.11946,0.011594
weather,0.008879,-0.007074,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.135918,-0.10934,-0.128655
temp,0.258689,0.000295,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.467097,0.318571,0.394454
atemp,0.264744,-0.005215,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.462067,0.314635,0.389784
humidity,0.19061,0.001929,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.348187,-0.265458,-0.317371
windspeed,-0.147121,0.008409,0.013373,0.007261,-0.017852,-0.057473,-0.318607,1.0,0.092276,0.091052,0.101369
casual,0.096758,0.043799,-0.319111,-0.135918,0.467097,0.462067,-0.348187,0.092276,1.0,0.49725,0.690414
registered,0.164011,-0.020956,0.11946,-0.10934,0.318571,0.314635,-0.265458,0.091052,0.49725,1.0,0.970948


In [7]:
df['count'].describe()

count    10886.000000
mean       191.574132
std        181.144454
min          1.000000
25%         42.000000
50%        145.000000
75%        284.000000
max        977.000000
Name: count, dtype: float64

In [8]:
df.season.value_counts()

4    2734
3    2733
2    2733
1    2686
Name: season, dtype: int64

In [9]:
df.holiday.value_counts()

0    10575
1      311
Name: holiday, dtype: int64

In [10]:
df.workingday.value_counts()

1    7412
0    3474
Name: workingday, dtype: int64

In [11]:
df.weather.value_counts()

1    7192
2    2834
3     859
4       1
Name: weather, dtype: int64

In [12]:
df.temp.describe()

count    10886.00000
mean        20.23086
std          7.79159
min          0.82000
25%         13.94000
50%         20.50000
75%         26.24000
max         41.00000
Name: temp, dtype: float64

<h2>Binary Classification</h2>
Predict a binary class as output based on given features

<p>Examples: Do we need to follow up on a customer review?
Is this transaction fraudulent or valid one?
Are there signs of onset of a medical condition or disease?
Is this considered junk food or not?
</p>


<h5>Concrete Example</h5>
pima-indians-diabetes - Predict if a given patient has a risk of getting diabetes<br>
https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes

Input Columns/Features = ['preg_count', 'glucose_concentration', 'diastolic_bp',
       'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
       'diabetes_pedi', 'age']

Output Column/Target Attribute = 'diabetes_class'.  1 = diabetic, 0 = normal

In [13]:
df = pd.read_csv(binary_class_example)

In [14]:
df.head()

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
df.columns

Index(['preg_count', 'glucose_concentration', 'diastolic_bp',
       'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
       'diabetes_pedi', 'age', 'diabetes_class'],
      dtype='object')

In [16]:
df.corr()

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
preg_count,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
glucose_concentration,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
diastolic_bp,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
triceps_skin_fold_thickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
two_hr_serum_insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
bmi,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
diabetes_pedi,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
diabetes_class,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [17]:
df.age.value_counts().head()

22    72
21    63
25    48
24    46
23    38
Name: age, dtype: int64

In [18]:
df.diabetes_class.value_counts()

0    500
1    268
Name: diabetes_class, dtype: int64

<h2>Multiclass Classification</h2>
Predict a class as output based on given features

Examples:
    1. How healthy is the food based on given ingredients?
    Classes: Healthy, Moderate, Occasional, Avoid.
    
    2. Identify type of mushroom based on features
    
    3. What type of advertisement can be placed for this search?

<h5>Concrete Example</h5>
Iris Classification - Predict the type of Iris plant based on flower measurments<br>
https://archive.ics.uci.edu/ml/datasets/Iris

Input Columns/Features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

Output Column/Target Attribute = 'class'.  

Class: Iris-setosa,Iris-virginica,Iris-versicolor

In [19]:
df = pd.read_csv(multi_class_example)

OSError: File b'C:\\Users\\Brian\\Desktop\\udemy\\aws_machine_learning\\ClassExamples\\Iris\\iris.data.csv' does not exist

In [20]:
np.random.seed(5)
# print 10 random rows
df.ix[np.random.randint(0,df.shape[0],10)]

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
206,8,196,76,29,280,37.5,0.605,57,1
701,6,125,78,31,0,27.6,0.565,49,1
118,4,97,60,23,0,28.2,0.443,22,0
400,4,95,64,0,0,32.0,0.161,31,1
73,4,129,86,20,270,35.1,0.231,23,0
8,2,197,70,45,543,30.5,0.158,53,1
740,11,120,80,37,150,42.3,0.785,48,1
743,9,140,94,0,0,32.7,0.734,45,1
411,1,112,72,30,176,34.4,0.528,25,0
624,2,108,64,0,0,30.8,0.158,21,0


In [None]:
df.columns

In [27]:
df['class'].value_counts()

Iris-versicolor    50
Iris-virginica     50
Iris-setosa        50
Name: class, dtype: int64

In [None]:
df.sepal_length.describe()

# Data Types

In [None]:
df = pd.read_csv(data_type_example)

In [None]:
df.columns

In [None]:
df[['description','favourites_count','favorited','text', 'screen_name','trainingLabel']].head()