# Python Ecosystem for Machine Learning

In [None]:
import scipy
import numpy
import matplotlib
import pandas

# Report the interpreter and library versions this notebook runs against.
# platform.python_version() replaces the IPython-only `!python --version`
# shell escape: it is plain Python, and it describes the interpreter that is
# executing this code rather than whichever `python` happens to be on PATH.
import platform

print('Python %s' % platform.python_version())
print('scipy version: %s'%(scipy.__version__))
print('numpy version: %s'%(numpy.__version__))
print('matplotlib version: %s'%(matplotlib.__version__))
print('pandas version: %s'%(pandas.__version__))

# Crash Course in Python and SciPy

In [None]:
# Strings: show the first character, the length, then the whole string,
# one item per output line.
data = 'hello world'
print(data[0], len(data), data, sep='\n')

In [None]:
# Numbers: the same name is bound first to a float, then to an int.
for value in (123.1, 10):
    print(value)

In [None]:
# Boolean: the two truth values, bound in a single statement.
a, b = True, False
print(a, b)

In [None]:
# Multiple Assignment: unpack a three-element tuple into three names.
a, b, c = (1, 2, 3)
print(a, b, c)

In [None]:
# No value 
# None is Python's built-in "no value" object; printing it shows the text None.
a = None 
print(a)

In [None]:
# If-Then-Else Conditional: pick the message in the branches, print it once.
value = 99
if value == 99:
    message = 'That is fast'
elif value > 200:
    message = 'That is too fast'
else:
    message = 'That is safe'
print(message)

In [None]:
# For-Loop: print the integers 0 through 9, one per line.
for i in range(0, 10, 1):
    print(i)

In [None]:
# While-Loop: print 0 through 9 using an explicit counter.
i = 0
while i <= 9:
    print(i)
    i = i + 1

In [None]:
# Tuple: the commas build the tuple; the parentheses are optional here.
a = 1, 2, 3
print(a)

In [None]:
# List: index the first element, grow the list in place, then iterate it.
mylist = [1, 2, 3]
print("Zeroth Value: %d" % mylist[0])
mylist += [4]
print("List Length: %d" % len(mylist))
for value in mylist:
    print(value)

In [None]:
# Dictionary: look up, overwrite, and iterate key/value pairs.
mydict = {' a': 1, ' b': 2, ' c': 3}
print("A value: %d" % mydict[' a'])
mydict.update({' a': 11})
print("A value: %d" % mydict[' a'])
print("Keys: %s" % mydict.keys())
print("Values: %s" % mydict.values())
# Iterating a dict yields its keys in insertion order.
for key in mydict:
    print(mydict[key])

In [None]:
# Sum function
def mysum(x, y):
    """Return the result of ``x + y``."""
    total = x + y
    return total


# Test sum function
result = mysum(1, 3)
print(result)

In [None]:
# define an array: convert a Python list to a NumPy array and show its shape
import numpy

mylist = [1, 2, 3]
myarray = numpy.asarray(mylist)
print(myarray)
print(myarray.shape)

In [None]:
# access values: rows, a single cell, and a whole column of a 2-D array
import numpy

mylist = [[1, 2, 3], [3, 4, 5]]
myarray = numpy.array(mylist)
print(myarray)
print(myarray.shape)
print("First row: %s" % myarray[0, :])
print("Last row: %s" % myarray[-1, :])
print("Specific row and col: %s" % myarray[0][2])
print("Whole col: %s" % myarray[..., 2])

In [None]:
# arithmetic: element-wise addition and multiplication of two arrays
import numpy

myarray1 = numpy.array([2, 2, 2])
myarray2 = numpy.array([3, 3, 3])
print("Addition: %s" % numpy.add(myarray1, myarray2))
print("Multiplication: %s" % numpy.multiply(myarray1, myarray2))

In [None]:
# basic line plot: values are plotted against their positions 0..n-1
import matplotlib.pyplot as plt
import numpy

myarray = numpy.array([1, 2, 3])
plt.plot(myarray)
plt.xlabel(xlabel=' some x axis')
plt.ylabel(ylabel=' some y axis')
plt.show()

In [None]:
# basic scatter plot: y is twice x, giving the points (1,2), (2,4), (3,6)
import matplotlib.pyplot as plt
import numpy

x = numpy.array([1, 2, 3])
y = 2 * x
plt.scatter(x=x, y=y)
plt.xlabel(xlabel=' some x axis')
plt.ylabel(ylabel=' some y axis')
plt.show()

In [None]:
# series: a one-dimensional labelled array built from a NumPy array
import numpy
import pandas

rownames = [' a', ' b', ' c']
myarray = numpy.array([1, 2, 3])
myseries = pandas.Series(data=myarray, index=rownames)
print(myseries)

In [None]:
# Access a Series element by position and then by label.
# .iloc makes the positional lookup explicit: a bare integer key on a
# string-labelled Series relies on a positional fallback that pandas has
# deprecated.
print(myseries.iloc[0])
print(myseries[' a' ])

In [None]:
# dataframe: a 2x3 labelled table built from a NumPy array
import numpy
import pandas

rownames = ['a', 'b']
colnames = ['one', 'two', 'three']
myarray = numpy.array([[1, 2, 3], [4, 5, 6]])
mydataframe = pandas.DataFrame(data=myarray, index=rownames, columns=colnames)
print(mydataframe)

In [None]:
# Two ways to select the same column: by key and by attribute.
by_key = mydataframe['one']
by_attribute = mydataframe.one
print("method 1:")
print("one column:\n%s" % by_key)
print("method 2:")
print("one column:\n%s" % by_attribute)

# How To Load Machine Learning Data

In [None]:
# Load CSV Using Python Standard Library
import csv
import numpy

filename = 'pima-indians-diabetes.data.csv'
# The context manager closes the file even if parsing fails; newline='' is
# what the csv module asks for when it is handed a file object.
with open(filename, 'rt', newline='') as raw_data:
    reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
    # Materialise the rows while the file is still open.
    x = list(reader)
# Every field is parsed as text, so convert the whole table to floats.
data = numpy.array(x).astype('float')
print(data.shape)

In [None]:
# Load CSV using NumPy
from numpy import loadtxt

filename = 'pima-indians-diabetes.data.csv'
# Open the file in a context manager so the handle is closed once loadtxt
# has read it, instead of being left open for the rest of the session.
with open(filename, 'rt') as raw_data:
    data = loadtxt(raw_data, delimiter=",")
print(data.shape)

In [None]:
# Load CSV from URL using NumPy
from numpy import loadtxt
from urllib.request import urlopen

url = 'https://goo.gl/bDdBiA'
# The response object is a context manager; using it as one releases the
# network connection as soon as the data has been parsed.
with urlopen(url) as raw_data:
    dataset = loadtxt(raw_data, delimiter=",")
print(dataset.shape)

In [None]:
# Load CSV using Pandas
from urllib.request import urlopen
from pandas import read_csv

filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
# Passing names explicitly makes pandas treat every row as data, which is
# the same as header=None.
data = read_csv(filename, names=names, header=None)
print(data.shape)

In [None]:
# Load CSV using Pandas from URL: read_csv accepts a URL in place of a path
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
url = 'https://goo.gl/bDdBiA'
data = read_csv(url, names=names, header=None)
print(data.shape)

# Understand Your Data With Descriptive Statistics

In [None]:
# View first 20 rows
from pandas import read_csv

filename = "pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names, header=None)
peek = data.head(n=20)
print(peek)

In [None]:
# Dimensions of your data: shape is a (rows, columns) tuple
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
filename = "pima-indians-diabetes.data.csv"
data = read_csv(filename, names=names, header=None)
shape = data.shape
print(shape)

In [None]:
# Data Types for Each Attribute: one dtype entry per column
from pandas import read_csv

filename = "pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names, header=None)
types = data.dtypes
print(types)

In [None]:
# Statistical Summary
from pandas import read_csv
from pandas import set_option

filename = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
set_option('display.width', 100)
# Use the fully qualified option name: the bare 'precision' key is no longer
# accepted by current pandas releases.
set_option('display.precision', 3)
description = data.describe()
print(description)

In [None]:
# Class Distribution: number of rows for each value of the 'class' column
from pandas import read_csv

filename = "pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names, header=None)
grouped_by_class = data.groupby(by='class')
class_counts = grouped_by_class.size()
print(class_counts)

In [None]:
# Pairwise Pearson correlations
from pandas import read_csv
from pandas import set_option

filename = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
set_option('display.width', 100)
# Use the fully qualified option name: the bare 'precision' key is no longer
# accepted by current pandas releases.
set_option('display.precision', 3)
correlations = data.corr(method='pearson')
print(correlations)

In [None]:
# Skew for each attribute: computed column by column
from pandas import read_csv

names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
filename = "pima-indians-diabetes.data.csv"
data = read_csv(filename, names=names, header=None)
skew = data.skew(axis=0)
print(skew)

# Understand Your Data With Visualization

In [None]:
# Univariate Histograms: one histogram per column
from matplotlib import pyplot
from pandas import read_csv

filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names, header=None)
data.hist()
# tight_layout is applied before the figure is shown.
pyplot.tight_layout()
pyplot.show()

In [None]:
# Univariate Density Plots: one density curve per column on a 3x3 grid
from matplotlib import pyplot
from pandas import read_csv

filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names, header=None)
plot_options = dict(kind='density', subplots=True, layout=(3, 3), sharex=False)
data.plot(**plot_options)
pyplot.tight_layout()
pyplot.show()

In [None]:
# Box and Whisker Plots: one box plot per column, each with its own axes
from matplotlib import pyplot
from pandas import read_csv

filename = "pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names, header=None)
plot_options = dict(
    kind='box', subplots=True, layout=(3, 3), sharex=False, sharey=False,
)
data.plot(**plot_options)
pyplot.tight_layout()
pyplot.show()

In [None]:
# Correlation Matrix Plot
from matplotlib import pyplot
from pandas import read_csv
import numpy

filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
correlations = data.corr()

# plot correlation matrix
fig = pyplot.figure()
ax = fig.add_subplot(111)
# Pin the colour scale to -1..1 via vmin/vmax.
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per column name, derived from the list itself so the tick count
# and the labels cannot get out of step if the columns change.
ticks = numpy.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
pyplot.show()

In [None]:
# Scatterplot Matrix: pairwise scatter plots for every pair of columns
from matplotlib import pyplot
from pandas import read_csv
from pandas.plotting import scatter_matrix

names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
filename = "pima-indians-diabetes.data.csv"
data = read_csv(filename, names=names, header=None)
scatter_matrix(frame=data)
pyplot.show()