##1.2.3 Convert String to Integer
Book 1 pg 6-7
<br>
Listing 1.15: Sample Output From Integer Encoding Class Values. Pg 7

In [1]:
# Listing 1.13: Function To Integer Encode String Class Values.
# Convert string column to integer
def str_column_to_int(dataset, column):
  class_values = [row[column] for row in dataset]
  unique = set(class_values)
  lookup = dict()
  for i, value in enumerate(unique):
    lookup[value] = i
  for row in dataset:
    row[column] = lookup[row[column]]
  return lookup


In [2]:
# Example of integer encoding string class values
from csv import reader
# Load a CSV file
def load_csv(filename):
  dataset = list()
  with open(filename, 'r') as file:
    csv_reader = reader(file)
    for row in csv_reader:
      if not row:
        continue
      dataset.append(row)
  return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
  for row in dataset:
    row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
  class_values = [row[column] for row in dataset]
  unique = set(class_values)
  lookup = dict()
  for i, value in enumerate(unique):
    lookup[value] = i
  for row in dataset:
    row[column] = lookup[row[column]]
  return lookup

# Load iris dataset
filename = 'iris.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))
print(dataset[0])
# convert string columns to float
for i in range(4):
  str_column_to_float(dataset, i)
# convert class column to int
lookup = str_column_to_int(dataset, 4)
print(dataset[0])
print(lookup)

Loaded data file iris.csv with 150 rows and 5 columns
['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
[5.1, 3.5, 1.4, 0.2, 1]
{'Iris-virginica': 0, 'Iris-setosa': 1, 'Iris-versicolor': 2}


## 2.2.2 Standardize Data
Book 1 pg 15-16
<br>
Listing 2.18: Example Output From Standardizing the Contrived Dataset. Pg 16

In [4]:
# Example of standardizing a contrived dataset
from math import sqrt

# calculate column means
def column_means(dataset):
  means = [0 for i in range(len(dataset[0]))]
  for i in range(len(dataset[0])):
    col_values = [row[i] for row in dataset]
    means[i] = sum(col_values) / float(len(dataset))
  return means

# calculate column standard deviations
def column_stdevs(dataset, means):
  stdevs = [0 for i in range(len(dataset[0]))]
  for i in range(len(dataset[0])):
    variance = [pow(row[i]-means[i], 2) for row in dataset]
    stdevs[i] = sum(variance)
  stdevs = [sqrt(x/(float(len(dataset)-1))) for x in stdevs]
  return stdevs

# standardize dataset
def standardize_dataset(dataset, means, stdevs):
  for row in dataset:
    for i in range(len(row)):
      row[i] = (row[i] - means[i]) / stdevs[i]

# Standardize dataset
dataset = [[50, 30], [20, 90], [30, 50]]
print(dataset)

# Estimate mean and standard deviation
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)
print(means)
print(stdevs)

# standardize dataset
standardize_dataset(dataset, means, stdevs)
print(dataset)

[[50, 30], [20, 90], [30, 50]]
[33.333333333333336, 56.666666666666664]
[15.275252316519467, 30.550504633038933]
[[1.0910894511799618, -0.8728715609439694], [-0.8728715609439697, 1.091089451179962], [-0.21821789023599253, -0.2182178902359923]]


Book 1 pg 16-17
<br>
Listing 2.20: Example Output From Standardizing the Diabetes Dataset Pg 17.

In [5]:
# Standardize the Diabetes Dataset
from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(filename):
  dataset = list()
  with open(filename, 'r') as file:
    csv_reader = reader(file)
    for row in csv_reader:
      if not row:
        continue
      dataset.append(row)
  return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
  for row in dataset:
    row[column] = float(row[column].strip())

# calculate column means
def column_means(dataset):
  means = [0 for i in range(len(dataset[0]))]
  for i in range(len(dataset[0])):
    col_values = [row[i] for row in dataset]
    means[i] = sum(col_values) / float(len(dataset))
  return means

# calculate column standard deviations
def column_stdevs(dataset, means):
  stdevs = [0 for i in range(len(dataset[0]))]
  for i in range(len(dataset[0])):
    variance = [pow(row[i]-means[i], 2) for row in dataset]
    stdevs[i] = sum(variance)
  stdevs = [sqrt(x/(float(len(dataset)-1))) for x in stdevs]
  return stdevs

# standardize dataset
def standardize_dataset(dataset, means, stdevs):
  for row in dataset:
    for i in range(len(row)):
      row[i] = (row[i] - means[i]) / stdevs[i]

# Load pima-indians-diabetes dataset
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))

# convert string columns to float
for i in range(len(dataset[0])):
  str_column_to_float(dataset, i)
print(dataset[0])

# Estimate mean and standard deviation
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)

# standardize dataset
standardize_dataset(dataset, means, stdevs)
print(dataset[0])

Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[0.6395304921176576, 0.8477713205896718, 0.14954329852954296, 0.9066790623472505, -0.692439324724129, 0.2038799072674717, 0.468186870229798, 1.4250667195933604, 1.3650063669598067]
