In [1]:
#Importing dependencies
import random
import csv
import math
import os

In [2]:
#The csv upload function
def csv_upload(filename):
    dataset = []
    with open(filename, 'r') as file:
        csv_reader = csv.reader(file)
        
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
        del dataset[0]
        for row in dataset:
            del row[4]
    return dataset

In [3]:
#turn the data from string to float
def string_to_float(dataset):
    for row in dataset:
        for c in range(len(row)):
            row[c] = float(row[c].strip())
    return dataset        

In [4]:
random.seed(46)

In [5]:
#Standardisation of the dataset
#Getting the minimum and maximum column values
def min_max_col_values(dataset):
    minMax = []
    for c in range(len(dataset[0])):
        col_values = [row[c] for row in dataset]
        min_col = min(col_values)
        max_col = max(col_values)
        
        minMax.append([min_col, max_col])
    
    return minMax

In [6]:
#Normalization function
def minMax_normalization(dataset, minMax):
    for row in dataset:
        for c in range(len(dataset[0])):
            row[c] = float("{0:.4f}".format((row[c] - minMax[c][0]) / (minMax[c][1] - minMax[c][0])))
    return dataset

In [7]:
#splitting the data into k-folds for cross validation 
def cross_validation_split(dataset, n_folds):
    dataSplit = []
    datasetCopy = list(dataset)
    foldSize = len(dataset)/ n_folds
    
    for i in range(n_folds):
        fold = []
        while len(fold) < foldSize:
            index = random.randrange(len(datasetCopy))
            fold.append(datasetCopy.pop(index))
        dataSplit.append(fold)
    return dataSplit
    

In [8]:
#The prediction function
def predict(row, coef):
    yhat = coef[0]
    for i in range(len(row)-1):
        yhat += coef[i+1]*row[i]
    return yhat
       

In [9]:
#Calculate the RMSE
def rmse(actual, predicted):
    sum_error = 0
    for i in range(len(actual)):
        error = (actual[i] - predicted[i])**2
        sum_error += error
        
    rmse = math.sqrt(sum_error/len(actual))
    return rmse

In [10]:
# The sgd algorithm
def sgd(train, n_epoch, l_rate):
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        for row in train:
            yhat = predict(row,coef)
            error = yhat - row[-1]
            coef[0] = coef[0] - l_rate * error
            for i in range(len(row)-1):
                coef[i+1] = coef[i+1] - l_rate * error * row[i]
                
        return coef

In [11]:
# Linear Regression Algorithm using the sgd
def linear_regression(train,test,l_rate, n_epoch):
    prediction = list()
    coef = sgd(train, n_epoch, l_rate)
    
    for row in test:
        yhat = predict(row,coef)
        prediction.append(yhat)
    
    return prediction

In [12]:
#Evaluating the SGD algorithm using the cross-validation resampling method
def evaluate_sgd(train_set,n_folds,l_rate,n_epoch):
    folds = cross_validation_split(train_set, n_folds)
    scores = []
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = []
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
        predicted = linear_regression(train_set,test_set,l_rate,n_epoch)
        actual = [row[-1] for row in fold]
        sum_error = 0
        for i in range(len(actual)):
            error = (actual[i] - predicted[i])**2
            sum_error += error
            
        rmse = math.sqrt(sum_error/len(actual))
        scores.append(rmse)
    return scores
        

In [13]:
#Linear Regression on the iris_dataset
filename = "/Users/pw/Desktop/Machine Learning/mlwork/iris.csv"
dataset = csv_upload(filename)

In [14]:
dataset[0]

['5.1', '3.5', '1.4', '0.2']

In [15]:
data_float = string_to_float(dataset)

In [16]:
data_float[0]

[5.1, 3.5, 1.4, 0.2]

In [17]:
#Normalization ''' getting the minimax list (Min and max of all the columns)'''
minMax = min_max_col_values(data_float)
minMax

[[4.3, 7.9], [2.0, 4.4], [1.0, 6.9], [0.1, 2.5]]

In [18]:
#Standardization of the iris dataset
data_normalized = minMax_normalization(data_float, minMax)

In [19]:
data_normalized[0]

[0.2222, 0.625, 0.0678, 0.0417]

In [20]:
# Evaluating the algorithm
n_folds = 3
l_rate = 0.01
n_epoch = 50
scores = evaluate_sgd(data_normalized,n_folds,l_rate,n_epoch)
print('Scores: %s' % scores)
print('Mean RMSE: %.2f ' % (sum(scores)/float(len(scores))))

Scores: [0.24974462488342228, 0.25566255787632636, 0.26381423125878467]
Mean RMSE: 0.26 
