# KE5108 Hybrid Intelligent Systems - Workshop 1a

## Load libraries

In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from deap import base, creator, tools
from deap.algorithms import eaSimple
import random

## Load data

In [2]:
data_df = pd.read_csv(os.path.join('data', 'WS1Data.csv'), header=1)
data_df.iloc[:5, :8]

Unnamed: 0,Start time,End time,Ad,Start time.1,End time.1,Ad.1,Start time.2,End time.2
0,0.0,0.0,1,7.9,16.6,6,0.0,0.0
1,0.0,0.0,4,14.2,20.1,2,0.0,0.0
2,0.0,0.0,2,0.0,0.0,1,4.6,17.4
3,9.8,16.5,2,17.0,23.9,5,6.8,9.6
4,8.2,12.6,2,5.2,14.3,6,0.0,0.0


In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
Start time      1000 non-null float64
End time        1000 non-null float64
Ad              1000 non-null int64
Start time.1    1000 non-null float64
End time.1      1000 non-null float64
Ad.1            1000 non-null int64
Start time.2    1000 non-null float64
End time.2      1000 non-null float64
Ad.2            1000 non-null int64
Start time.3    1000 non-null float64
End time.3      1000 non-null float64
Ad.3            1000 non-null int64
Start time.4    1000 non-null float64
End time.4      1000 non-null float64
Ad.4            1000 non-null int64
User Clicks     1000 non-null int64
Cost            1000 non-null float64
dtypes: float64(11), int64(6)
memory usage: 132.9 KB


The columns are indexed by website, i.e. each website has three columns start time, end time and Ad. There are a total of 5 websites. Let's reindex the columns.

In [4]:
# Create a multi-index
cols = pd.MultiIndex.from_product([['Website1', 'Website2', 'Website3', 'Website4', 'Website5'], ['Start Time', 'End Time', 'Ad']])
cols

MultiIndex(levels=[['Website1', 'Website2', 'Website3', 'Website4', 'Website5'], ['Ad', 'End Time', 'Start Time']],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], [2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0]])

In [5]:
# Reindex the columns related to the websites.
ws_data_df = data_df.iloc[:, :15].copy()
ws_data_df.columns = cols
ws_data_df.iloc[:5, :12]

Unnamed: 0_level_0,Website1,Website1,Website1,Website2,Website2,Website2,Website3,Website3,Website3,Website4,Website4,Website4
Unnamed: 0_level_1,Start Time,End Time,Ad,Start Time,End Time,Ad,Start Time,End Time,Ad,Start Time,End Time,Ad
0,0.0,0.0,1,7.9,16.6,6,0.0,0.0,4,0.0,0.0,5
1,0.0,0.0,4,14.2,20.1,2,0.0,0.0,5,0.5,18.5,3
2,0.0,0.0,2,0.0,0.0,1,4.6,17.4,5,4.8,11.6,3
3,9.8,16.5,2,17.0,23.9,5,6.8,9.6,4,0.0,0.0,1
4,8.2,12.6,2,5.2,14.3,6,0.0,0.0,5,6.7,12.6,1


In [6]:
data_df.iloc[:5, -2:]

Unnamed: 0,User Clicks,Cost
0,161823,101.4
1,241515,203.0
2,211052,156.8
3,111896,191.9
4,209181,204.2


In [7]:
# Add the last two columns to the reindexed columns.
data_df = pd.concat([ws_data_df, data_df.iloc[:, -2:]], axis=1)
data_df.iloc[:5, -5:]

Unnamed: 0,"(Website5, Start Time)","(Website5, End Time)","(Website5, Ad)",User Clicks,Cost
0,10.2,11.4,3,161823,101.4
1,0.0,0.0,6,241515,203.0
2,0.0,0.0,4,211052,156.8
3,0.0,0.0,6,111896,191.9
4,0.0,0.0,4,209181,204.2


In [8]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
(Website1, Start Time)    1000 non-null float64
(Website1, End Time)      1000 non-null float64
(Website1, Ad)            1000 non-null int64
(Website2, Start Time)    1000 non-null float64
(Website2, End Time)      1000 non-null float64
(Website2, Ad)            1000 non-null int64
(Website3, Start Time)    1000 non-null float64
(Website3, End Time)      1000 non-null float64
(Website3, Ad)            1000 non-null int64
(Website4, Start Time)    1000 non-null float64
(Website4, End Time)      1000 non-null float64
(Website4, Ad)            1000 non-null int64
(Website5, Start Time)    1000 non-null float64
(Website5, End Time)      1000 non-null float64
(Website5, Ad)            1000 non-null int64
User Clicks               1000 non-null int64
Cost                      1000 non-null float64
dtypes: float64(11), int64(6)
memory usage: 132.9 KB


## Linear Regression Model for User Clicks

In [10]:
lin_reg = LinearRegression()
lin_reg.fit(data_df.iloc[:, :15].values, data_df['User Clicks'].values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
lin_reg.coef_

array([ -8383.88859641,   8903.34957429,    900.18678487, -12460.02614985,
        11882.00004993,    975.2971611 , -11389.39279388,  10470.68986321,
         2313.93383714, -10214.05729374,   9978.65317865,   1444.79676413,
        -8477.03412964,   9208.96107779,   1659.04016265])

In [12]:
lin_reg.intercept_

-31240.825142569433

In [16]:
lin_reg_pred = np.ceil(lin_reg.predict(data_df.iloc[:, :15].values))
lin_reg_pred[:5]

array([ 114292.,  241563.,  190599.,  156065.,  201914.])

In [18]:
lin_reg_mse = mean_squared_error(lin_reg_pred, data_df['User Clicks'].values)
lin_reg_rmse = np.sqrt(lin_reg_mse)
lin_reg_r2 = r2_score(lin_reg_pred, data_df['User Clicks'].values)
print('mse = %f\nrmse = %f\nr2 = %f'% (lin_reg_mse, lin_reg_rmse, lin_reg_r2))

mse = 614494028.755000
rmse = 24788.990071
r2 = 0.903908


## Optimisation Using Genetic Algorithm

In [2]:
# Create the types for the chromosomes and population,
creator.create('FitnessMax', base.Fitness, weights=(1.0,))
creator.create('Individual', list, fitness=creator.FitnessMax)

In [7]:
# Number of genes in each chromosome.
IND_SIZE = 15

# Initialise the population.
toolbox = base.Toolbox()
toolbox.register("attr_float", random.random)
toolbox.register("individual", tools.initRepeat, creator.Individual,
                 toolbox.attr_float, n=IND_SIZE)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [8]:
# Evaluation or fitness function
def evaluate(individual):
    return individual.values * lin_reg.coef_ + lin_reg.intercept_

In [9]:
# Operators
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

In [None]:
s = tools.Statistics()
pop, logbook = eaSimple(toolbox.population, toolbox, cxpb=.5, mutpb=.5, ngen=10000, stats=)

check out Statistics, Logbook, Hall-Of-Fame