In [1]:
import time
import random
import pandas as pd
import numpy as np
import copy

from sklearn.metrics import accuracy_score

### Load the data from the dataframe

In [2]:
dataframe = pd.read_csv('./train.csv')
testframe = pd.read_csv('./test.csv')

In [3]:
print(dataframe.shape)
print(testframe.shape)

(1460, 81)
(1459, 80)


In [4]:
dataframe.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
dataframe.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
enc = LabelEncoder()

for i in dataframe.columns:
  if dataframe.dtypes[i] == 'O':
    dataframe[i] = enc.fit_transform(dataframe[i])

dataframe.head()

for i in testframe.columns:
  if testframe.dtypes[i] == 'O':
    testframe[i] = enc.fit_transform(testframe[i])

In [8]:
nullCols = dataframe.isnull().sum()
nullCols[nullCols != 0]

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

In [9]:
dataframe.fillna(method= 'ffill', inplace= True)
testframe.fillna(method= 'ffill', inplace= True)

In [10]:
dataframe

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,2,3,3,0,...,0,3,4,4,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,2,3,3,0,...,0,3,4,4,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,2,0,3,0,...,0,3,4,4,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,2,0,3,0,...,0,3,4,4,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,2,0,3,0,...,0,3,4,4,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,1,2,3,3,0,...,0,3,4,4,0,8,2007,8,4,175000
1456,1457,20,3,85.0,13175,1,2,3,3,0,...,0,3,2,4,0,2,2010,8,4,210000
1457,1458,70,3,66.0,9042,1,2,3,3,0,...,0,3,0,2,2500,5,2010,8,4,266500
1458,1459,20,3,68.0,9717,1,2,3,3,0,...,0,3,4,4,0,4,2010,8,4,142125


In [11]:
nullCols = testframe.isnull().sum()
nullCols[nullCols != 0]

Series([], dtype: int64)

These functions are used to print the unique values of every feature, and the number of missing values for each feature. The loaded dataset is already preprocessed, so all the values are numbers between 0 and 1 and there are no missing values.

### Define functions to be used for the Genetic Algorithm

In [12]:
dataframe.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [13]:
from sklearn.model_selection import train_test_split


def generate_dataframes_for_training(dataframe, split_frac=0.8):
    """
    Generates training and testing dataframes from a complete dataframe, according to the split_frac parameter
    """
    print(dataframe.columns)
    X = dataframe.drop(['SalePrice'], axis= 1)
    y = dataframe['SalePrice']

    return train_test_split(X, y, train_size= split_frac, random_state=52)
    
    # return X_train, X_test, y_train, y_test

In [14]:

def generate_random_individuals(num_individuals, num_features, max_features=None, verbose=False):
    """
    Randomly generates individuals

    The number of individuals to generate is given by the num_individuals parameter
    The length of each individual is equal to the num_features parameter
    The maximum number of active features for every individual is given by the max_features parameter
    """
    if verbose: print('GENERATING RANDOM INDIVIDUALS.... ')
        
    individuals = list()
    
    for _ in range(num_individuals):
        individual = ''
        for col in range(num_features):
            # For each char in the individual, a 1 or a 0 is randomly generated
            if individual.count('1') == max_features:
                individual += '0'
                continue
                
            individual += str(random.randint(0, 1))
            
        if verbose: print(f'Genrated a new indivudal: {individual}')
        individuals.append(individual)
        
    if verbose: print(f'Generated list of {num_individuals} individuals: {individuals}')
        
    return individuals

In [15]:
def get_weights(population):
    """
    Calculate weights from the population filled with the accuracies
    """
    total_accuracies = 0
    new_population = []
    
    # Get the sum of all accuracies of the population
    for individual in population:
        total_accuracies += individual[1]
        
    # For each individual, calculate its weight by dividing its accuracy by the overall sum calculated above
    for individual in population:
        weight = individual[1]/total_accuracies
        # Store the individual and its weight in the final population list
        new_population.append((individual[0], float(weight*100)))
        
    return new_population



def get_fitness_func(individual, dataframe, verbose=False):
    """
    Calculate accuracy for the individual passed as parameter.
    Both the dataframe and the y_data parameters are used for training and evaluating the model.
    """
    if verbose: print('Calculating accuracy for individual ', individual)

    from sklearn.linear_model import LinearRegression

    X_train, X_test, y_train, y_test = generate_dataframes_for_training(dataframe)

    X_train = X_train.loc[:, [True if individual[i] == '1' else False for i in range(len(individual))]]
    X_test = X_test.loc[:, [True if individual[i] == '1' else False for i in range(len(individual))]]  
    
    model = LinearRegression()
    model.fit(X_train, y_train)

    return model.score(X_test, y_test)




def fill_population(individuals, dataframe, verbose=False):
    """
    Fills the population list with individuals and their weights
    """
    population = list()
    
    for individual in individuals:
        
        # Get the value of the fitness function (accuracy of the model)
        if verbose: print(f'Calculating fitness function value for individual {individual}')
        accuracy = get_fitness_func(individual, dataframe, verbose)
        
        # Check that the value is not the goal state (in this case, an accuracy of 80% is a terminal state)
        if float(accuracy) > 0.87:
            if verbose: print(f'Goal state found for individual {individual}')
            return individual
            
        individual_complete = (individual, accuracy)
        population.append(individual_complete)
        
    # The final population list is created, which contains each individual together with its weight
    # (weights will be used in the reproduction step)
    new_population = get_weights(population)
    if verbose: print(f'Generated population list (with weights): {new_population}')
    
    return new_population



def choose_parents(population):
    """
    From the population, weighting the probabilities of an individual being chosen via the fitness
    function, takes randomly two individual to reproduce
    Population is a list of tuples, where the first element is the individual and the second
    one is the probability associated to it.
    To avoid generating repeated individuals, 'counter' parameter is used to pick parents in different ways, thus
    generating different individuals
    """
    probabilities = (individual[1] for individual in population)
    individuals = [individual[0] for individual in population]
    parent_1, parent_2 = random.choices(individuals, weights=probabilities, k=2)
    
    return [parent_1, parent_2]


  
def mutate(child, prob=0.2):
    """
    Randomly mutates an individual according to the probability given by prob parameter
    """
    new_child = copy.deepcopy(child)
    for i, char in enumerate(new_child):
        if random.random() < prob:
            new_value = '1' if char == '0' else '0'
            new_child = new_child[:i] + new_value + new_child[i+1:]
    
    return new_child


  
def reproduce(individual_1, individual_2):
    """
    Takes 2 individuals, and combines their information based on a
    randomly chosen crosspoint.
    Each reproduction returns 2 new individuals
    """ 
    # Randomly generate a integer between 1 and the length of the individuals minus one, which will be the crosspoint
    crosspoint = random.randint(1, len(individual_1)-1)
    child_1 = individual_1[:crosspoint] + individual_2[crosspoint:]
    child_2 = individual_2[:crosspoint] + individual_1[crosspoint:]
    child_1, child_2 = mutate(child_1), mutate(child_2)
 
    return [child_1, child_2]


  
def generation_ahead(population, verbose=False):
    """
    Reproduces all the steps for choosing parents and making 
    childs, which means creating a new generation to iterate with
    """
    new_population = list()
    
    for _ in range(int(len(population)//2)):      
        # According to the weights calculated before, choose a set of parents to reproduce
        parents = choose_parents(population)
        if verbose: print(f'Parents chosen: {parents}')
          
        # Reproduce the pair of individuals chose above to generate two new individuals
        childs = reproduce(parents[0], parents[1])
        if verbose: print(f'Generated children: {childs}\n')
        new_population += childs
        
    return new_population



def main_loop(ind_num, dataframe, max_iter=5, verbose=False):
    """
    Performs all the steps of the Genetic Algorithm
    1. Generate random population
    2. Fill population with the weights of each individual
    3. Check if the goal state is reached
    4. Reproduce the population, and create a new generation
    5. Repeat process until termination condition is met
    """
    print(dataframe.shape)
    # Generate individuals (returns a list of strings, where each str represents an individual)
    individuals = generate_random_individuals(ind_num, dataframe.shape[1]-1, dataframe.shape[1]-1, verbose)
    
    # Returns a list of tuples, where each tuple represents an individual and its weight
    population = fill_population(individuals, dataframe, verbose)
    
    # Check if a goal state is reached
    # When goal state is reached, fill_population() function returns a str, otherwise continue
    if isinstance(population, str):
        return population
        
    # Reproduce current generation to generate a better new one
    new_generation = generation_ahead(population, verbose)
    
    # After the new generation is generated, the loop goes on until a solution is found or until the maximum number of
    # iterations are reached
    iteration_count = 0
    while iteration_count < max_iter:
        if verbose: print(f'\n\n\nITERATION NUMBER {iteration_count+1} (Iteration max = {max_iter+1})\n\n\n')
        population = fill_population(new_generation, dataframe, verbose)
        
        # Check if a goal state is reached
        if isinstance(population, str):
            break
        
        new_generation = generation_ahead(population, verbose)   
        iteration_count += 1
        
    return population

## Create and train the model with the complete dataset

In [16]:
dataframe.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


X = dataframe.drop(['SalePrice'], axis= 1)
y = dataframe['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 0.8, random_state=52)

print(X.shape, y.shape)


model = LinearRegression()
model.fit(X_train, y_train)

model.score(X_test, y_test)

(1460, 80) (1460,)


0.8261490228009716

## Create and train the model with the optimized dataset

### First, the relevant features must be selected by applying the GA

In [18]:
start = time.time()
exp_df = pd.read_csv("./processed_data.csv")
final_population = main_loop(100, dataframe, max_iter= 10000, verbose=False)
print('Time elapsed for executing the recursive GA: ', time.time() - start, ' seconds')

(1460, 81)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 

In [19]:
final_population, final_population.count('1')

('00101011111000010101000111100110101000110111110010011100010010111010001100101001',
 40)

### Finally, leave only the relevant features and train the model

In [20]:
optimized_dataframe = dataframe.loc[:, [True if char == '1' else False for char in final_population+'0']]
optimized_dataframe['SalePrice'] = dataframe['SalePrice']
optimized_dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  optimized_dataframe['SalePrice'] = dataframe['SalePrice']


Unnamed: 0,MSZoning,LotArea,Alley,LotShape,LandContour,Utilities,LotConfig,BldgType,OverallQual,YearBuilt,...,GarageArea,GarageQual,GarageCond,WoodDeckSF,ScreenPorch,PoolArea,MiscFeature,MoSold,SaleCondition,SalePrice
0,3,8450,2,3,3,0,4,0,7,2003,...,548,4,4,0,0,0,4,2,4,208500
1,3,9600,2,3,3,0,2,0,6,1976,...,460,4,4,298,0,0,4,5,4,181500
2,3,11250,2,0,3,0,4,0,7,2001,...,608,4,4,0,0,0,4,9,4,223500
3,3,9550,2,0,3,0,0,0,7,1915,...,642,4,4,0,0,0,4,2,0,140000
4,3,14260,2,0,3,0,2,0,8,2000,...,836,4,4,192,0,0,4,12,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,3,7917,2,3,3,0,4,0,6,1999,...,460,4,4,0,0,0,4,8,4,175000
1456,3,13175,2,3,3,0,4,0,6,1978,...,500,4,4,349,0,0,4,2,4,210000
1457,3,9042,2,3,3,0,4,0,7,1941,...,252,4,4,0,0,0,2,5,4,266500
1458,3,9717,2,3,3,0,4,0,5,1950,...,240,4,4,366,0,0,4,4,4,142125


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


X = optimized_dataframe.drop(['SalePrice'], axis= 1)
y = optimized_dataframe['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 0.8, random_state=52)

print(X.shape, y.shape)


model = LinearRegression()
model.fit(X_train, y_train)

model.score(X_test, y_test)

(1460, 40) (1460,)


0.8749700365253512