# Residential Household Data area prediction
[Data Source](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/N3HGRN)

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob

In [6]:
import re


def residential_number(path):
    """Return the house number encoded in a ``Residential_<n>.csv`` file path.

    Only the file name at the end of the path is inspected, so digits that
    appear elsewhere in the path (for example in a parent directory name)
    cannot distort the ordering.

    Raises:
        ValueError: if the path does not end in ``Residential_<n>.csv``.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'Residential_(\d+)\.csv$', path)
    if match is None:
        raise ValueError('Not a Residential_<n>.csv file: ' + path)
    return int(match.group(1))


# Data source location
files = glob.glob('./household_power_consumption/dataverse_files_cleaned/Residential_*.csv')

# sort files in numerical order (a plain string sort would put 10 before 2)
files.sort(key=residential_number)

# split dataset in different dataframes
# each file is a residential; list index 0 holds house number 1
residentials = [pd.read_csv(f, sep=',', low_memory=False) for f in files]

print('Amount of different residentials: ' + str(len(residentials)))

Amount of different residentials: 28


In [7]:
# Preview the first residence: its Area_of_residence column still has to be filled in.
residentials[0].head(5)

Unnamed: 0.1,Unnamed: 0,Date,Time,Latitude,Longitude,House_or_apartment,Area_of_residence,House_is_isolated,Number_of_residents,Heating_type,Solar_panels,Global_active_power
0,0,2012-06-01,01:00:00,49.210722,-126.104206,0,0,0,0,1,0,1.011
1,1,2012-06-01,02:00:00,49.210722,-126.104206,0,0,0,0,1,0,0.451
2,2,2012-06-01,03:00:00,49.210722,-126.104206,0,0,0,0,1,0,0.505
3,3,2012-06-01,04:00:00,49.210722,-126.104206,0,0,0,0,1,0,0.441
4,4,2012-06-01,05:00:00,49.210722,-126.104206,0,0,0,0,1,0,0.468


In [8]:
# House number => house type

# 1 => bungalow
# 2 => duplex
# 3 => modern
# 4 => character
# 5 => modern
# 6 => apartment
# 7 => / (no house type given)
# 8 => character
# 9 => special
# 10 => special
# 11 => duplex
# 12 => apartment
# 13 => special
# 14 => modern
# 15 => bungalow
# 16 => apartment
# 17 => apartment
# 18 => bungalow
# 19 => special
# 20 => character
# 21 => laneway
# 22 => apartment
# 23 => apartment
# 24 => modern
# 25 => character
# 26 => apartment
# 27 => apartment
# 28 => special


# Floor-area range used per house type
# bungalow 65-85 m^2
# duplex 70-100 m^2
# modern 90-270 m^2
# character 70-200 m^2
# apartment 60-93 m^2
# special 70-120 m^2
# laneway 59-83 m^2

In [15]:
import random

# Fixed seed so that Restart & Run All reproduces the same synthetic areas.
AREA_SEED = 42

# Inclusive floor-area range (m^2) to draw from, per house type.
AREA_RANGE_BY_TYPE = {
    'bungalow': (65, 85),
    'duplex': (70, 100),
    'modern': (90, 270),
    'character': (70, 200),
    'apartment': (60, 93),
    'special': (70, 120),
    'laneway': (59, 83),
    '/': (100, 150),  # house 7 has no house type given
}

# House type of every residence in file order: list index 0 is house number 1.
HOUSE_TYPES = [
    'bungalow', 'duplex', 'modern', 'character', 'modern', 'apartment', '/',            # 1-7
    'character', 'special', 'special', 'duplex', 'apartment', 'special', 'modern',      # 8-14
    'bungalow', 'apartment', 'apartment', 'bungalow', 'special', 'character', 'laneway',  # 15-21
    'apartment', 'apartment', 'modern', 'character', 'apartment', 'apartment', 'special',  # 22-28
]

# Fail loudly instead of leaving some residences without an area (or hitting
# an IndexError halfway through) when the number of loaded files changes.
if len(residentials) != len(HOUSE_TYPES):
    raise ValueError(
        'Expected ' + str(len(HOUSE_TYPES)) + ' residentials, got ' + str(len(residentials))
    )

# A dedicated generator keeps the draws reproducible without touching the
# global `random` state that other cells may rely on.
area_rng = random.Random(AREA_SEED)

for residential, house_type in zip(residentials, HOUSE_TYPES):
    low, high = AREA_RANGE_BY_TYPE[house_type]
    # One draw per residence; the scalar is broadcast to every row of the frame.
    residential['Area_of_residence'] = area_rng.randint(low, high)

residentials[0].head()

Unnamed: 0.1,Unnamed: 0,Date,Time,Latitude,Longitude,House_or_apartment,Area_of_residence,House_is_isolated,Number_of_residents,Heating_type,Solar_panels,Global_active_power
0,0,2012-06-01,01:00:00,49.210722,-126.104206,0,83,0,0,1,0,1.011
1,1,2012-06-01,02:00:00,49.210722,-126.104206,0,83,0,0,1,0,0.451
2,2,2012-06-01,03:00:00,49.210722,-126.104206,0,83,0,0,1,0,0.505
3,3,2012-06-01,04:00:00,49.210722,-126.104206,0,83,0,0,1,0,0.441
4,4,2012-06-01,05:00:00,49.210722,-126.104206,0,83,0,0,1,0,0.468


In [16]:
import os

# Write every enriched residence back to disk as Residential_<house number>.csv.
enriched_dir = './household_power_consumption/dataverse_files_enriched'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(enriched_dir, exist_ok=True)

for i, _df in enumerate(residentials):
    # List index 0 holds house number 1, hence i + 1 in the file name.
    filename = enriched_dir + '/Residential_' + str(i+1) + '.csv'
    print(filename)
    # index=False: otherwise the row index is written as an extra unnamed
    # column, which is how 'Unnamed: 0' / 'Unnamed: 0.1' piled up in the input.
    _df.to_csv(filename, index=False)

./household_power_consumption/dataverse_files_enriched/Residential_1.csv
./household_power_consumption/dataverse_files_enriched/Residential_2.csv
./household_power_consumption/dataverse_files_enriched/Residential_3.csv
./household_power_consumption/dataverse_files_enriched/Residential_4.csv
./household_power_consumption/dataverse_files_enriched/Residential_5.csv
./household_power_consumption/dataverse_files_enriched/Residential_6.csv
./household_power_consumption/dataverse_files_enriched/Residential_7.csv
./household_power_consumption/dataverse_files_enriched/Residential_8.csv
./household_power_consumption/dataverse_files_enriched/Residential_9.csv
./household_power_consumption/dataverse_files_enriched/Residential_10.csv
./household_power_consumption/dataverse_files_enriched/Residential_11.csv
./household_power_consumption/dataverse_files_enriched/Residential_12.csv
./household_power_consumption/dataverse_files_enriched/Residential_13.csv
./household_power_consumption/dataverse_files_e