# Data Wrangling

Here we perform data wrangling in preparation for the following apartment price analysis and model prediction.

In [1]:
import pandas as pd

## 1. First-layer info

In [128]:
df = pd.read_csv('hemnet.csv')
df = df[df['Types'] == 'Lägenhet']
df = df.drop(columns=['Types', 'Links',])
df.columns = ['Addresses', 'Area', 'RoomCount', 'Avgift', 'SoldDate', 'Prices']
df.head()

Unnamed: 0,Addresses,Area,RoomCount,Avgift,SoldDate,Prices
0,Flormansgatan 2A,43,1.5,2767.0,30 september 2021,2370
1,Kastanjegatan 19F,34,2.0,2415.0,30 september 2021,1745
2,Karl XI gatan 47,874,3.0,5787.0,30 september 2021,4700
4,Margaretavägen 3K,78,3.0,4584.0,30 september 2021,2750
5,Qvantenborgsvägen 4B,59,2.0,3125.0,29 september 2021,2250


In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1972 entries, 0 to 2498
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Addresses  1972 non-null   object 
 1   Area       1972 non-null   object 
 2   RoomCount  1972 non-null   float64
 3   Avgift     1972 non-null   float64
 4   SoldDate   1972 non-null   object 
 5   Prices     1972 non-null   int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 107.8+ KB


No missing values.

### 1.1 Area

In [130]:
# We first clean the values contain '+' sign by removing the number after it. 
# By comparing the sold price and price/m², seems these numbers are not counted. 

irregular_values = df[df['Area'].str.contains('+', regex=False)]['Area']
irregular_values.head()

69       60 + 20
91       44 + 20
154    48,4 + 20
485    75,5 + 21
958      89 + 50
Name: Area, dtype: object

In [131]:
# Remove '+' sign

values = irregular_values.str.split('+').str[0]
values.head()

69       60 
91       44 
154    48,4 
485    75,5 
958      89 
Name: Area, dtype: object

In [132]:
# Replace those irregular values with the correct ones
irregular_index = df[df['Area'].str.contains('+', regex=False)].index.values

for ind in irregular_index:
    df['Area'] = df['Area'].replace(df['Area'][ind], values[ind])

# We also replace comma with period
df['Area'] = df['Area'].str.replace(',','.')

df.head()

Unnamed: 0,Addresses,Area,RoomCount,Avgift,SoldDate,Prices
0,Flormansgatan 2A,43.0,1.5,2767.0,30 september 2021,2370
1,Kastanjegatan 19F,34.0,2.0,2415.0,30 september 2021,1745
2,Karl XI gatan 47,87.4,3.0,5787.0,30 september 2021,4700
4,Margaretavägen 3K,78.0,3.0,4584.0,30 september 2021,2750
5,Qvantenborgsvägen 4B,59.0,2.0,3125.0,29 september 2021,2250


### 1.2 Dates

In [133]:
# Change the format of sold dates

Dates = df['SoldDate']
Dates=Dates.str.replace(' januari ','/01/').str.replace(' februari ','/02/').str.replace(' mars ','/03/').str.replace(' april ','/04/').str.replace(' maj ','/05/').str.replace(' juni ','/06/').str.replace(' juli ','/07/').str.replace(' augusti ','/08/').str.replace(' september ','/09/').str.replace(' oktober ','/10/').str.replace(' november ','/11/').str.replace(' december ','/12/')
Dates.head()

0    30/09/2021
1    30/09/2021
2    30/09/2021
4    30/09/2021
5    29/09/2021
Name: SoldDate, dtype: object

In [134]:
Dates = pd.to_datetime(Dates)
df['SoldDate'] = Dates.values
df.head()

Unnamed: 0,Addresses,Area,RoomCount,Avgift,SoldDate,Prices
0,Flormansgatan 2A,43.0,1.5,2767.0,2021-09-30,2370
1,Kastanjegatan 19F,34.0,2.0,2415.0,2021-09-30,1745
2,Karl XI gatan 47,87.4,3.0,5787.0,2021-09-30,4700
4,Margaretavägen 3K,78.0,3.0,4584.0,2021-09-30,2750
5,Qvantenborgsvägen 4B,59.0,2.0,3125.0,2021-09-29,2250


### 1.3 Re-order

Reorder the rows as the increasing sold prices

In [135]:
df = df.sort_values(['Prices'], ascending=1)
df.head()

Unnamed: 0,Addresses,Area,RoomCount,Avgift,SoldDate,Prices
565,Veberödsvägen 22C,23.0,1.0,1287.0,2021-04-09,750
319,Idalavägen 47 f,50.0,2.0,4011.0,2020-10-21,795
2007,Allégatan 3F,23.5,1.0,1836.0,2020-11-15,800
259,Horstgatan 4H,31.0,1.0,1770.0,2020-10-29,810
2411,Allégatan 3F,23.5,1.0,1786.0,2021-03-01,810


## 2. Second-layer info (info contained in the individual links)

In [52]:
snd_data = pd.read_csv('unprocessed_sndlayer_info.csv')
snd_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971
0,32609kr/m²,15900kr/m²,34043kr/m²,26129kr/m²,34468kr/m²,19318kr/m²,19000kr/m²,31847kr/m²,16667kr/m²,28514kr/m²,...,63194kr/m²,44498kr/m²,44498kr/m²,64901kr/m²,40204kr/m²,63368kr/m²,63905kr/m²,69410kr/m²,69410kr/m²,85124kr/m²
1,695000kr,795000kr,725000kr,795000kr,750000kr,850000kr,975000kr,950000kr,1050000kr,895000kr,...,7995000kr,9300000kr,9300000kr,6995000kr,8500000kr,9500000kr,9500000kr,10000000kr,10000000kr,11600000kr
2,+55000 kr (+8%),,+75000 kr (+10%),+15000 kr (+2%),+60000 kr (+8%),,-25000 kr (-3%),+50000 kr (+5%),,+160000 kr (+18%),...,"+1,11milj. kr (+14%)",,,"+2,81milj. kr (+40%)","+1,35milj. kr (+16%)",+1milj. kr (+11%),"+1,3milj. kr (+14%)","+1,05milj. kr (+11%)","+1,05milj. kr (+11%)",+62000 kr (+1%)
3,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,...,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet,Lägenhet
4,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,...,Bostadsrätt,Bostadsrätt,Bostadsrätt,Andel i bostadsförening,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt,Bostadsrätt


### 2.1 Asking price

In [143]:
asking_price = snd_data.iloc[1].str.replace('kr', '')
asking_price = asking_price.astype(float)/1000
asking_price.head()

0    695.0
1    795.0
2    725.0
3    795.0
4    750.0
Name: 1, dtype: float64

### 2.2 Build year

In [144]:
build_year = [None for _ in range(1972)]

for i in range(1972):
    snd_data_col = snd_data[str(i)]
    for element in snd_data_col:
        # here we need to check if the element is string because we set asking_price data type
        # to be float which does not have length. 
        if type(element)==str and len(element) == 4:
            build_year[i] = int(element)
            
build_year[:6]

[1956, 2004, 1957, 2018, 1957, 1953]

### 2.3 If there's balcony and/or patio

In [145]:
is_balcony_count = [None for _ in range(1972)]
for i in range(1972):
    snd_data_col = snd_data[str(i)]
    Nej_count = list(snd_data_col).count('Nej')
    Ja_count = list(snd_data_col).count('Ja')
    is_balcony_count[i] = Nej_count + Ja_count

print('# of apartment provide neither patio nor balcony info: ', is_balcony_count.count(0))
print('# of apartment provide both patio and balcony info: ', is_balcony_count.count(2))
print('# of apartment only balcony info: ', is_balcony_count.count(1))

# of apartment provide neither patio nor balcony info:  242
# of apartment provide both patio and balcony info:  210
# of apartment only balcony info:  1520


In [146]:
balcony_index = [i for i, e in enumerate(is_balcony_count) if e == 1]
balcony_patio_index = [i for i, e in enumerate(is_balcony_count) if e == 2]

In [147]:
is_balcony = [None for _ in range(1972)]
is_patio = [None for _ in range(1972)]

# For the links which only provide balcony info, there is either 'Ja' or 'Nej'

for ind in balcony_index:
    snd_data_col = snd_data[str(ind)]
    for element in snd_data_col:
        if element == 'Ja' or element == 'Nej':
            is_balcony[ind] = element

for ind in balcony_patio_index:
    snd_data_col = snd_data[str(ind)]
    is_balcony[ind] = snd_data_col[7]
    is_patio[ind] = snd_data_col[8]

print(is_balcony[0:10])
print(is_patio[5:15])

['Nej', None, None, None, None, None, 'Ja', 'Nej', None, None]
[None, None, None, None, None, 'Ja', None, None, None, None]


### 2.4 Floor number/total floor/elevator

In [63]:
floor_elevator = [None for _ in range(1972)]
floor_number = [None for _ in range(1972)]
total_floor = [None for _ in range(1972)]
is_elevator = [None for _ in range(1972)]

for i in range(1972):
    snd_data_col = snd_data[str(i)]
    for element in snd_data_col:
        if type(element)==str and element.count('av') == 1:
            floor_elevator[i] = element

for i in range(1972):
    if floor_elevator[i]:
        element = floor_elevator[i].split(', ')
        
        if len(element) == 2:
            is_elevator[i] = element[1]
            floor_info = element[0].split('av')
            floor_number[i] = floor_info[0].strip()
            total_floor[i] = floor_info[1].strip()
        elif len(element) == 1:
            floor_info = element[0].split('av')
            floor_number[i] = floor_info[0].strip()
            total_floor[i] = floor_info[1].strip()

is_elevator = list(pd.Series(is_elevator).replace('hiss finns ej','No').replace('hiss finns','Yes'))

print(floor_elevator[:5])
print(is_elevator[:5])
print(floor_number[:5])
print(total_floor[:5])


['1 av 3, hiss finns ej', '2 av 2, hiss finns ej', '2 av 2, hiss finns ej', '2 av 2', '1 av 2, hiss finns ej']
['No', 'No', 'No', None, 'No']
['1', '2', '2', '2', '1']
['3', '2', '2', '2', '2']


### 2.5 Agents/Firm

In [148]:
agent_data = pd.read_csv('agent.csv')
agent_data.head()

Unnamed: 0,Agents,Agencies
0,Karin Ekström,Erik Olsson Fastighetsförmedling
1,Rickard Saltin,Fastighetsbyrån Lund
2,,Fastighetsbyrån Lund
3,,Svensk Fastighetsförmedling Lund
4,,Fastighetsbyrån Lund


## Merge into one CSV file

In [151]:
df_final = pd.concat([df.reset_index(), agent_data], axis=1)
df_final = df_final.assign(**{'Asking': asking_price.values, 'Balcony': is_balcony, 'Patio': is_patio,'BuildYear': build_year, 'FloorNumber': floor_number, 'TotalFloor': total_floor, 'Elevator': is_elevator})
df_final.rename(columns = {'Agents':'Broker', 'Agencies':'Firm'}, inplace = True)
df_final.head()


Unnamed: 0,index,Addresses,Area,RoomCount,Avgift,SoldDate,Prices,Broker,Firm,Asking,Balcony,Patio,BuildYear,FloorNumber,TotalFloor,Elevator
0,565,Veberödsvägen 22C,23.0,1.0,1287.0,2021-04-09,750,Karin Ekström,Erik Olsson Fastighetsförmedling,695.0,Nej,,1956.0,1,3,No
1,319,Idalavägen 47 f,50.0,2.0,4011.0,2020-10-21,795,Rickard Saltin,Fastighetsbyrån Lund,795.0,,,2004.0,2,2,No
2,2007,Allégatan 3F,23.5,1.0,1836.0,2020-11-15,800,,Fastighetsbyrån Lund,725.0,,,1957.0,2,2,No
3,259,Horstgatan 4H,31.0,1.0,1770.0,2020-10-29,810,,Svensk Fastighetsförmedling Lund,795.0,,,2018.0,2,2,
4,2411,Allégatan 3F,23.5,1.0,1786.0,2021-03-01,810,,Fastighetsbyrån Lund,750.0,,,1957.0,1,2,No


In [152]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1972 entries, 0 to 1971
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   index        1972 non-null   int64         
 1   Addresses    1972 non-null   object        
 2   Area         1972 non-null   object        
 3   RoomCount    1972 non-null   float64       
 4   Avgift       1972 non-null   float64       
 5   SoldDate     1972 non-null   datetime64[ns]
 6   Prices       1972 non-null   int64         
 7   Broker       1907 non-null   object        
 8   Firm         1972 non-null   object        
 9   Asking       1970 non-null   float64       
 10  Balcony      1730 non-null   object        
 11  Patio        210 non-null    object        
 12  BuildYear    1676 non-null   float64       
 13  FloorNumber  1751 non-null   object        
 14  TotalFloor   1751 non-null   object        
 15  Elevator     1667 non-null   object        
dtypes: dat

In [153]:
df_final.to_csv('apartment.csv')