In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Load the housing data from 'Housing.csv' (path is relative to the kernel's working directory).
house_df = pd.read_csv('Housing.csv')

In [11]:
house_df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,basement bool,total bad & bath
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,False,6
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,False,8
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,True,5
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,True,6
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,True,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished,True,3
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished,False,4
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished,False,3
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished,False,4


Below we create a new column that is simply a boolean version of an existing column.

In [7]:
# Boolean flag: True where the 'basement' column holds the string 'yes'.
house_df['basement bool'] = house_df['basement'].eq('yes')

In [8]:
house_df['basement bool']

0      False
1      False
2       True
3       True
4       True
       ...  
540     True
541    False
542    False
543    False
544    False
Name: basement bool, Length: 545, dtype: bool

In [9]:
# Row-wise total of bedrooms and bathrooms.
# NOTE(review): 'bad' in the column key looks like a typo for 'bed'; the key is
# kept unchanged here because other cells may refer to it.
house_df['total bad & bath'] = house_df['bedrooms'].add(house_df['bathrooms'])

In [10]:
house_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,basement bool,total bad & bath
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,False,6
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,False,8
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,True,5
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,True,6
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,True,5


Let's say we want to create a new column that flags whether a house has all three of: a basement, air conditioning, and a guest room.

In [12]:
# Demonstration of a failing approach: Python expands the chained comparison
# `a == b == c` into `(a == b) and (b == c)`. The `and` needs a single truth
# value from each element-wise result Series, which raises the ValueError shown below.
house_df['all three'] = (house_df['basement'] == house_df['guestroom'] == house_df['airconditioning']) == 'yes'

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [13]:
# Boolean flag: True where the 'guestroom' column holds the string 'yes'.
house_df['guestroom bool'] = house_df['guestroom'].eq('yes')

In [14]:
# Boolean flag: True where the 'airconditioning' column holds the string 'yes'.
house_df['airconditioning bool'] = house_df['airconditioning'].eq('yes')

In [15]:
# Show the kernel's current working directory (bare `pwd` relies on IPython automagic).
# NOTE(review): the saved output of this cell exposes a local user path; consider
# clearing it before sharing the notebook.
pwd

'C:\\Users\\wedwa\\OneDrive\\Documents\\5th_wave_cantek'

In [16]:
# Element-wise equality of two flags: True wherever they match (both True or
# both False). The 'basement bool' column is not involved in this expression.
house_df['all three'] = house_df['airconditioning bool'].eq(house_df['guestroom bool'])

The above code does the wrong thing: it doesn't consider all three columns (basement is ignored), and it evaluates to True whenever the two compared columns are equal, including when both are False.

In [18]:
# Wrapping the expression in np.where does not help: the chained `==` is
# evaluated first (as `(a == b) and (b == c)`) and raises the same ValueError
# before np.where is ever called.
house_df['all three'] = np.where((house_df['basement'] == house_df['guestroom'] == house_df['airconditioning']) == 'yes', True, False)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [19]:
# Recode the 'basement' column to 1 where it equals 'yes' and 0 otherwise.
# Caution: this overwrites the source column, so running the cell a second time
# compares integers with 'yes' and sets every row to 0.
house_df['basement'] = house_df['basement'].eq('yes').astype(int)

In [20]:
house_df['basement']

0      0
1      0
2      1
3      1
4      1
      ..
540    1
541    0
542    0
543    0
544    0
Name: basement, Length: 545, dtype: int32

In [21]:
# Recode the 'guestroom' column to 1 where it equals 'yes' and 0 otherwise.
# Caution: this overwrites the source column, so running the cell a second time
# compares integers with 'yes' and sets every row to 0.
house_df['guestroom'] = house_df['guestroom'].eq('yes').astype(int)

In [22]:
# Recode the 'airconditioning' column to 1 where it equals 'yes' and 0 otherwise.
# Caution: this overwrites the source column, so running the cell a second time
# compares integers with 'yes' and sets every row to 0.
house_df['airconditioning'] = house_df['airconditioning'].eq('yes').astype(int)

In [23]:
house_df['airconditioning']

0      1
1      1
2      0
3      1
4      1
      ..
540    0
541    0
542    0
543    0
544    0
Name: airconditioning, Length: 545, dtype: int32

In [34]:
# True where the three columns add up to 3. With the 0/1 recoding applied in the
# earlier cells, that means the house has a basement, a guest room and air conditioning.
house_df['all three'] = (
    house_df['basement'] + house_df['guestroom'] + house_df['airconditioning']
).eq(3)

In [35]:
house_df['all three']

0      False
1      False
2      False
3      False
4       True
       ...  
540    False
541    False
542    False
543    False
544    False
Name: all three, Length: 545, dtype: bool

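The below won't work with boolean columns because of how they behave in pandas: `+` on boolean Series acts as an element-wise logical OR (the result stays boolean) rather than counting the True values, so the "sum" can never equal 3.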

In [29]:
# `+` between the boolean columns yields another boolean Series (True when ANY of
# the flags is True, per the In [33] output), and np.where(mask, True, False)
# simply copies that mask.
# NOTE(review): the saved output below (all False in the rows shown) does not match
# that; the cell was probably edited after it was last run. Re-run to confirm.
house_df['all three'] = np.where((house_df['basement bool'] + house_df['guestroom bool'] + house_df['airconditioning bool']), True, False)

In [31]:
house_df['all three']

0      False
1      False
2      False
3      False
4      False
       ...  
540    False
541    False
542    False
543    False
544    False
Name: all three, Length: 545, dtype: bool

In [32]:
house_df.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,basement bool,total bad & bath,guestroom bool,airconditioning bool,all three
0,13300000,7420,4,2,3,yes,0,0,no,1,2,yes,furnished,False,6,False,True,False
1,12250000,8960,4,4,4,yes,0,0,no,1,3,no,furnished,False,8,False,True,False
2,12250000,9960,3,2,2,yes,0,1,no,0,2,yes,semi-furnished,True,5,False,False,False
3,12215000,7500,4,2,2,yes,0,1,no,1,3,yes,furnished,True,6,False,True,False
4,11410000,7420,4,1,2,yes,1,1,no,1,2,no,furnished,True,5,True,True,False
5,10850000,7500,3,3,1,yes,0,1,no,1,2,yes,semi-furnished,True,6,False,True,False
6,10150000,8580,4,3,4,yes,0,0,no,1,2,yes,semi-furnished,False,7,False,True,False
7,10150000,16200,5,3,2,yes,0,0,no,0,0,no,unfurnished,False,8,False,False,False
8,9870000,8100,4,1,2,yes,1,1,no,1,2,yes,furnished,True,5,True,True,False
9,9800000,5750,3,2,4,yes,1,0,no,1,1,yes,unfurnished,False,5,True,True,False


In [33]:
# Add the three boolean flag columns element-wise and store the result, then
# preview the first ten rows. The result stays boolean: it is True whenever at
# least one flag is True, not only when all three are.
house_df['all three'] = (
    house_df['basement bool']
    .add(house_df['guestroom bool'])
    .add(house_df['airconditioning bool'])
)
house_df.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,basement bool,total bad & bath,guestroom bool,airconditioning bool,all three
0,13300000,7420,4,2,3,yes,0,0,no,1,2,yes,furnished,False,6,False,True,True
1,12250000,8960,4,4,4,yes,0,0,no,1,3,no,furnished,False,8,False,True,True
2,12250000,9960,3,2,2,yes,0,1,no,0,2,yes,semi-furnished,True,5,False,False,True
3,12215000,7500,4,2,2,yes,0,1,no,1,3,yes,furnished,True,6,False,True,True
4,11410000,7420,4,1,2,yes,1,1,no,1,2,no,furnished,True,5,True,True,True
5,10850000,7500,3,3,1,yes,0,1,no,1,2,yes,semi-furnished,True,6,False,True,True
6,10150000,8580,4,3,4,yes,0,0,no,1,2,yes,semi-furnished,False,7,False,True,True
7,10150000,16200,5,3,2,yes,0,0,no,0,0,no,unfurnished,False,8,False,False,False
8,9870000,8100,4,1,2,yes,1,1,no,1,2,yes,furnished,True,5,True,True,True
9,9800000,5750,3,2,4,yes,1,0,no,1,1,yes,unfurnished,False,5,True,True,True


Normally Booleans in Python behave like so....

In [27]:
False == False

True

In [28]:
True+True

2

In [29]:
True == 1

True

In [30]:
True == 'a'

False

In [31]:
False == 0

True

In [32]:
(True+True)**2

4

In [33]:
True + True

2

In [34]:
True*False

0

In [36]:
# Price per unit of area, computed row by row.
house_df['price to area'] = house_df['price'].div(house_df['area'])

In [37]:
house_df['price to area']

0      1792.452830
1      1367.187500
2      1229.919679
3      1628.666667
4      1537.735849
          ...     
540     606.666667
541     736.312500
542     483.425414
543     601.374570
544     454.545455
Name: price to area, Length: 545, dtype: float64

In [38]:
# Bedrooms + bathrooms + guestroom per row.
# Assumes 'guestroom' has already been recoded to 0/1 by the earlier cell, so it
# contributes 1 for houses that have a guest room.
house_df['room sum'] = (
    house_df['bedrooms']
    .add(house_df['bathrooms'])
    .add(house_df['guestroom'])
)

In [39]:
house_df['room sum']

0      6
1      8
2      5
3      6
4      6
      ..
540    3
541    4
542    3
543    4
544    4
Name: room sum, Length: 545, dtype: int64

In [40]:
# Label each house by area using an inline lambda: strictly above 5000 is
# 'large', everything else is 'small'.
house_df['area_evaluation'] = house_df['area'].map(
    lambda area: 'large' if area > 5000 else 'small'
)

In [41]:
house_df['area_evaluation']

0      large
1      large
2      large
3      large
4      large
       ...  
540    small
541    small
542    small
543    small
544    small
Name: area_evaluation, Length: 545, dtype: object

More lambda....

In [56]:
# Keep only the odd numbers, using filter() with an anonymous predicate.
li = [5, 7, 22, 97, 54, 62, 77, 23, 73, 61]

final_list = [*filter(lambda value: value % 2 != 0, li)]
print(final_list)

 


[5, 7, 97, 77, 23, 73, 61]


In [63]:
li = [5, 7, 22, 97, 54, 62, 77, 23, 73, 61]


def return_odd(x):
    """Return True if ``x`` is odd, False otherwise.

    Used as the predicate for ``filter``: it is the named-function equivalent
    of ``lambda x: x % 2 != 0``.
    """
    # A single definition only: an earlier duplicate `def return_odd` (which
    # returned the number itself or None) was immediately shadowed by this one
    # and never ran.
    return x % 2 != 0


final_list = list(filter(return_odd, li))
print(final_list)


[5, 7, 97, 77, 23, 73, 61]


In [61]:
def large_or_small(x, threshold=5000):
    """Classify a numeric value as 'large' or 'small'.

    Parameters
    ----------
    x : number
        Value to classify (e.g. one entry of the 'area' column).
    threshold : number, default 5000
        Cut-off. The default reproduces the original hard-coded behaviour, so
        ``Series.apply(large_or_small)`` keeps working unchanged.

    Returns
    -------
    str
        'large' if ``x > threshold``; otherwise 'small'. A value exactly equal
        to the threshold is therefore 'small'.
    """
    return 'large' if x > threshold else 'small'


# Same labelling as the lambda version, but passing the named function.
# The trailing expression displays the resulting column.
house_df['area_evaluation'] = house_df['area'].map(large_or_small)
house_df['area_evaluation']

0      large
1      large
2      large
3      large
4      large
       ...  
540    small
541    small
542    small
543    small
544    small
Name: area_evaluation, Length: 545, dtype: object

'GEEKSFORGEEKS'