# Numpy's Limitations Example

In [33]:
# Nested list: a three-item header row followed by three four-item value rows
# (sizes, prices, cities), so the rows do not all have the same length.
house_data = [
    ['Size of House', 'Price', 'City'],
    [5000, 750, 1, 1500],
    [2, 300000, 1000000, 300000],
    ['Brecon', 'Manchester', 'London', 'Leeds'],
]

In [34]:
import numpy as np

In [35]:
# The rows of house_data have different lengths (3, 4, 4, 4). Recent NumPy
# releases raise ValueError for such ragged input unless dtype=object is given,
# so request the object array explicitly; the result is a 1-D array of lists.
np_house_data = np.array(house_data, dtype=object)

In [36]:
# Print the array to see how NumPy stored the nested list.
print(np_house_data)

[list(['Size of House', 'Price', 'City']) list([5000, 750, 1, 1500])
 list([2, 300000, 1000000, 300000])
 list(['Brecon', 'Manchester', 'London', 'Leeds'])]


In [37]:
# The same data keyed by column name instead of stored as nested rows.
house_data = {
    'Size of House': [5000, 750, 1, 1500],
    'Price': [2, 300000, 1000000, 300000],
    'City': ['Brecon', 'Manchester', 'London', 'Leeds'],
}

# NumPy does not turn the dictionary into a table; printing the array below
# just shows the dictionary itself.
np_house_data = np.array(house_data)

print(np_house_data)

{'Size of House': [5000, 750, 1, 1500], 'Price': [2, 300000, 1000000, 300000], 'City': ['Brecon', 'Manchester', 'London', 'Leeds']}


In [38]:
import pandas as pd

In [39]:
# Build a DataFrame from the dictionary: each key becomes a column name.
pd_house_data = pd.DataFrame(house_data)

In [40]:
# Display the DataFrame; the left-most numbers are the automatically assigned row index.
print(pd_house_data)

         City    Price  Size of House
0      Brecon        2           5000
1  Manchester   300000            750
2      London  1000000              1
3       Leeds   300000           1500


# How to Set the Index Row in Pandas

In [43]:
# Replace the default row labels with 1-4 (one label per row, in order).
pd_house_data.index = [1, 2, 3, 4]

In [44]:
# Confirm the row labels now run from 1 to 4.
print(pd_house_data)

         City    Price  Size of House
1      Brecon        2           5000
2  Manchester   300000            750
3      London  1000000              1
4       Leeds   300000           1500


In [95]:
# Row labels can be strings as well as numbers.
pd_house_data.index = ['BRE', 'MAN', 'LON', 'LEE']
print(pd_house_data)

           City    Price  Size of House
BRE      Brecon        2           5000
MAN  Manchester   300000            750
LON      London  1000000              1
LEE       Leeds   300000           1500


# Importing Data with Pandas

In [50]:
# Load the CSV with default settings; pandas adds its own 0-based index for the rows.
new_house_data = pd.read_csv('house_data.csv')

In [51]:
# The file's first column shows up as a regular column named "Unnamed: 0".
print(new_house_data)

   Unnamed: 0        City    Price  Size of House
0           1      Brecon        2           5000
1           2  Manchester   300000            750
2           3      London  1000000              1
3           4       Leeds   300000           1500


In [52]:
# Use the first column of the file (position 0) as the row index instead.
new_house_data = pd.read_csv('house_data.csv', index_col = 0)

In [53]:
# The first column of the file now provides the row labels.
print(new_house_data)

         City    Price  Size of House
1      Brecon        2           5000
2  Manchester   300000            750
3      London  1000000              1
4       Leeds   300000           1500


# Exercise 1
## 1. Look at the table about cars below 
|      | Colour | Year |
|------|--------|------|
| 0    | green  | 2016 |
| 1    | blue   | 1994 |
| 2    | red    | 2001 |
| 3    | yellow | 1987 |
| 4    | orange | 2003 |




## a. Make it into a dictionary called cars
## b. Convert it to a pandas data frame and save it in a variable pd_cars
## c. Change the index values using the .index method to GR, BL, RE, YE, OR

## 2. Open the pandas csv file and save it as the variable pandas
## b. Format it such that the second column is the index column
## c. Change the index column to Yi, Ch, La, Sa, We, Ju


In [234]:
# Column-oriented data for the cars table: one list per column.
cars = {
    'Colour': ['green', 'blue', 'red', 'yellow', 'orange'],
    'Year': [2016, 1994, 2001, 1987, 2003],
}

In [235]:
# Exercise 1b: convert the cars dictionary into a DataFrame.
pd_cars = pd.DataFrame(cars)

In [236]:
# Check the result; the rows carry the default 0-4 index.
print(pd_cars)

   Colour  Year
0   green  2016
1    blue  1994
2     red  2001
3  yellow  1987
4  orange  2003


In [237]:
# Exercise 1c: relabel the rows with two-letter colour codes.
pd_cars.index = ['GR', 'BL', 'RE', 'YE', 'OR']

In [238]:
# Confirm the new two-letter row labels.
print(pd_cars)

    Colour  Year
GR   green  2016
BL    blue  1994
RE     red  2001
YE  yellow  1987
OR  orange  2003


In [239]:
# Exercise 2: the variable name "pandas" is what the exercise asks for.
# The library itself was imported under the alias pd, so the two do not clash.
pandas = pd.read_csv('pandas.csv')

In [240]:
# The names column is shown as "Unnamed: 1" - presumably it has no header in the file.
print(pandas)

   Age Unnamed: 1  Weight
0    2       Ying     1.8
1    5       Chao     3.4
2   14        Lan     8.7
3    3        San     2.2
4    7        Wei     4.3
5    1        Jun     0.9


In [241]:
# Exercise 2b: index_col = 1 makes the second column (the names) the row index.
pandas = pd.read_csv('pandas.csv', index_col = 1)

In [242]:
# The names are now the row labels.
print(pandas)

      Age  Weight
Ying    2     1.8
Chao    5     3.4
Lan    14     8.7
San     3     2.2
Wei     7     4.3
Jun     1     0.9


In [243]:
# Exercise 2c: shorten each row label to its first two letters.
pandas.index = ['Yi', 'Ch', 'La', 'Sa', 'We', 'Ju']

In [244]:
# Confirm the shortened row labels.
print(pandas)

    Age  Weight
Yi    2     1.8
Ch    5     3.4
La   14     8.7
Sa    3     2.2
We    7     4.3
Ju    1     0.9


# Accessing Data From Pandas Data Frames

### Suppose we only wanted to see the size of the houses. How could we access only the 'Size of House' column?


In [62]:
# Single square brackets with one column name return that column on its own.
print(new_house_data['Size of House'])

1    5000
2     750
3       1
4    1500
Name: Size of House, dtype: int64


In [63]:
# The single-bracket selection is a pandas Series.
type(new_house_data['Size of House'])

pandas.core.series.Series

In [64]:
# Passing a list of column names (double brackets) keeps the result as a table.
print(new_house_data[['Size of House']])

   Size of House
1           5000
2            750
3              1
4           1500


In [65]:
# The double-bracket selection is a DataFrame.
type(new_house_data[['Size of House']])

pandas.core.frame.DataFrame

### How can we select more than one column?

In [66]:
# List several column names to select them together, in the order given.
print(new_house_data[['Size of House', 'City']])

   Size of House        City
1           5000      Brecon
2            750  Manchester
3              1      London
4           1500       Leeds


### How to subset rows in pandas?

In [70]:
# 'London' is a value in the 'City' column, not a column name, so
# new_house_data['London'] would raise a KeyError. Compare the column with the
# value to get a True/False mask and use that mask to keep the matching rows.
print(new_house_data[new_house_data['City'] == 'London'])

     City    Price  Size of House
3  London  1000000              1


In [245]:
# A bare slice selects rows by position: 2:3 is the third row only.
print(new_house_data[2:3]) 

     City    Price  Size of House
3  London  1000000              1


# loc and iloc

## loc is label based, iloc is position based

In [74]:
# Reminder of the frame whose row labels are 1-4.
print(new_house_data)

         City    Price  Size of House
1      Brecon        2           5000
2  Manchester   300000            750
3      London  1000000              1
4       Leeds   300000           1500


In [80]:
print(new_house_data.loc[1]) # single brackets: the row labelled 1 comes back as a Series

City             Brecon
Price                 2
Size of House      5000
Name: 1, dtype: object


In [81]:
# NOTE(review): the saved output for this cell shows row labels W, E, E, E,
# which looks stale. The index was set to 'BRE', 'MAN', 'LON', 'LEE' earlier,
# and the next cell looks up the label 'BRE'. Re-run the notebook top to
# bottom to refresh it.
print(pd_house_data)

         City    Price  Size of House
W      Brecon        2           5000
E  Manchester   300000            750
E      London  1000000              1
E       Leeds   300000           1500


In [96]:
# .loc with a list of labels returns a DataFrame containing those rows.
print(pd_house_data.loc[['BRE']])

       City  Price  Size of House
BRE  Brecon      2           5000


In [97]:
# .iloc uses positions: position 0 is the first row, whatever its label.
print(new_house_data.iloc[[0]])

     City  Price  Size of House
1  Brecon      2           5000


In [98]:
# Position 0 again, this time on the frame with string labels.
print(pd_house_data.iloc[[0]])

       City  Price  Size of House
BRE  Brecon      2           5000


### Selecting multiple rows

In [100]:
# Select the rows labelled 2 and 4.
print(new_house_data.loc[[2,4]])

         City   Price  Size of House
2  Manchester  300000            750
4       Leeds  300000           1500


In [89]:
# Positions 1 and 3 are the same two rows (labels 2 and 4).
print(new_house_data.iloc[[1,3]])

         City   Price  Size of House
2  Manchester  300000            750
4       Leeds  300000           1500


In [90]:
# Rows first, columns second: labels 2 and 4, columns 'City' and 'Size of House'.
print(new_house_data.loc[[2,4],['City','Size of House']])

         City  Size of House
2  Manchester            750
4       Leeds           1500


In [92]:
# The same selection by position: rows 1 and 3, columns 0 and 2.
print(new_house_data.iloc[[1,3],[0,2]])

         City  Size of House
2  Manchester            750
4       Leeds           1500


In [93]:
# A bare colon keeps every row; only the listed columns are returned.
print(new_house_data.loc[:,['City','Size of House']])

         City  Size of House
1      Brecon           5000
2  Manchester            750
3      London              1
4       Leeds           1500


In [94]:
# Every row, columns at positions 0 and 2.
print(new_house_data.iloc[:,[0,2]])

         City  Size of House
1      Brecon           5000
2  Manchester            750
3      London              1
4       Leeds           1500


# Exercise 2

## 1. Open the countries4 csv file and save it as the variable countries
## Make sure to put one space after the country when indexing e.g. 'Argentina' should be 'Argentina '.
## 2. What is the name of the 27th country on the list (hint it's at index 26)?
## 3. Which country has a higher population, Bulgaria or Colombia?
## 4. What is the area of Burkina Faso? (leave a space after area i.e 'Area ')


# Answers
## 1. countries = pd.read_csv('countries4.csv', index_col = 0)
## 2. Brunei
## 3. Colombia
## 4. 274200

In [182]:
# Load the countries file, using its first column as the row index.
countries = pd.read_csv('countries4.csv', index_col = 0)

# Filtering Pandas Data Frames
## Suppose we wanted to know all the countries whose area was less than 1000

In [185]:
# The column name includes a trailing space, so it must be written as 'Area '.
countries['Area '] 

Afghanistan               647500
Albania                    28748
Algeria                  2381740
Anguilla                     102
Antigua & Barbuda            443
Argentina                2766890
Armenia                    29800
Aruba                        193
Australia                7686850
Austria                    83870
Azerbaijan                 86600
Bahamas, The               13940
Bahrain                      665
Bangladesh                144000
Barbados                     431
Belarus                   207600
Belgium                    30528
Belize                     22966
Benin                     112620
Bermuda                       53
Bhutan                     47000
Bolivia                  1098580
Bosnia & Herzegovina       51129
Botswana                  600370
Brazil                   8511965
British Virgin Is.           153
Brunei                      5770
Bulgaria                  110910
Burkina Faso              274200
Burma                     678500
Burundi   

In [192]:
# Comparing a column with a number gives one True/False value per row.
small = countries['Area '] < 1000

In [193]:
# Indexing with the True/False values keeps only the rows marked True.
countries[small]

Unnamed: 0,Population,Area,Pop. Density
Anguilla,13477,102,132.1
Antigua & Barbuda,69108,443,156.0
Aruba,71891,193,372.5
Bahrain,698585,665,1050.5
Barbados,279912,431,649.5
Bermuda,65773,53,1241.0
British Virgin Is.,23098,153,151.0
Cayman Islands,45436,262,173.4


In [194]:
# The same filter written in one step, without the intermediate variable.
countries[countries['Area '] < 1000]

Unnamed: 0,Population,Area,Pop. Density
Anguilla,13477,102,132.1
Antigua & Barbuda,69108,443,156.0
Aruba,71891,193,372.5
Bahrain,698585,665,1050.5
Barbados,279912,431,649.5
Bermuda,65773,53,1241.0
British Virgin Is.,23098,153,151.0
Cayman Islands,45436,262,173.4


## Suppose I wanted to find the countries with areas less than 1000 and populations greater than 200,000

In [200]:
# Both conditions must hold for a row to be kept. Each comparison needs its own
# parentheses because & binds more tightly than < and >.
dense = (countries['Area '] < 1000) & (countries['Population'] > 200000)

In [201]:
# Apply the combined mask to keep only the matching rows.
print(countries[dense])

           Population  Area   Pop. Density 
Bahrain        698585    665         1050.5
Barbados       279912    431          649.5


In [203]:
# The same small-area, large-population filter written inline in a single expression.
print(countries[(countries['Area '] < 1000) & (countries['Population'] > 200000)])

           Population  Area   Pop. Density 
Bahrain        698585    665         1050.5
Barbados       279912    431          649.5


# Exercise 3
## 1. How many countries have a population density ('Pop. Density ') over 1000 or an area over 1000? (use the built-in len function)
## 2. Which country has an area of exactly 53? (Hint use == )
## 3. Which country has a population of more than 9 million but less than 10 million and an area of more than 1 million but less than 1.5 million?

In [209]:
# Exercise 3.1: count the rows where either condition holds
# (density above 1000 or area above 1000).
print(len(countries[(countries['Pop. Density '] > 1000) | (countries['Area '] > 1000)]))

38


In [211]:
# Exercise 3.2: == tests for equality, so only rows whose area is exactly 53 are kept.
print(countries[countries['Area ']==53])

          Population  Area   Pop. Density 
Bermuda        65773     53         1241.0


In [218]:
# Exercise 3.3, first condition: population strictly between 9 and 10 million.
pop_array = (countries['Population'] > 9000000) & (countries['Population'] < 10000000)

In [224]:
# Exercise 3.3, second condition: area strictly between 1 and 1.5 million.
area_array = (countries['Area '] > 1000000) & (countries['Area '] < 1500000)

In [225]:
# Keep only the rows that satisfy both the population and the area condition.
countries[pop_array & area_array]

Unnamed: 0,Population,Area,Pop. Density
Chad,9944201,1284000,7.7
