##### Pandas DataFrame //
Create DataFrame *Manually*

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

In [2]:
# Create a DataFrame through a two-dimensional array
df1 = pd.DataFrame([
    ['joe', 'san', 'andy'], 
    [70, 80, 90],
])
df1

Unnamed: 0,0,1,2
0,joe,san,andy
1,70,80,90


In [3]:
# Create a DataFrame with index
df2 = pd.DataFrame([
    ['joe', 'san', 'andy'], 
    [70, 80, 90],
], index = ['first', 'second'],     # index parameter for ROW index
    columns =['a', 'b', 'c'] )      # columns parameter for COLUMN index

df2

Unnamed: 0,a,b,c
first,joe,san,andy
second,70,80,90


In [4]:
df2.index

Index(['first', 'second'], dtype='object')

In [5]:
df2.columns

Index(['a', 'b', 'c'], dtype='object')

In [6]:
df2.values

array([['joe', 'san', 'andy'],
       [70, 80, 90]], dtype=object)

In [7]:
# Create a DataFrame through a dictionary
df3 = pd.DataFrame({
    'name': ['joe', 'san', 'andy'],     # 'key' becomes COLUMN index
    'sex': ['male', 'male', 'female'],
    'age':[10, 20, 30],
    'id': 3,
}, index = ['first', 'second', 'third']) # "index" parameter defines ROW index
df3

Unnamed: 0,name,sex,age,id
first,joe,male,10,3
second,san,male,20,3
third,andy,female,30,3


In [8]:
df3.index

Index(['first', 'second', 'third'], dtype='object')

In [9]:
df3.columns

Index(['name', 'sex', 'age', 'id'], dtype='object')

In [10]:
df3.values

array([['joe', 'male', 10, 3],
       ['san', 'male', 20, 3],
       ['andy', 'female', 30, 3]], dtype=object)

In [11]:
df3['name']

first      joe
second     san
third     andy
Name: name, dtype: object

get data and crud operations

In [12]:
# append a column
df3['city'] = ['beijing', 'shanghai', 'chengdu']
df3

Unnamed: 0,name,sex,age,id,city
first,joe,male,10,3,beijing
second,san,male,20,3,shanghai
third,andy,female,30,3,chengdu


In [13]:
# remove a column
df3.pop('city')

first      beijing
second    shanghai
third      chengdu
Name: city, dtype: object

In [14]:
df3


Unnamed: 0,name,sex,age,id
first,joe,male,10,3
second,san,male,20,3
third,andy,female,30,3


In [15]:
df3['age'] = [18,19,20]
df3

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3


In [16]:
df3.loc['second']

name     san
sex     male
age       19
id         3
Name: second, dtype: object

In [17]:
df3.loc['second',['age', 'sex']]

age      19
sex    male
Name: second, dtype: object

In [18]:
df3.loc['fourth'] = ['peter', 'female', 22, 4]  # insert a row

In [19]:
df3

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3
fourth,peter,female,22,4


In [20]:
df3.drop('fourth')   # remove a row

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3


In [21]:
weather_data = {
    'day': ['1/1/2020', '1/2/2020', '1/3/2020', '1/4/2020', '1/5/2020', '1/6/2020'],
    'temperature': [32, 35, 28, 24, 32, 31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow', 'Snow', 'Rain', 'Windy']
}
df4 = pd.DataFrame(weather_data)
df4.head(6)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2020,32,6,Rain
1,1/2/2020,35,7,Sunny
2,1/3/2020,28,2,Snow
3,1/4/2020,24,7,Snow
4,1/5/2020,32,4,Rain
5,1/6/2020,31,2,Windy


In [22]:
df4.shape

(6, 4)

In [23]:
df4.tail()

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2020,35,7,Sunny
2,1/3/2020,28,2,Snow
3,1/4/2020,24,7,Snow
4,1/5/2020,32,4,Rain
5,1/6/2020,31,2,Windy


In [24]:
df4.iloc[-1, :]

day            1/6/2020
temperature          31
windspeed             2
event             Windy
Name: 5, dtype: object

In [25]:
df4.columns

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [26]:
df4.event

0     Rain
1    Sunny
2     Snow
3     Snow
4     Rain
5    Windy
Name: event, dtype: object

In [27]:
df4.dtypes

day            object
temperature     int64
windspeed       int64
event          object
dtype: object

In [28]:
df4 = df4[['day', 'event', 'windspeed', 'temperature']]
df4.head()

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32


In [29]:
print(df4.describe())

       windspeed  temperature
count   6.000000     6.000000
mean    4.666667    30.333333
std     2.338090     3.829708
min     2.000000    24.000000
25%     2.500000    28.750000
50%     5.000000    31.500000
75%     6.750000    32.000000
max     7.000000    35.000000


#### Pandas Select

In [30]:
df4.loc[df4.temperature == df4.temperature.max(), :]

Unnamed: 0,day,event,windspeed,temperature
1,1/2/2020,Sunny,7,35


In [31]:
df4[df4.temperature >= 32]

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
4,1/5/2020,Rain,4,32


In [32]:
df4[df4.temperature == df4.temperature.max()]

Unnamed: 0,day,event,windspeed,temperature
1,1/2/2020,Sunny,7,35


In [33]:
df4.loc[df4['temperature'] == df4['temperature'].max(), ['day', 'windspeed', 'event']]

Unnamed: 0,day,windspeed,event
1,1/2/2020,7,Sunny


#### Pandas Indexing

In [34]:
data = pd.Series(['a','b', 'c'], index = [1,3,5])
print(data[5])      #explicit index, use loc instead
print(data[1:3])    #implicit index, use iloc instead


c
3    b
5    c
dtype: object


In [35]:
df4

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32
5,1/6/2020,Windy,2,31


In [36]:
df4.set_index('day', inplace = True)
df4

Unnamed: 0_level_0,event,windspeed,temperature
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2020,Rain,6,32
1/2/2020,Sunny,7,35
1/3/2020,Snow,2,28
1/4/2020,Snow,7,24
1/5/2020,Rain,4,32
1/6/2020,Windy,2,31


In [37]:
df4.loc['1/4/2020', :]

event          Snow
windspeed         7
temperature      24
Name: 1/4/2020, dtype: object

In [38]:
df4.reset_index(inplace = True)
df4

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32
5,1/6/2020,Windy,2,31


In [39]:
df4.set_index('event', inplace = True)
df4

Unnamed: 0_level_0,day,windspeed,temperature
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2020,6,32
Sunny,1/2/2020,7,35
Snow,1/3/2020,2,28
Snow,1/4/2020,7,24
Rain,1/5/2020,4,32
Windy,1/6/2020,2,31


In [40]:
df4.loc['Snow', :]

Unnamed: 0_level_0,day,windspeed,temperature
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Snow,1/3/2020,2,28
Snow,1/4/2020,7,24


In [41]:
df4.reset_index(inplace = True)
df4

Unnamed: 0,event,day,windspeed,temperature
0,Rain,1/1/2020,6,32
1,Sunny,1/2/2020,7,35
2,Snow,1/3/2020,2,28
3,Snow,1/4/2020,7,24
4,Rain,1/5/2020,4,32
5,Windy,1/6/2020,2,31


##### Import Data from CSV

In [42]:
filepath = "./data/FAO_database.csv"
data = pd.read_csv(filepath, encoding = "ISO-8859-1")
type(data)

pandas.core.frame.DataFrame

In [43]:
# preview the data
data.head()


Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,...,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,Afghanistan,2511,Wheat and products,...,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),...,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,...,379.0,315.0,203.0,367,360
3,AF,2,Afghanistan,2513,Barley and products,...,55.0,60.0,72.0,78,89
4,AF,2,Afghanistan,2514,Maize and products,...,195.0,178.0,191.0,200,200


In [44]:
data.shape

(21477, 63)

In [45]:
data.ndim

2

In [46]:
# show columns data type
data.dtypes

Area Abbreviation     object
Area Code              int64
Area                  object
Item Code              int64
Item                  object
                      ...   
Y2009                float64
Y2010                float64
Y2011                float64
Y2012                  int64
Y2013                  int64
Length: 63, dtype: object

In [47]:
# change column 'Item Code' to str type
data['Item Code'].astype(str)

0        2511
1        2805
2        2513
3        2513
4        2514
         ... 
21472    2948
21473    2960
21474    2960
21475    2961
21476    2928
Name: Item Code, Length: 21477, dtype: object

In [48]:
# show summary information
data['Y2013'].describe()   # float64 summary information

count     21477.000000
mean        575.557480
std        6218.379479
min        -246.000000
25%           0.000000
50%           8.000000
75%          90.000000
max      489299.000000
Name: Y2013, dtype: float64

In [49]:
data['Area'].describe()   # obj summary information

count     21477
unique      174
top       Spain
freq        150
Name: Area, dtype: object

In [50]:
data.describe()   # return numeric columns' summary info

Unnamed: 0,Area Code,Item Code,Element Code,latitude,longitude,...,Y2009,Y2010,Y2011,Y2012,Y2013
count,21477.0,21477.0,21477.0,21477.0,21477.0,...,21373.0,21373.0,21373.0,21477.0,21477.0
mean,125.449411,2694.211529,5211.687154,20.450613,15.794445,...,524.581996,535.492069,553.399242,560.569214,575.55748
std,72.868149,148.973406,146.820079,24.628336,66.012104,...,5545.939303,5721.089425,5883.071604,6047.950804,6218.379479
min,1.0,2511.0,5142.0,-40.9,-172.1,...,0.0,0.0,0.0,-169.0,-246.0
25%,63.0,2561.0,5142.0,6.43,-11.78,...,0.0,0.0,0.0,0.0,0.0
50%,120.0,2640.0,5142.0,20.59,19.15,...,7.0,7.0,8.0,8.0,8.0
75%,188.0,2782.0,5142.0,41.15,46.87,...,83.0,83.0,86.0,88.0,90.0
max,276.0,2961.0,5521.0,64.96,179.41,...,434724.0,451838.0,462696.0,479028.0,489299.0


##### Select Columns
1. data.column_name
2. data['column_name']
3. data.iloc[:,<column_number>]
4. data.loc[:, ['comumn_name', ...]]

In [51]:
# data.column_name
data.Area.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: Area, dtype: object

In [52]:
# data['column_name']
data[['Area', 'Item']].head()

Unnamed: 0,Area,Item
0,Afghanistan,Wheat and products
1,Afghanistan,Rice (Milled Equivalent)
2,Afghanistan,Barley and products
3,Afghanistan,Barley and products
4,Afghanistan,Maize and products


In [53]:
# data['column_name'] get a Series, then can Series operation
ret = [data['Y2013'].sum(),data['Y2013'].mean(),data['Y2013'].median(),
      data['Y2013'].nunique(),data['Y2013'].max(),data['Y2013'].min(),]
ret

[12361248, 575.5574800949853, 8.0, 2107, 489299, -246]

In [54]:
# data.iloc[...]
data.iloc[:,[0,1,3,5]]

Unnamed: 0,Area Abbreviation,Area Code,Item Code,Element Code
0,AF,2,2511,5142
1,AF,2,2805,5142
2,AF,2,2513,5521
3,AF,2,2513,5142
4,AF,2,2514,5521
...,...,...,...,...
21472,ZW,181,2948,5142
21473,ZW,181,2960,5521
21474,ZW,181,2960,5142
21475,ZW,181,2961,5142


In [55]:
data.iloc[:,[1,2]].head()

Unnamed: 0,Area Code,Area
0,2,Afghanistan
1,2,Afghanistan
2,2,Afghanistan
3,2,Afghanistan
4,2,Afghanistan


##### Select Rows
1. data.iloc[0:10, :]
2. data.loc['index', :]
3. conditional selection

In [56]:
# data.iloc[,]
data.iloc[[1,2],:].head()

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,...,Y2009,Y2010,Y2011,Y2012,Y2013
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),...,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,...,379.0,315.0,203.0,367,360


In [57]:
# data.loc['index', ]
data.loc[2,:]        # select the row with index == 2

Area Abbreviation                     AF
Area Code                              2
Area                         Afghanistan
Item Code                           2513
Item                 Barley and products
                            ...         
Y2009                                379
Y2010                                315
Y2011                                203
Y2012                                367
Y2013                                360
Name: 2, Length: 63, dtype: object

In [58]:
# conditional selection
data[ data['Area'] == 'Ireland'].head()

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,...,Y2009,Y2010,Y2011,Y2012,Y2013
9533,IE,104,Ireland,2511,Wheat and products,...,804.0,783.0,760.0,650,600
9534,IE,104,Ireland,2511,Wheat and products,...,493.0,512.0,502.0,494,500
9535,IE,104,Ireland,2805,Rice (Milled Equivalent),...,4.0,4.0,4.0,4,4
9536,IE,104,Ireland,2805,Rice (Milled Equivalent),...,15.0,16.0,14.0,14,14
9537,IE,104,Ireland,2513,Barley and products,...,1290.0,1283.0,1182.0,1146,1380


In [59]:
# the following two have the same results
data['latitude'] >= 0
data.loc[:, 'latitude'] >= 0

0         True
1         True
2         True
3         True
4         True
         ...  
21472    False
21473    False
21474    False
21475    False
21476    False
Name: latitude, Length: 21477, dtype: bool

In [60]:
# same results
data[data['latitude'] >= 0]
data[data.loc[:,'latitude'] >= 0]

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,...,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,Afghanistan,2511,Wheat and products,...,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),...,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,...,379.0,315.0,203.0,367,360
3,AF,2,Afghanistan,2513,Barley and products,...,55.0,60.0,72.0,78,89
4,AF,2,Afghanistan,2514,Maize and products,...,195.0,178.0,191.0,200,200
...,...,...,...,...,...,...,...,...,...,...,...
21231,YE,249,Yemen,2948,Milk - Excluding Butter,...,984.0,738.0,854.0,1050,1073
21232,YE,249,Yemen,2960,"Fish, Seafood",...,0.0,0.0,0.0,0,0
21233,YE,249,Yemen,2960,"Fish, Seafood",...,78.0,69.0,60.0,60,60
21234,YE,249,Yemen,2961,"Aquatic Products, Other",...,0.0,0.0,0.0,0,0


In [61]:
data.loc[data['latitude'] >= 0, 'Area':]

Unnamed: 0,Area,Item Code,Item,Element Code,Element,...,Y2009,Y2010,Y2011,Y2012,Y2013
0,Afghanistan,2511,Wheat and products,5142,Food,...,4538.0,4605.0,4711.0,4810,4895
1,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,...,415.0,442.0,476.0,425,422
2,Afghanistan,2513,Barley and products,5521,Feed,...,379.0,315.0,203.0,367,360
3,Afghanistan,2513,Barley and products,5142,Food,...,55.0,60.0,72.0,78,89
4,Afghanistan,2514,Maize and products,5521,Feed,...,195.0,178.0,191.0,200,200
...,...,...,...,...,...,...,...,...,...,...,...
21231,Yemen,2948,Milk - Excluding Butter,5142,Food,...,984.0,738.0,854.0,1050,1073
21232,Yemen,2960,"Fish, Seafood",5521,Feed,...,0.0,0.0,0.0,0,0
21233,Yemen,2960,"Fish, Seafood",5142,Food,...,78.0,69.0,60.0,60,60
21234,Yemen,2961,"Aquatic Products, Other",5142,Food,...,0.0,0.0,0.0,0,0


##### Iterate DataFrame

In [62]:
for index, row in data.iterrows():
    print(row[0])

AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
AO
AO
AO
A

CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CA
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
CF
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
C

FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FI
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
FR
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
PF
GA
G

JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JM
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JP
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
JO
J

MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MZ
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
MM
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
NP


WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
WS
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
ST
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SA
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
SN
S

TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
TZ
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
US
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
UY
U

##### Delete columns

In [63]:
data1 = data.drop("Area", axis = 1)    # axis = 1 for column
data1.head()

Unnamed: 0,Area Abbreviation,Area Code,Item Code,Item,Element Code,...,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,2511,Wheat and products,5142,...,4538.0,4605.0,4711.0,4810,4895
1,AF,2,2805,Rice (Milled Equivalent),5142,...,415.0,442.0,476.0,425,422
2,AF,2,2513,Barley and products,5521,...,379.0,315.0,203.0,367,360
3,AF,2,2513,Barley and products,5142,...,55.0,60.0,72.0,78,89
4,AF,2,2514,Maize and products,5521,...,195.0,178.0,191.0,200,200


In [64]:
data2 = data.drop(columns = ['Area', "Area Code"])
data2.head()

Unnamed: 0,Area Abbreviation,Item Code,Item,Element Code,Element,...,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2511,Wheat and products,5142,Food,...,4538.0,4605.0,4711.0,4810,4895
1,AF,2805,Rice (Milled Equivalent),5142,Food,...,415.0,442.0,476.0,425,422
2,AF,2513,Barley and products,5521,Feed,...,379.0,315.0,203.0,367,360
3,AF,2513,Barley and products,5142,Food,...,55.0,60.0,72.0,78,89
4,AF,2514,Maize and products,5521,Feed,...,195.0,178.0,191.0,200,200


In [65]:
# data.drop("Area", axis = 1, inplace = True)
# original 'data' object is *CHANGED* when inplace = True

##### Delete Rows

In [66]:
# drop() delete rows based on *INDEX NAME* , not numeric index
data.head(3)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,...,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,Afghanistan,2511,Wheat and products,...,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),...,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,...,379.0,315.0,203.0,367,360


In [67]:
data3 = data.drop([0,1], axis = 0)   # 0 and 1 are "INDEX NAME"
data3.head(3)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,...,Y2009,Y2010,Y2011,Y2012,Y2013
2,AF,2,Afghanistan,2513,Barley and products,...,379.0,315.0,203.0,367,360
3,AF,2,Afghanistan,2513,Barley and products,...,55.0,60.0,72.0,78,89
4,AF,2,Afghanistan,2514,Maize and products,...,195.0,178.0,191.0,200,200


In [68]:
# inplace = True, axis = 0 also delete from 'original' data object

##### Rename Columns Name

In [69]:
data4 = data.rename(columns = {
    "Area": "place_name",
    "Y2013": "year_2013"
}, inplace = False)
data4.head(3)

Unnamed: 0,Area Abbreviation,Area Code,place_name,Item Code,Item,...,Y2009,Y2010,Y2011,Y2012,year_2013
0,AF,2,Afghanistan,2511,Wheat and products,...,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),...,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,...,379.0,315.0,203.0,367,360


In [70]:
# change columns name to lowercase and replace whitespace with '_'
data4 = data.rename(columns = lambda x: x.lower().replace(' ','_'))
data4.head(2)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,...,y2009,y2010,y2011,y2012,y2013
0,AF,2,Afghanistan,2511,Wheat and products,...,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),...,415.0,442.0,476.0,425,422


In [71]:
# don't want row index name in output file, hence index = False
data4.to_csv("./data/output_FAO_database.csv", index = False, encoding = 'utf8')


In [72]:
# to debug 
# import xlsxwirter
# data4.to_csv("./data/output_FAO_database.xlsx", sheet_name = "Sheet 1", index = False)


In [73]:
ufo = pd.read_csv('./data/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [74]:
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [75]:
ufo.rename(columns = {'Colors Reported': 'Colors_Reported', 'Shape Reported': 'Shape_Reported'}, inplace = True)
ufo.head()

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [76]:
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols

In [77]:
ufo.head()

Unnamed: 0,city,colors reported,shape reported,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [78]:
ufo = pd.read_csv('./data/uforeports', names = ufo_cols, header = 0)
ufo.head()

Unnamed: 0,city,colors reported,shape reported,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [79]:
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns

Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')

##### Select a Pandas Series from DataFrame

In [80]:
ufo.head()

Unnamed: 0,city,colors_reported,shape_reported,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [81]:
type(ufo['city'])

pandas.core.series.Series

In [82]:
ufo.city + " " +ufo.state

0                      Ithaca NY
1                 Willingboro NJ
2                     Holyoke CO
3                     Abilene KS
4        New York Worlds Fair NY
                  ...           
18236              Grant Park IL
18237             Spirit Lake IA
18238             Eagle River WI
18239             Eagle River WI
18240                    Ybor FL
Length: 18241, dtype: object

In [83]:
ufo['location'] = ufo.city + ', ' + ufo.state
ufo.head()

Unnamed: 0,city,colors_reported,shape_reported,state,time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca, NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro, NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"


##### pandas methods and attributes 

In [84]:
movies = pd.read_csv('./data/imdbratings')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [85]:
movies.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [86]:
movies.shape

(979, 6)

In [87]:
movies.dtypes

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

In [88]:
movies.describe(include = ['object'])

Unnamed: 0,title,content_rating,genre,actors_list
count,979,976,979,979
unique,975,12,16,969
top,Dracula,R,Drama,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,2,460,278,6


##### Summary Functions

In [89]:
import pandas as pd
pd.set_option('max_rows', 5)
import numpy as np
reviews = pd.read_csv("./data/winemag-data-130k-v2.csv", index_col = 0)
reviews.head()

Unnamed: 0,country,description,designation,points,price,...,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,...,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,...,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,...,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,...,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,...,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [90]:
reviews.price.dtype     # a specific column dtype
reviews.dtypes          # whole df dtypes

country        object
description    object
                ...  
variety        object
winery         object
Length: 13, dtype: object

In [91]:
# numerical data type describe
reviews.points.describe()

count    129971.000000
mean         88.447138
             ...      
75%          91.000000
max         100.000000
Name: points, Length: 8, dtype: float64

In [92]:
# string data type describe
reviews.taster_name.describe()

count         103727
unique            19
top       Roger Voss
freq           25514
Name: taster_name, dtype: object

In [93]:
reviews.info()   # of entries, columns, non-null, memory, columns dtype, etc

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129971 entries, 0 to 129970
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   country                129908 non-null  object 
 1   description            129971 non-null  object 
 2   designation            92506 non-null   object 
 3   points                 129971 non-null  int64  
 4   price                  120975 non-null  float64
 5   province               129908 non-null  object 
 6   region_1               108724 non-null  object 
 7   region_2               50511 non-null   object 
 8   taster_name            103727 non-null  object 
 9   taster_twitter_handle  98758 non-null   object 
 10  title                  129971 non-null  object 
 11  variety                129970 non-null  object 
 12  winery                 129971 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 13.9+ MB


In [94]:
# to see mean of points
reviews.points.mean()

88.44713820775404

In [95]:
# to see a list of unique string values. Returns a ndarray dtype
reviews.taster_name.unique()
#len(reviews.taster_name.unique())


array(['Kerin O’Keefe', 'Roger Voss', 'Paul Gregutt',
       'Alexander Peartree', 'Michael Schachner', 'Anna Lee C. Iijima',
       'Virginie Boone', 'Matt Kettmann', nan, 'Sean P. Sullivan',
       'Jim Gordon', 'Joe Czerwinski', 'Anne Krebiehl\xa0MW',
       'Lauren Buzzeo', 'Mike DeSimone', 'Jeff Jenssen',
       'Susan Kostrzewa', 'Carrie Dykes', 'Fiona Adams',
       'Christina Pickard'], dtype=object)

In [96]:
# to see a list of unique values and how often they occur in the dataset
reviews.taster_name.value_counts()

Roger Voss           25514
Michael Schachner    15134
                     ...  
Fiona Adams             27
Christina Pickard        6
Name: taster_name, Length: 19, dtype: int64

##### map

In [97]:
# demean the points
reviews_points_mean = reviews.points.mean()
reviews.points.map(lambda p: p - reviews_points_mean)  #not inplace


0        -1.447138
1        -1.447138
            ...   
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

In [98]:
#  to debug
def remain_points(row):
    row.points = row.points - reviews_points_mean
    return row

reviews.apply(remain_points, axis = "columns")
reviews

Unnamed: 0,country,description,designation,points,price,...,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,...,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,...,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
...,...,...,...,...,...,...,...,...,...,...,...
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,...,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss
129970,France,"Big, rich and off-dry, this is powered by inte...",Lieu-dit Harth Cuvée Caroline,90,21.0,...,Roger Voss,@vossroger,Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car...,Gewürztraminer,Domaine Schoffit


In [99]:
reviews_points_mean = reviews.points.mean()
reviews2 = reviews.points - reviews_points_mean
reviews2

0        -1.447138
1        -1.447138
            ...   
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

In [100]:
reviews.country + '-' + reviews.region_1

0            Italy-Etna
1                   NaN
              ...      
129969    France-Alsace
129970    France-Alsace
Length: 129971, dtype: object

##### Groupby

In [101]:
# group by 'points' and count the occurance of each point
reviews.groupby("points").points.count()

points
80     397
81     692
      ... 
99      33
100     19
Name: points, Length: 21, dtype: int64

In [102]:
# to get the cheapest wine in each point value category
reviews.groupby("points").price.min()

points
80      5.0
81      5.0
       ... 
99     44.0
100    80.0
Name: price, Length: 21, dtype: float64

In [103]:
# to select the name of the first wine reivewed from each winery in the dataset
reviews.groupby("winery").apply(lambda df: df.title.iloc[0])

winery
1+1=3                          1+1=3 NV Rosé Sparkling (Cava)
10 Knots                 10 Knots 2010 Viognier (Paso Robles)
                                  ...                        
àMaurice    àMaurice 2013 Fred Estate Syrah (Walla Walla V...
Štoka                         Štoka 2009 Izbrani Teran (Kras)
Length: 16757, dtype: object

In [104]:
reviews.points.idxmin()

344

In [105]:
# to pick out the best wine by country and province
# pd.Series.idxmax() returns the index of a series's max value
reivews.groupby(["country", "province"]).apply(lambda df: df.loc[df.points.idxmax()])

NameError: name 'reivews' is not defined

In [None]:
# agg() lets you run a bunch of different functions on df simultaneously
reviews.groupby("country").price.agg([len, min, max])

##### multi-index

In [None]:
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len])
countries_reviewed

In [None]:
# to convert back to a regular index
countries_reviewed.reset_index()

In [None]:
countries_reviewed.sort_values(by = 'len', ascending = False)

In [None]:
countries_reviewed = countries_reviewed.reset_index()
countries_reviewed.sort_index()

##### Missing Data

In [None]:
# pd.isnull() and pd.notnull()
reviews[pd.isnull(reviews.country)]

In [None]:
# pd.fillna() to fill NaN 
reviews.region_2.fillna("Unknown")

In [None]:
# pd.replace()
reviews.taster_twitter_handle.replace("@kerinokeefe","@kerino")

In [None]:
melb_data= pd.read_csv("./data/melb_data.csv")
melb_data.head()

In [None]:
# to get the names of columns with missing value
cols_with_missing = [ col for col in melb_data.columns
                    if melb_data[col].isnull().any()]
cols_with_missing

##### 1. Drop columns with missing value

In [None]:
reduced_melb_data = melb_data.drop(cols_with_missing, axis = 1)
reduced_melb_data.shape

##### 2. use SimpleImputer to replace missing values with the mean value along each column

In [None]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()  # default strategy =mean()

melb_predictors = melb_data.drop('Price', axis = 1)      # predictors columns remain
X = melb_predictors.select_dtypes(exclude = 'object')    # only numeric columns remain

# Imputation
imputed_melb_data = pd.DataFrame(my_imputer.fit_transform(X))

# imputation removed column names, put them back
imputed_melb_data.columns = X.columns
imputed_melb_data.head()


##### 3. Impute the missing values, while also keeping track of which values were imputed

In [None]:
# make copy to avoid changing original data (when imputing)
X_plus = X.copy()

cols_with_missing = [ col for col in X.columns
                    if X[col].isnull().any()]

# make new columns indicating what will be imputed
for col in cols_with_missing:
    X_plus[col + '_was_missing'] = X_plus[col].isnull()
    
# Imputation
my_imputer = SimpleImputer()
imputed_X_plus = pd.DataFrame(my_imputer.fit_transform(X_plus))

# Put column names back
imputed_X_plus.columns = X_plus.columns
imputed_X_plus

##### Visualization

In [None]:
import matplotlib.pyplot as plt

data['latitude'].plot(kind = 'hist', bins = 100)
plt.xlabel('Latitude Value')
plt.show()

In [None]:
plot_data = data[data['Element'] == 'Food']
plot_data = plot_data.groupby('Area')['Y2013'].sum()
plot_data.sort_values()[-10:].plot(kind = 'bar')
plt.title("Top Ten Food Producers")
plt.ylabel("Food produced(tonnes)")
plt.show()