### Pandas DataFrame 
#### Create DataFrame *Manually*

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)

In [2]:
# Create a DataFrame through a two-dimensional array
df1 = pd.DataFrame([
    ['joe', 'san', 'andy'], 
    [70, 80, 90],
])
df1

Unnamed: 0,0,1,2
0,joe,san,andy
1,70,80,90


In [3]:
# Create a DataFrame with index
df2 = pd.DataFrame([
    ['joe', 'san', 'andy'], 
    [70, 80, 90],
], index = ['first', 'second'],     # index parameter for ROW index
    columns =['a', 'b', 'c'] )      # columns parameter for COLUMN index

df2

Unnamed: 0,a,b,c
first,joe,san,andy
second,70,80,90


In [4]:
df2.index

Index(['first', 'second'], dtype='object')

In [5]:
df2.columns

Index(['a', 'b', 'c'], dtype='object')

In [6]:
df2.values

array([['joe', 'san', 'andy'],
       [70, 80, 90]], dtype=object)

In [7]:
# Create a DataFrame through a dictionary
df3 = pd.DataFrame({
    'name': ['joe', 'san', 'andy'],     # 'key' becomes COLUMN index
    'sex': ['male', 'male', 'female'],
    'age':[10, 20, 30],
    'id': 3,
}, index = ['first', 'second', 'third']) # "index" parameter defines ROW index
df3

Unnamed: 0,name,sex,age,id
first,joe,male,10,3
second,san,male,20,3
third,andy,female,30,3


In [8]:
df3.index

Index(['first', 'second', 'third'], dtype='object')

In [9]:
df3.columns

Index(['name', 'sex', 'age', 'id'], dtype='object')

In [10]:
df3.values

array([['joe', 'male', 10, 3],
       ['san', 'male', 20, 3],
       ['andy', 'female', 30, 3]], dtype=object)

In [11]:
df3['name']

first      joe
second     san
third     andy
Name: name, dtype: object

#### get data and crud operations

In [12]:
# add a column
df3['city'] = ['beijing', 'shanghai', 'chengdu']
df3

Unnamed: 0,name,sex,age,id,city
first,joe,male,10,3,beijing
second,san,male,20,3,shanghai
third,andy,female,30,3,chengdu


In [13]:
# remove a column
df3.pop('city')

first      beijing
second    shanghai
third      chengdu
Name: city, dtype: object

In [14]:
df3

Unnamed: 0,name,sex,age,id
first,joe,male,10,3
second,san,male,20,3
third,andy,female,30,3


In [15]:
# update a column values
df3['age'] = [18,19,20]
df3

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3


In [16]:
df3.loc['second']

name     san
sex     male
age       19
id         3
Name: second, dtype: object

In [17]:
df3.loc['second', :]

name     san
sex     male
age       19
id         3
Name: second, dtype: object

In [18]:
df3.loc['second',['age', 'sex']]

age      19
sex    male
Name: second, dtype: object

In [19]:
df3.loc['fourth'] = ['peter', 'female', 22, 4]  # insert a row

In [20]:
df3

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3
fourth,peter,female,22,4


In [21]:
df3.drop('fourth')   # remove a row

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3


In [22]:
weather_data = {
    'day': ['1/1/2020', '1/2/2020', '1/3/2020', '1/4/2020', '1/5/2020', '1/6/2020'],
    'temperature': [32, 35, 28, 24, 32, 31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow', 'Snow', 'Rain', 'Windy']
}
df4 = pd.DataFrame(weather_data)
df4.head(6)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2020,32,6,Rain
1,1/2/2020,35,7,Sunny
2,1/3/2020,28,2,Snow
3,1/4/2020,24,7,Snow
4,1/5/2020,32,4,Rain
5,1/6/2020,31,2,Windy


In [23]:
df4.shape

(6, 4)

In [24]:
df4.tail()

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2020,35,7,Sunny
2,1/3/2020,28,2,Snow
3,1/4/2020,24,7,Snow
4,1/5/2020,32,4,Rain
5,1/6/2020,31,2,Windy


In [25]:
df4.iloc[-1, :]

day            1/6/2020
temperature          31
windspeed             2
event             Windy
Name: 5, dtype: object

In [26]:
df4.columns

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [27]:
df4.event

0     Rain
1    Sunny
2     Snow
3     Snow
4     Rain
5    Windy
Name: event, dtype: object

In [28]:
df4.dtypes

day            object
temperature     int64
windspeed       int64
event          object
dtype: object

In [29]:
df4 = df4[['day', 'event', 'windspeed', 'temperature']]
df4.head()

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32


In [30]:
print(df4.describe())

       windspeed  temperature
count   6.000000     6.000000
mean    4.666667    30.333333
std     2.338090     3.829708
min     2.000000    24.000000
25%     2.500000    28.750000
50%     5.000000    31.500000
75%     6.750000    32.000000
max     7.000000    35.000000


In [31]:
# select the latest day's data
df4.loc[df4.day == df4.day.max(), :]

Unnamed: 0,day,event,windspeed,temperature
5,1/6/2020,Windy,2,31


In [32]:
# select the earliest day's data
df4.loc[df4.day == df4.day.min(), :]

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32


In [33]:
# select the pandas data according to condition of any column
df4.loc[df4.temperature == df4.temperature.max(), :]

Unnamed: 0,day,event,windspeed,temperature
1,1/2/2020,Sunny,7,35


In [34]:
df4.loc[df4.temperature >= 32, :]

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
4,1/5/2020,Rain,4,32


In [35]:
# not better than the above solution
df4[df4.temperature >= 32]

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
4,1/5/2020,Rain,4,32


In [36]:
df4[df4.temperature == df4.temperature.max()]

Unnamed: 0,day,event,windspeed,temperature
1,1/2/2020,Sunny,7,35


In [37]:
df4.loc[df4['temperature'] == df4['temperature'].max(), ['day', 'windspeed', 'event']]

Unnamed: 0,day,windspeed,event
1,1/2/2020,7,Sunny


In [38]:
# update the hottest day's windspeed to 4
df4.loc[df4.temperature == df4.temperature.max(), 'windspeed'] = 4
df4

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,4,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32
5,1/6/2020,Windy,2,31


### Index

In [39]:
data = pd.Series(['a','b', 'c'], index = [1,3,5])
print(data[5])      #explicit index, use loc instead
print(data[1:3])    #implicit index, use iloc instead


c
3    b
5    c
dtype: object


In [40]:
df4

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,4,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32
5,1/6/2020,Windy,2,31


In [41]:
df4.set_index('day', inplace = True)
df4

Unnamed: 0_level_0,event,windspeed,temperature
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2020,Rain,6,32
1/2/2020,Sunny,4,35
1/3/2020,Snow,2,28
1/4/2020,Snow,7,24
1/5/2020,Rain,4,32
1/6/2020,Windy,2,31


In [42]:
df4.loc['1/4/2020', :]

event          Snow
windspeed         7
temperature      24
Name: 1/4/2020, dtype: object

In [43]:
df4.reset_index(inplace = True)
df4

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,4,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32
5,1/6/2020,Windy,2,31


In [44]:
df4.set_index('event', inplace = True)
df4

Unnamed: 0_level_0,day,windspeed,temperature
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2020,6,32
Sunny,1/2/2020,4,35
Snow,1/3/2020,2,28
Snow,1/4/2020,7,24
Rain,1/5/2020,4,32
Windy,1/6/2020,2,31


In [45]:
df4.loc['Snow', :]

Unnamed: 0_level_0,day,windspeed,temperature
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Snow,1/3/2020,2,28
Snow,1/4/2020,7,24


In [46]:
df4.reset_index(inplace = True)
df4

Unnamed: 0,event,day,windspeed,temperature
0,Rain,1/1/2020,6,32
1,Sunny,1/2/2020,4,35
2,Snow,1/3/2020,2,28
3,Snow,1/4/2020,7,24
4,Rain,1/5/2020,4,32
5,Windy,1/6/2020,2,31


In [47]:
drinks = pd.read_csv('./data/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [48]:
drinks.index

RangeIndex(start=0, stop=193, step=1)

In [49]:
drinks.columns

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'continent'],
      dtype='object')

#### Index for identification

In [50]:
drinks[drinks.continent == "South America"]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
6,Argentina,193,25,221,8.3,South America
20,Bolivia,167,41,8,3.8,South America
23,Brazil,245,145,16,7.2,South America
35,Chile,130,124,172,7.6,South America
37,Colombia,159,76,3,4.2,South America
52,Ecuador,162,74,3,4.2,South America
72,Guyana,93,302,1,7.1,South America
132,Paraguay,213,117,74,7.3,South America
133,Peru,163,160,21,6.1,South America
163,Suriname,128,178,7,5.6,South America


#### Index for selection

In [51]:
drinks.loc[20, 'beer_servings']

167

In [52]:
drinks.set_index('country', inplace = True)
drinks.head()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [53]:
drinks.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',
       'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='country', length=193)

In [54]:
drinks.loc['Brazil', 'beer_servings']

245

In [55]:
drinks.index.name = None
drinks.head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [56]:
drinks.index.name = 'country'
drinks.reset_index(inplace = True)
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [57]:
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [58]:
drinks.describe().index

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')

In [59]:
drinks.describe().loc['25%', 'beer_servings']

20.0

In [60]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

In [61]:
drinks.reset_index(inplace= True)
drinks.head()

Unnamed: 0,index,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,0,Afghanistan,0,0,0,0.0,Asia
1,1,Albania,89,132,54,4.9,Europe
2,2,Algeria,25,0,14,0.7,Africa
3,3,Andorra,245,138,312,12.4,Europe
4,4,Angola,217,57,45,5.9,Africa


In [62]:
drinks.set_index('country', inplace = True)
drinks.head()

Unnamed: 0_level_0,index,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,0,0,0,0,0.0,Asia
Albania,1,89,132,54,4.9,Europe
Algeria,2,25,0,14,0.7,Africa
Andorra,3,245,138,312,12.4,Europe
Angola,4,217,57,45,5.9,Africa


In [63]:
drinks.continent.head()

country
Afghanistan      Asia
Albania        Europe
Algeria        Africa
Andorra        Europe
Angola         Africa
Name: continent, dtype: object

In [64]:
drinks.continent.value_counts()

Africa           53
Europe           45
Asia             44
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64

In [164]:
type(drinks.continent.value_counts())

pandas.core.series.Series

In [65]:
drinks.continent.value_counts().values

array([53, 45, 44, 23, 16, 12], dtype=int64)

In [165]:
drinks.continent.value_counts().index

CategoricalIndex(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',
                  'South America'],
                 categories=['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America'], ordered=False, dtype='category')

In [66]:
drinks.continent.value_counts()['Asia']

44

In [67]:
drinks.continent.value_counts().sort_values()

South America    12
Oceania          16
North America    23
Asia             44
Europe           45
Africa           53
Name: continent, dtype: int64

In [68]:
drinks.continent.value_counts().sort_index()

Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64

####  Index for alignment

In [69]:
people = pd.Series([3000000, 85000], index = ['Albania', 'Andorra'], name = 'population')
people

Albania    3000000
Andorra      85000
Name: population, dtype: int64

In [70]:
drinks.beer_servings * people

Afghanistan            NaN
Albania        267000000.0
Algeria                NaN
Andorra         20825000.0
Angola                 NaN
                  ...     
Venezuela              NaN
Vietnam                NaN
Yemen                  NaN
Zambia                 NaN
Zimbabwe               NaN
Length: 193, dtype: float64

#### Concat two DataFrames

In [71]:
pd.concat([drinks, people], axis = 1).head()

Unnamed: 0,index,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,population
Afghanistan,0,0,0,0,0.0,Asia,
Albania,1,89,132,54,4.9,Europe,3000000.0
Algeria,2,25,0,14,0.7,Africa,
Andorra,3,245,138,312,12.4,Europe,85000.0
Angola,4,217,57,45,5.9,Africa,


In [166]:
drinks.head()

Unnamed: 0,country,index,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0,0.0,Asia
1,Albania,1,89,132,54,4.9,Europe
2,Algeria,2,25,0,14,0.7,Africa
3,Andorra,3,245,138,312,12.4,Europe
4,Angola,4,217,57,45,5.9,Africa


# Category Type

In [72]:
drinks.reset_index(inplace = True)
drinks.head()

Unnamed: 0,country,index,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0,0.0,Asia
1,Albania,1,89,132,54,4.9,Europe
2,Algeria,2,25,0,14,0.7,Africa
3,Andorra,3,245,138,312,12.4,Europe
4,Angola,4,217,57,45,5.9,Africa


In [73]:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   index                         193 non-null    int64  
 2   beer_servings                 193 non-null    int64  
 3   spirit_servings               193 non-null    int64  
 4   wine_servings                 193 non-null    int64  
 5   total_litres_of_pure_alcohol  193 non-null    float64
 6   continent                     193 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 10.7+ KB


In [74]:
drinks.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   index                         193 non-null    int64  
 2   beer_servings                 193 non-null    int64  
 3   spirit_servings               193 non-null    int64  
 4   wine_servings                 193 non-null    int64  
 5   total_litres_of_pure_alcohol  193 non-null    float64
 6   continent                     193 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 32.0 KB


In [75]:
drinks.memory_usage(deep = True)    # in bytes

Index                             128
country                         12588
index                            1544
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [76]:
drinks.memory_usage(deep = True).sum()

32768

In [77]:
sorted(drinks.continent.unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [78]:
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes

country                           object
index                              int64
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [79]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [80]:
drinks.memory_usage(deep = True)

Index                             128
country                         12588
index                            1544
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [81]:
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [82]:
drinks['country']= drinks.country.astype('category')
drinks.dtypes

country                         category
index                              int64
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [83]:
drinks.memory_usage(deep = True)    # country column use more memory. why? because there are still 193 countries

Index                             128
country                         18094
index                            1544
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

#### Sort accroding to category customed order

Example 1

In [84]:
df = pd.DataFrame({'ID': [100, 101, 102, 103], 'quality': ['good', 'very good', 'good', 'excellent']})
df.head()

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [85]:
pd.__version__

'1.0.3'

In [86]:
df.sort_values('quality')

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [87]:
# deprecated
# df.quality.astype('category', categories=['good', 'very good', 'excellent'], ordered = True)
# df.head()

In [88]:
# new way to create an ordered category
from pandas.api.types import CategoricalDtype
quality_cat = CategoricalDtype(['good', 'very good', 'excellent'], ordered = True)
df['quality'] = df.quality.astype(quality_cat)
df.quality

0         good
1    very good
2         good
3    excellent
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [89]:
df.sort_values('quality')

Unnamed: 0,ID,quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


In [90]:
df.loc[df.quality > 'good']

Unnamed: 0,ID,quality
1,101,very good
3,103,excellent


Example 2

In [168]:
s = pd.Series({'a': 1, 'b': 2, 'c': 3})
s

a    1
b    2
c    3
dtype: int64

In [169]:
order_custom = ['b', 'c', 'a']
order_custom

['b', 'c', 'a']

In [179]:
df = pd.DataFrame(s)
df['id'] = df.index
df.columns = ['val', 'id']
df = df.reset_index()
df.drop(['index'], axis = 1, inplace = True)
df = df.loc[:,['id', 'val']] # swap lists
df

Unnamed: 0,id,val
0,a,1
1,b,2
2,c,3


In [180]:
df['id'] = df['id'].astype('category')
df.dtypes

id     category
val       int64
dtype: object

In [181]:
df['id'].cat.reorder_categories(order_custom, inplace = True)
df.sort_values('id', inplace = True)
df

Unnamed: 0,id,val
1,b,2
2,c,3
0,a,1


##### Import Data from CSV

In [91]:
filepath = "./data/FAO_database.csv"
data = pd.read_csv(filepath, encoding = "ISO-8859-1")
type(data)

pandas.core.frame.DataFrame

In [92]:
# preview the data
data.head()


Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,AF,2,Afghanistan,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,...,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,AF,2,Afghanistan,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,...,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200


In [93]:
data.shape

(21477, 63)

In [94]:
data.ndim

2

In [95]:
# show columns data type
data.dtypes

Area Abbreviation     object
Area Code              int64
Area                  object
Item Code              int64
Item                  object
                      ...   
Y2009                float64
Y2010                float64
Y2011                float64
Y2012                  int64
Y2013                  int64
Length: 63, dtype: object

In [96]:
# change column 'Item Code' to str type
data['Item Code'].astype(str)

0        2511
1        2805
2        2513
3        2513
4        2514
         ... 
21472    2948
21473    2960
21474    2960
21475    2961
21476    2928
Name: Item Code, Length: 21477, dtype: object

In [97]:
# show summary information
data['Y2013'].describe()   # float64 summary information

count     21477.000000
mean        575.557480
std        6218.379479
min        -246.000000
25%           0.000000
50%           8.000000
75%          90.000000
max      489299.000000
Name: Y2013, dtype: float64

In [98]:
data['Area'].describe()   # obj summary information

count     21477
unique      174
top       Spain
freq        150
Name: Area, dtype: object

In [99]:
data.describe()   # return numeric columns' summary info

Unnamed: 0,Area Code,Item Code,Element Code,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
count,21477.0,21477.0,21477.0,21477.0,21477.0,17938.0,17938.0,17938.0,17938.0,17938.0,17938.0,17938.0,17938.0,17938.0,17938.0,...,20865.0,21128.0,21128.0,21128.0,21128.0,21128.0,21128.0,21373.0,21373.0,21373.0,21373.0,21373.0,21373.0,21477.0,21477.0
mean,125.449411,2694.211529,5211.687154,20.450613,15.794445,195.262069,200.78225,205.4646,209.925577,217.556751,225.988962,230.419222,238.415487,244.340952,250.262237,...,441.677019,451.771819,458.723826,465.458964,472.693298,486.690742,493.153256,496.319328,508.482104,522.844898,524.581996,535.492069,553.399242,560.569214,575.55748
std,72.868149,148.973406,146.820079,24.628336,66.012104,1864.124336,1884.265591,1861.174739,1862.000116,2014.934333,2100.228354,2132.244717,2189.166883,2266.964627,2322.967733,...,4340.53166,4649.579544,4751.597094,4868.625666,4911.216237,5001.782008,5100.057036,5134.819373,5298.939807,5496.697513,5545.939303,5721.089425,5883.071604,6047.950804,6218.379479
min,1.0,2511.0,5142.0,-40.9,-172.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-169.0,-246.0
25%,63.0,2561.0,5142.0,6.43,-11.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,120.0,2640.0,5142.0,20.59,19.15,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,...,5.0,5.0,6.0,6.0,6.0,6.0,6.0,7.0,7.0,7.0,7.0,7.0,8.0,8.0,8.0
75%,188.0,2782.0,5142.0,41.15,46.87,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0,30.0,...,65.0,67.0,68.0,70.0,71.0,75.0,77.0,78.0,80.0,82.0,83.0,83.0,86.0,88.0,90.0
max,276.0,2961.0,5521.0,64.96,179.41,112227.0,109130.0,106356.0,104234.0,119378.0,118495.0,118725.0,127512.0,134937.0,131871.0,...,255625.0,311110.0,327370.0,352172.0,354850.0,360767.0,373694.0,388100.0,402975.0,425537.0,434724.0,451838.0,462696.0,479028.0,489299.0


##### Select Columns
1. data.column_name
2. data['column_name']
3. data.iloc[:,<column_number>]
4. data.loc[:, ['comumn_name', ...]]

In [100]:
# data.column_name
data.Area.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: Area, dtype: object

In [101]:
# data['column_name']
data[['Area', 'Item']].head()

Unnamed: 0,Area,Item
0,Afghanistan,Wheat and products
1,Afghanistan,Rice (Milled Equivalent)
2,Afghanistan,Barley and products
3,Afghanistan,Barley and products
4,Afghanistan,Maize and products


In [102]:
# data['column_name'] get a Series, then can Series operation
ret = [data['Y2013'].sum(),data['Y2013'].mean(),data['Y2013'].median(),
      data['Y2013'].nunique(),data['Y2013'].max(),data['Y2013'].min(),]
ret

[12361248, 575.5574800949853, 8.0, 2107, 489299, -246]

In [103]:
# data.iloc[...]
data.iloc[:,[0,1,3,5]]

Unnamed: 0,Area Abbreviation,Area Code,Item Code,Element Code
0,AF,2,2511,5142
1,AF,2,2805,5142
2,AF,2,2513,5521
3,AF,2,2513,5142
4,AF,2,2514,5521
...,...,...,...,...
21472,ZW,181,2948,5142
21473,ZW,181,2960,5521
21474,ZW,181,2960,5142
21475,ZW,181,2961,5142


In [104]:
data.iloc[:,[1,2]].head()

Unnamed: 0,Area Code,Area
0,2,Afghanistan
1,2,Afghanistan
2,2,Afghanistan
3,2,Afghanistan
4,2,Afghanistan


##### Select Rows
1. data.iloc[0:10, :]
2. data.loc['index', :]
3. conditional selection

In [105]:
# data.iloc[,]
data.iloc[[1,2],:].head()

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360


In [106]:
# data.loc['index', ]
data.loc[2,:]        # select the row with index == 2

Area Abbreviation                     AF
Area Code                              2
Area                         Afghanistan
Item Code                           2513
Item                 Barley and products
                            ...         
Y2009                                379
Y2010                                315
Y2011                                203
Y2012                                367
Y2013                                360
Name: 2, Length: 63, dtype: object

In [107]:
# conditional selection
data[ data['Area'] == 'Ireland'].head()

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
9533,IE,104,Ireland,2511,Wheat and products,5521,Feed,1000 tonnes,53.41,-8.24,132.0,95.0,142.0,78.0,134.0,...,811.0,686.0,720.0,1029.0,872.0,968.0,976.0,902.0,685.0,1063.0,804.0,783.0,760.0,650,600
9534,IE,104,Ireland,2511,Wheat and products,5142,Food,1000 tonnes,53.41,-8.24,389.0,381.0,379.0,365.0,364.0,...,375.0,382.0,392.0,381.0,397.0,395.0,423.0,501.0,449.0,470.0,493.0,512.0,502.0,494,500
9535,IE,104,Ireland,2805,Rice (Milled Equivalent),5521,Feed,1000 tonnes,53.41,-8.24,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,3.0,5.0,4.0,3.0,3.0,3.0,4.0,5.0,4.0,4.0,4.0,4,4
9536,IE,104,Ireland,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,53.41,-8.24,2.0,3.0,2.0,2.0,3.0,...,11.0,9.0,10.0,8.0,8.0,11.0,6.0,6.0,9.0,14.0,15.0,16.0,14.0,14,14
9537,IE,104,Ireland,2513,Barley and products,5521,Feed,1000 tonnes,53.41,-8.24,322.0,393.0,339.0,400.0,429.0,...,851.0,840.0,830.0,679.0,864.0,993.0,908.0,1047.0,904.0,1242.0,1290.0,1283.0,1182.0,1146,1380


In [108]:
# the following two have the same results
data['latitude'] >= 0
data.loc[:, 'latitude'] >= 0

0         True
1         True
2         True
3         True
4         True
         ...  
21472    False
21473    False
21474    False
21475    False
21476    False
Name: latitude, Length: 21477, dtype: bool

In [109]:
# same results
data[data['latitude'] >= 0]
data[data.loc[:,'latitude'] >= 0]

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,AF,2,Afghanistan,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,...,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,AF,2,Afghanistan,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,...,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21231,YE,249,Yemen,2948,Milk - Excluding Butter,5142,Food,1000 tonnes,15.55,48.52,133.0,138.0,141.0,149.0,160.0,...,527.0,589.0,585.0,595.0,746.0,726.0,760.0,760.0,849.0,812.0,984.0,738.0,854.0,1050,1073
21232,YE,249,Yemen,2960,"Fish, Seafood",5521,Feed,1000 tonnes,15.55,48.52,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
21233,YE,249,Yemen,2960,"Fish, Seafood",5142,Food,1000 tonnes,15.55,48.52,13.0,12.0,13.0,17.0,15.0,...,98.0,86.0,91.0,118.0,160.0,181.0,149.0,135.0,80.0,59.0,78.0,69.0,60.0,60,60
21234,YE,249,Yemen,2961,"Aquatic Products, Other",5142,Food,1000 tonnes,15.55,48.52,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [110]:
data.loc[data['latitude'] >= 0, 'Area':]

Unnamed: 0,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,1808.0,2053.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,195.0,231.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,75.0,71.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,Afghanistan,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,237.0,225.0,...,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,Afghanistan,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,216.0,235.0,...,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21231,Yemen,2948,Milk - Excluding Butter,5142,Food,1000 tonnes,15.55,48.52,133.0,138.0,141.0,149.0,160.0,155.0,169.0,...,527.0,589.0,585.0,595.0,746.0,726.0,760.0,760.0,849.0,812.0,984.0,738.0,854.0,1050,1073
21232,Yemen,2960,"Fish, Seafood",5521,Feed,1000 tonnes,15.55,48.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
21233,Yemen,2960,"Fish, Seafood",5142,Food,1000 tonnes,15.55,48.52,13.0,12.0,13.0,17.0,15.0,14.0,16.0,...,98.0,86.0,91.0,118.0,160.0,181.0,149.0,135.0,80.0,59.0,78.0,69.0,60.0,60,60
21234,Yemen,2961,"Aquatic Products, Other",5142,Food,1000 tonnes,15.55,48.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


##### Iterate DataFrame

In [111]:
for index, row in data.iterrows():
    print(row[0])

AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AF
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
AL
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
DZ
AO
AO
AO
A

CF
CF
CF
CF
CF
CF
CF
CF
CF
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
TD
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
CL
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
HK
H

DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
DE
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GH
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
GR
G

LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LT
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
LU
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MG
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
MW
M

PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
PT
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
KR
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
MD
M

TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TR
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
TM
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UG
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
UA
U

##### Delete columns

In [112]:
data1 = data.drop("Area", axis = 1)    # axis = 1 for column
data1.head()

Unnamed: 0,Area Abbreviation,Area Code,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,1808.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AF,2,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,195.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AF,2,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,75.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,AF,2,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,237.0,...,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,AF,2,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,216.0,...,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200


In [113]:
data2 = data.drop(columns = ['Area', "Area Code"])
data2.head()

Unnamed: 0,Area Abbreviation,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,1808.0,2053.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AF,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,195.0,231.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AF,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,75.0,71.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,AF,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,237.0,225.0,...,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,AF,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,216.0,235.0,...,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200


In [114]:
# data.drop("Area", axis = 1, inplace = True)
# original 'data' object is *CHANGED* when inplace = True

##### Delete Rows

In [115]:
# drop() delete rows based on *INDEX NAME* , not numeric index
data.head(3)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AF,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360


In [116]:
data3 = data.drop([0,1], axis = 0)   # 0 and 1 are "INDEX NAME"
data3.head(3)

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
2,AF,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,AF,2,Afghanistan,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,...,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,AF,2,Afghanistan,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,...,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200


In [117]:
# inplace = True, axis = 0 also delete from 'original' data object

##### Rename Columns Name

In [118]:
data4 = data.rename(columns = {
    "Area": "place_name",
    "Y2013": "year_2013"
}, inplace = False)
data4.head(3)

Unnamed: 0,Area Abbreviation,Area Code,place_name,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,...,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,year_2013
0,AF,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AF,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,...,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360


In [119]:
# change columns name to lowercase and replace whitespace with '_'
data4 = data.rename(columns = lambda x: x.lower().replace(' ','_'))
data4.head(2)

Unnamed: 0,area_abbreviation,area_code,area,item_code,item,element_code,element,unit,latitude,longitude,y1961,y1962,y1963,y1964,y1965,...,y1999,y2000,y2001,y2002,y2003,y2004,y2005,y2006,y2007,y2008,y2009,y2010,y2011,y2012,y2013
0,AF,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,...,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AF,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,...,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422


In [120]:
# don't want row index name in output file, hence index = False
data4.to_csv("./data/output_FAO_database.csv", index = False, encoding = 'utf8')


In [121]:
# to debug 
# import xlsxwirter
# data4.to_csv("./data/output_FAO_database.xlsx", sheet_name = "Sheet 1", index = False)


In [122]:
ufo = pd.read_csv('./data/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [123]:
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [124]:
ufo.rename(columns = {'Colors Reported': 'Colors_Reported', 'Shape Reported': 'Shape_Reported'}, inplace = True)
ufo.head()

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [125]:
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols

In [126]:
ufo.head()

Unnamed: 0,city,colors reported,shape reported,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [127]:
ufo = pd.read_csv('./data/uforeports', names = ufo_cols, header = 0)
ufo.head()

Unnamed: 0,city,colors reported,shape reported,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [128]:
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns

Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')

##### Select a Pandas Series from DataFrame

In [129]:
ufo.head()

Unnamed: 0,city,colors_reported,shape_reported,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [130]:
type(ufo['city'])

pandas.core.series.Series

In [131]:
ufo.city + " " +ufo.state

0                      Ithaca NY
1                 Willingboro NJ
2                     Holyoke CO
3                     Abilene KS
4        New York Worlds Fair NY
                  ...           
18236              Grant Park IL
18237             Spirit Lake IA
18238             Eagle River WI
18239             Eagle River WI
18240                    Ybor FL
Length: 18241, dtype: object

In [132]:
ufo['location'] = ufo.city + ', ' + ufo.state
ufo.head()

Unnamed: 0,city,colors_reported,shape_reported,state,time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca, NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro, NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"


#### Drop rows and columns

In [133]:
# old way to drop rows: specify labels and axis
ufo.drop([0,1], axis = 0).head()

Unnamed: 0,city,colors_reported,shape_reported,state,time,location
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"
5,Valley City,,DISK,ND,9/15/1934 15:30,"Valley City, ND"
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00,"Crater Lake, CA"


In [134]:
# or
ufo.drop([0,1], axis = 'index').head()

Unnamed: 0,city,colors_reported,shape_reported,state,time,location
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"
5,Valley City,,DISK,ND,9/15/1934 15:30,"Valley City, ND"
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00,"Crater Lake, CA"


In [135]:
# new way to drop rows: specify index
ufo.drop(index= [0, 1]).head()

Unnamed: 0,city,colors_reported,shape_reported,state,time,location
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"
5,Valley City,,DISK,ND,9/15/1934 15:30,"Valley City, ND"
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00,"Crater Lake, CA"


In [136]:
# old way to drop columns: specify labels and axis
ufo.drop(['city', 'state'], axis = 1)


Unnamed: 0,colors_reported,shape_reported,time,location
0,,TRIANGLE,6/1/1930 22:00,"Ithaca, NY"
1,,OTHER,6/30/1930 20:00,"Willingboro, NJ"
2,,OVAL,2/15/1931 14:00,"Holyoke, CO"
3,,DISK,6/1/1931 13:00,"Abilene, KS"
4,,LIGHT,4/18/1933 19:00,"New York Worlds Fair, NY"
...,...,...,...,...
18236,,TRIANGLE,12/31/2000 23:00,"Grant Park, IL"
18237,,DISK,12/31/2000 23:00,"Spirit Lake, IA"
18238,,,12/31/2000 23:45,"Eagle River, WI"
18239,RED,LIGHT,12/31/2000 23:45,"Eagle River, WI"


In [137]:
# or
ufo.drop(['city', 'state'], axis = "columns")

Unnamed: 0,colors_reported,shape_reported,time,location
0,,TRIANGLE,6/1/1930 22:00,"Ithaca, NY"
1,,OTHER,6/30/1930 20:00,"Willingboro, NJ"
2,,OVAL,2/15/1931 14:00,"Holyoke, CO"
3,,DISK,6/1/1931 13:00,"Abilene, KS"
4,,LIGHT,4/18/1933 19:00,"New York Worlds Fair, NY"
...,...,...,...,...
18236,,TRIANGLE,12/31/2000 23:00,"Grant Park, IL"
18237,,DISK,12/31/2000 23:00,"Spirit Lake, IA"
18238,,,12/31/2000 23:45,"Eagle River, WI"
18239,RED,LIGHT,12/31/2000 23:45,"Eagle River, WI"


In [138]:
# new way to drop columns: specify columns
ufo.drop(columns = ['city', 'state']).head()

Unnamed: 0,colors_reported,shape_reported,time,location
0,,TRIANGLE,6/1/1930 22:00,"Ithaca, NY"
1,,OTHER,6/30/1930 20:00,"Willingboro, NJ"
2,,OVAL,2/15/1931 14:00,"Holyoke, CO"
3,,DISK,6/1/1931 13:00,"Abilene, KS"
4,,LIGHT,4/18/1933 19:00,"New York Worlds Fair, NY"


#### rename columns

In [139]:
# old way to rename columns: specify columns
ufo.rename(columns = {'city':'CITY', 'state':'STATE'}).head()

Unnamed: 0,CITY,colors_reported,shape_reported,STATE,time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca, NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro, NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"


In [140]:
# new way to rename columns: specify mapper and axis
ufo.rename({'city':'CITY', 'state':'STATE'}, axis = "columns").head()

Unnamed: 0,CITY,colors_reported,shape_reported,STATE,time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca, NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro, NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"


In [141]:
# mapper can be a function
ufo.rename(str.upper, axis = "columns").head()

Unnamed: 0,CITY,COLORS_REPORTED,SHAPE_REPORTED,STATE,TIME,LOCATION
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca, NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro, NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"


##### pandas methods and attributes 

In [142]:
movies = pd.read_csv('./data/imdbratings')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [143]:
movies.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [144]:
movies.shape

(979, 6)

In [145]:
movies.dtypes

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

In [146]:
movies.describe(include = ['object'])

Unnamed: 0,title,content_rating,genre,actors_list
count,979,976,979,979
unique,975,12,16,969
top,Les Miserables,R,Drama,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,2,460,278,6


##### Summary Functions

In [147]:
import pandas as pd
pd.set_option('max_rows', 5)
import numpy as np
reviews = pd.read_csv("./data/winemag-data-130k-v2.csv", index_col = 0)
reviews.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [148]:
reviews.price.dtype     # a specific column dtype
reviews.dtypes          # whole df dtypes

country        object
description    object
                ...  
variety        object
winery         object
Length: 13, dtype: object

In [149]:
# numerical data type describe
reviews.points.describe()

count    129971.000000
mean         88.447138
             ...      
75%          91.000000
max         100.000000
Name: points, Length: 8, dtype: float64

In [150]:
# string data type describe
reviews.taster_name.describe()

count         103727
unique            19
top       Roger Voss
freq           25514
Name: taster_name, dtype: object

In [151]:
reviews.info()   # of entries, columns, non-null, memory, columns dtype, etc

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129971 entries, 0 to 129970
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   country                129908 non-null  object 
 1   description            129971 non-null  object 
 2   designation            92506 non-null   object 
 3   points                 129971 non-null  int64  
 4   price                  120975 non-null  float64
 5   province               129908 non-null  object 
 6   region_1               108724 non-null  object 
 7   region_2               50511 non-null   object 
 8   taster_name            103727 non-null  object 
 9   taster_twitter_handle  98758 non-null   object 
 10  title                  129971 non-null  object 
 11  variety                129970 non-null  object 
 12  winery                 129971 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 13.9+ MB


In [152]:
# to see mean of points
reviews.points.mean()

88.44713820775404

In [153]:
# to see a list of unique string values. Returns a ndarray dtype
reviews.taster_name.unique()
#len(reviews.taster_name.unique())


array(['Kerin O’Keefe', 'Roger Voss', 'Paul Gregutt',
       'Alexander Peartree', 'Michael Schachner', 'Anna Lee C. Iijima',
       'Virginie Boone', 'Matt Kettmann', nan, 'Sean P. Sullivan',
       'Jim Gordon', 'Joe Czerwinski', 'Anne Krebiehl\xa0MW',
       'Lauren Buzzeo', 'Mike DeSimone', 'Jeff Jenssen',
       'Susan Kostrzewa', 'Carrie Dykes', 'Fiona Adams',
       'Christina Pickard'], dtype=object)

In [154]:
# to see a list of unique values and how often they occur in the dataset
reviews.taster_name.value_counts()

Roger Voss           25514
Michael Schachner    15134
                     ...  
Fiona Adams             27
Christina Pickard        6
Name: taster_name, Length: 19, dtype: int64

##### map

In [155]:
# demean the points
reviews_points_mean = reviews.points.mean()
reviews.points.map(lambda p: p - reviews_points_mean)  #not inplace


0        -1.447138
1        -1.447138
            ...   
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

In [156]:
#  to debug
def remain_points(row):
    row.points = row.points - reviews_points_mean
    return row

reviews.apply(remain_points, axis = "columns")
reviews

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss
129970,France,"Big, rich and off-dry, this is powered by inte...",Lieu-dit Harth Cuvée Caroline,90,21.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car...,Gewürztraminer,Domaine Schoffit


In [157]:
reviews_points_mean = reviews.points.mean()
reviews2 = reviews.points - reviews_points_mean
reviews2

0        -1.447138
1        -1.447138
            ...   
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

In [158]:
reviews.country + '-' + reviews.region_1

0            Italy-Etna
1                   NaN
              ...      
129969    France-Alsace
129970    France-Alsace
Length: 129971, dtype: object

##### Groupby

In [159]:
# group by 'points' and count the occurance of each point
reviews.groupby("points").points.count()

points
80     397
81     692
      ... 
99      33
100     19
Name: points, Length: 21, dtype: int64

In [160]:
# to get the cheapest wine in each point value category
reviews.groupby("points").price.min()

points
80      5.0
81      5.0
       ... 
99     44.0
100    80.0
Name: price, Length: 21, dtype: float64

In [161]:
# to select the name of the first wine reivewed from each winery in the dataset
reviews.groupby("winery").apply(lambda df: df.title.iloc[0])

winery
1+1=3                          1+1=3 NV Rosé Sparkling (Cava)
10 Knots                 10 Knots 2010 Viognier (Paso Robles)
                                  ...                        
àMaurice    àMaurice 2013 Fred Estate Syrah (Walla Walla V...
Štoka                         Štoka 2009 Izbrani Teran (Kras)
Length: 16757, dtype: object

In [162]:
reviews.points.idxmin()

344

In [163]:
# to pick out the best wine by country and province
# pd.Series.idxmax() returns the index of a series's max value
reivews.groupby(["country", "province"]).apply(lambda df: df.loc[df.points.idxmax()])

NameError: name 'reivews' is not defined

In [None]:
# agg() lets you run a bunch of different functions on df simultaneously
reviews.groupby("country").price.agg([len, min, max])

##### multi-index

In [None]:
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len])
countries_reviewed

In [None]:
# to convert back to a regular index
countries_reviewed.reset_index()

In [None]:
countries_reviewed.sort_values(by = 'len', ascending = False)

In [None]:
countries_reviewed = countries_reviewed.reset_index()
countries_reviewed.sort_index()

##### Missing Data

In [None]:
# pd.isnull() and pd.notnull()
reviews[pd.isnull(reviews.country)]

In [None]:
# pd.fillna() to fill NaN 
reviews.region_2.fillna("Unknown")

In [None]:
# pd.replace()
reviews.taster_twitter_handle.replace("@kerinokeefe","@kerino")

In [None]:
melb_data= pd.read_csv("./data/melb_data.csv")
melb_data.head()

In [None]:
# to get the names of columns with missing value
cols_with_missing = [ col for col in melb_data.columns
                    if melb_data[col].isnull().any()]
cols_with_missing

##### 1. Drop columns with missing value

In [None]:
reduced_melb_data = melb_data.drop(cols_with_missing, axis = 1)
reduced_melb_data.shape

##### 2. use SimpleImputer to replace missing values with the mean value along each column

In [None]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()  # default strategy =mean()

melb_predictors = melb_data.drop('Price', axis = 1)      # predictors columns remain
X = melb_predictors.select_dtypes(exclude = 'object')    # only numeric columns remain

# Imputation
imputed_melb_data = pd.DataFrame(my_imputer.fit_transform(X))

# imputation removed column names, put them back
imputed_melb_data.columns = X.columns
imputed_melb_data.head()


##### 3. Impute the missing values, while also keeping track of which values were imputed

In [None]:
# make copy to avoid changing original data (when imputing)
X_plus = X.copy()

cols_with_missing = [ col for col in X.columns
                    if X[col].isnull().any()]

# make new columns indicating what will be imputed
for col in cols_with_missing:
    X_plus[col + '_was_missing'] = X_plus[col].isnull()
    
# Imputation
my_imputer = SimpleImputer()
imputed_X_plus = pd.DataFrame(my_imputer.fit_transform(X_plus))

# Put column names back
imputed_X_plus.columns = X_plus.columns
imputed_X_plus

##### Visualization

In [None]:
import matplotlib.pyplot as plt

data['latitude'].plot(kind = 'hist', bins = 100)
plt.xlabel('Latitude Value')
plt.show()

In [None]:
plot_data = data[data['Element'] == 'Food']
plot_data = plot_data.groupby('Area')['Y2013'].sum()
plot_data.sort_values()[-10:].plot(kind = 'bar')
plt.title("Top Ten Food Producers")
plt.ylabel("Food produced(tonnes)")
plt.show()

# How to handle missing data

In [None]:
ufo = pd.read_csv('./data/uforeports')
ufo.head()

In [None]:
ufo.isnull().tail()

In [None]:
ufo.notnull().tail()

In [None]:
ufo.isnull().sum()  # default axis = 0, column sum

In [None]:
ufo.isnull().sum(axis = 0)

In [None]:
pd.Series([True, False, True]).sum()

In [None]:
ufo[ufo.City.isnull()]    # only show the rows with null City

## Solution1: drop null value

In [None]:
ufo.shape

In [None]:
# Drop a row in which any of its columns has missing value.
ufo.dropna(how = 'any').shape    

In [None]:
# Drop a row in which all of its columns are missing.
ufo.dropna(how = 'all').shape

In [None]:
# drop a row in which either City or 'Shape Reported' column is missing.
ufo.dropna(subset = ['City', 'Shape Reported'], how = 'any').shape

In [None]:
ufo.dropna(subset = ['City', 'Shape Reported'], how = 'all').shape

In [None]:
ufo['Shape Reported'].value_counts()  

In [None]:
ufo['Shape Reported'].value_counts(dropna= False)

## Solution 2: fill NaN

In [None]:
ufo['Shape Reported'].fillna(value = "VARIOUS", inplace = True)

In [None]:
ufo.isnull().sum()