##### Pandas DataFrame //
Create DataFrame *Manually*

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)

In [2]:
# Create a DataFrame through a two-dimensional array
df1 = pd.DataFrame([
    ['joe', 'san', 'andy'], 
    [70, 80, 90],
])
df1

Unnamed: 0,0,1,2
0,joe,san,andy
1,70,80,90


In [3]:
# Create a DataFrame with index
df2 = pd.DataFrame([
    ['joe', 'san', 'andy'], 
    [70, 80, 90],
], index = ['first', 'second'],     # index parameter for ROW index
    columns =['a', 'b', 'c'] )      # columns parameter for COLUMN index

df2

Unnamed: 0,a,b,c
first,joe,san,andy
second,70,80,90


In [4]:
df2.index

Index(['first', 'second'], dtype='object')

In [5]:
df2.columns

Index(['a', 'b', 'c'], dtype='object')

In [6]:
df2.values

array([['joe', 'san', 'andy'],
       [70, 80, 90]], dtype=object)

In [7]:
# Create a DataFrame through a dictionary
df3 = pd.DataFrame({
    'name': ['joe', 'san', 'andy'],     # 'key' becomes COLUMN index
    'sex': ['male', 'male', 'female'],
    'age':[10, 20, 30],
    'id': 3,
}, index = ['first', 'second', 'third']) # "index" parameter defines ROW index
df3

Unnamed: 0,name,sex,age,id
first,joe,male,10,3
second,san,male,20,3
third,andy,female,30,3


In [8]:
df3.index

Index(['first', 'second', 'third'], dtype='object')

In [9]:
df3.columns

Index(['name', 'sex', 'age', 'id'], dtype='object')

In [10]:
df3.values

array([['joe', 'male', 10, 3],
       ['san', 'male', 20, 3],
       ['andy', 'female', 30, 3]], dtype=object)

In [11]:
df3['name']

first      joe
second     san
third     andy
Name: name, dtype: object

get data and crud operations

In [12]:
# append a column
df3['city'] = ['beijing', 'shanghai', 'chengdu']
df3

Unnamed: 0,name,sex,age,id,city
first,joe,male,10,3,beijing
second,san,male,20,3,shanghai
third,andy,female,30,3,chengdu


In [13]:
# remove a column
df3.pop('city')

first      beijing
second    shanghai
third      chengdu
Name: city, dtype: object

In [14]:
df3


Unnamed: 0,name,sex,age,id
first,joe,male,10,3
second,san,male,20,3
third,andy,female,30,3


In [15]:
df3['age'] = [18,19,20]
df3

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3


In [16]:
df3.loc['second']

name     san
sex     male
age       19
id         3
Name: second, dtype: object

In [17]:
df3.loc['second',['age', 'sex']]

age      19
sex    male
Name: second, dtype: object

In [18]:
df3.loc['fourth'] = ['peter', 'female', 22, 4]  # insert a row

In [19]:
df3

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3
fourth,peter,female,22,4


In [20]:
df3.drop('fourth')   # remove a row

Unnamed: 0,name,sex,age,id
first,joe,male,18,3
second,san,male,19,3
third,andy,female,20,3


In [21]:
weather_data = {
    'day': ['1/1/2020', '1/2/2020', '1/3/2020', '1/4/2020', '1/5/2020', '1/6/2020'],
    'temperature': [32, 35, 28, 24, 32, 31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow', 'Snow', 'Rain', 'Windy']
}
df4 = pd.DataFrame(weather_data)
df4.head(6)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2020,32,6,Rain
1,1/2/2020,35,7,Sunny
2,1/3/2020,28,2,Snow
3,1/4/2020,24,7,Snow
4,1/5/2020,32,4,Rain
5,1/6/2020,31,2,Windy


In [22]:
df4.shape

(6, 4)

In [23]:
df4.tail()

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2020,35,7,Sunny
2,1/3/2020,28,2,Snow
3,1/4/2020,24,7,Snow
4,1/5/2020,32,4,Rain
5,1/6/2020,31,2,Windy


In [24]:
df4.iloc[-1, :]

day            1/6/2020
temperature          31
windspeed             2
event             Windy
Name: 5, dtype: object

In [25]:
df4.columns

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [26]:
df4.event

0     Rain
1    Sunny
2     Snow
3     Snow
4     Rain
5    Windy
Name: event, dtype: object

In [27]:
df4.dtypes

day            object
temperature     int64
windspeed       int64
event          object
dtype: object

In [28]:
df4 = df4[['day', 'event', 'windspeed', 'temperature']]
df4.head()

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32


In [29]:
print(df4.describe())

       windspeed  temperature
count   6.000000     6.000000
mean    4.666667    30.333333
std     2.338090     3.829708
min     2.000000    24.000000
25%     2.500000    28.750000
50%     5.000000    31.500000
75%     6.750000    32.000000
max     7.000000    35.000000


#### Pandas Select

In [30]:
df4.loc[df4.temperature == df4.temperature.max(), :]

Unnamed: 0,day,event,windspeed,temperature
1,1/2/2020,Sunny,7,35


In [31]:
df4[df4.temperature >= 32]

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
4,1/5/2020,Rain,4,32


In [32]:
df4[df4.temperature == df4.temperature.max()]

Unnamed: 0,day,event,windspeed,temperature
1,1/2/2020,Sunny,7,35


In [33]:
df4.loc[df4['temperature'] == df4['temperature'].max(), ['day', 'windspeed', 'event']]

Unnamed: 0,day,windspeed,event
1,1/2/2020,7,Sunny


# Index

In [34]:
data = pd.Series(['a','b', 'c'], index = [1,3,5])
print(data[5])      #explicit index, use loc instead
print(data[1:3])    #implicit index, use iloc instead


c
3    b
5    c
dtype: object


In [35]:
df4

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32
5,1/6/2020,Windy,2,31


In [36]:
df4.set_index('day', inplace = True)
df4

Unnamed: 0_level_0,event,windspeed,temperature
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2020,Rain,6,32
1/2/2020,Sunny,7,35
1/3/2020,Snow,2,28
1/4/2020,Snow,7,24
1/5/2020,Rain,4,32
1/6/2020,Windy,2,31


In [37]:
df4.loc['1/4/2020', :]

event          Snow
windspeed         7
temperature      24
Name: 1/4/2020, dtype: object

In [38]:
df4.reset_index(inplace = True)
df4

Unnamed: 0,day,event,windspeed,temperature
0,1/1/2020,Rain,6,32
1,1/2/2020,Sunny,7,35
2,1/3/2020,Snow,2,28
3,1/4/2020,Snow,7,24
4,1/5/2020,Rain,4,32
5,1/6/2020,Windy,2,31


In [39]:
df4.set_index('event', inplace = True)
df4

Unnamed: 0_level_0,day,windspeed,temperature
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2020,6,32
Sunny,1/2/2020,7,35
Snow,1/3/2020,2,28
Snow,1/4/2020,7,24
Rain,1/5/2020,4,32
Windy,1/6/2020,2,31


In [40]:
df4.loc['Snow', :]

Unnamed: 0_level_0,day,windspeed,temperature
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Snow,1/3/2020,2,28
Snow,1/4/2020,7,24


In [41]:
df4.reset_index(inplace = True)
df4

Unnamed: 0,event,day,windspeed,temperature
0,Rain,1/1/2020,6,32
1,Sunny,1/2/2020,7,35
2,Snow,1/3/2020,2,28
3,Snow,1/4/2020,7,24
4,Rain,1/5/2020,4,32
5,Windy,1/6/2020,2,31


In [42]:
drinks = pd.read_csv('./data/drinksbycountry')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [43]:
drinks.index

RangeIndex(start=0, stop=193, step=1)

In [44]:
drinks.columns

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'continent'],
      dtype='object')

## Index for identification

In [45]:
drinks[drinks.continent == "South America"]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
6,Argentina,193,25,221,8.3,South America
20,Bolivia,167,41,8,3.8,South America
23,Brazil,245,145,16,7.2,South America
35,Chile,130,124,172,7.6,South America
37,Colombia,159,76,3,4.2,South America
52,Ecuador,162,74,3,4.2,South America
72,Guyana,93,302,1,7.1,South America
132,Paraguay,213,117,74,7.3,South America
133,Peru,163,160,21,6.1,South America
163,Suriname,128,178,7,5.6,South America


## Index for selection

In [46]:
drinks.loc[20, 'beer_servings']

167

In [47]:
drinks.set_index('country', inplace = True)
drinks.head()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [48]:
drinks.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',
       'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='country', length=193)

In [49]:
drinks.loc['Brazil', 'beer_servings']

245

In [50]:
drinks.index.name = None
drinks.head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
Afghanistan,0,0,0,0.0,Asia
Albania,89,132,54,4.9,Europe
Algeria,25,0,14,0.7,Africa
Andorra,245,138,312,12.4,Europe
Angola,217,57,45,5.9,Africa


In [51]:
drinks.index.name = 'country'
drinks.reset_index(inplace = True)
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [52]:
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [53]:
drinks.describe().index

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')

In [54]:
drinks.describe().loc['25%', 'beer_servings']

20.0

In [55]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

In [56]:
drinks.reset_index(inplace= True)
drinks.head()

Unnamed: 0,index,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,0,Afghanistan,0,0,0,0.0,Asia
1,1,Albania,89,132,54,4.9,Europe
2,2,Algeria,25,0,14,0.7,Africa
3,3,Andorra,245,138,312,12.4,Europe
4,4,Angola,217,57,45,5.9,Africa


In [57]:
drinks.set_index('country', inplace = True)
drinks.head()

Unnamed: 0_level_0,index,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,0,0,0,0,0.0,Asia
Albania,1,89,132,54,4.9,Europe
Algeria,2,25,0,14,0.7,Africa
Andorra,3,245,138,312,12.4,Europe
Angola,4,217,57,45,5.9,Africa


In [58]:
drinks.continent.head()

country
Afghanistan      Asia
Albania        Europe
Algeria        Africa
Andorra        Europe
Angola         Africa
Name: continent, dtype: object

In [59]:
drinks.continent.value_counts()

Africa           53
Europe           45
Asia             44
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64

In [60]:
drinks.continent.value_counts().values

array([53, 45, 44, 23, 16, 12])

In [61]:
drinks.continent.value_counts()['Asia']

44

In [62]:
drinks.continent.value_counts().sort_values()

South America    12
Oceania          16
North America    23
Asia             44
Europe           45
Africa           53
Name: continent, dtype: int64

In [63]:
drinks.continent.value_counts().sort_index()

Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64

## Index for alignment

In [64]:
people = pd.Series([3000000, 85000], index = ['Albania', 'Andorra'], name = 'population')
people

Albania    3000000
Andorra      85000
Name: population, dtype: int64

In [65]:
drinks.beer_servings * people

Afghanistan            NaN
Albania        267000000.0
Algeria                NaN
Andorra         20825000.0
Angola                 NaN
                  ...     
Venezuela              NaN
Vietnam                NaN
Yemen                  NaN
Zambia                 NaN
Zimbabwe               NaN
Length: 193, dtype: float64

In [66]:
pd.concat([drinks, people], axis = 1).head()

Unnamed: 0,index,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,population
Afghanistan,0,0,0,0,0.0,Asia,
Albania,1,89,132,54,4.9,Europe,3000000.0
Algeria,2,25,0,14,0.7,Africa,
Andorra,3,245,138,312,12.4,Europe,85000.0
Angola,4,217,57,45,5.9,Africa,


# Category Type

In [67]:
drinks.reset_index(inplace = True)
drinks.head()

Unnamed: 0,country,index,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0,0.0,Asia
1,Albania,1,89,132,54,4.9,Europe
2,Algeria,2,25,0,14,0.7,Africa
3,Andorra,3,245,138,312,12.4,Europe
4,Angola,4,217,57,45,5.9,Africa


In [68]:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   index                         193 non-null    int64  
 2   beer_servings                 193 non-null    int64  
 3   spirit_servings               193 non-null    int64  
 4   wine_servings                 193 non-null    int64  
 5   total_litres_of_pure_alcohol  193 non-null    float64
 6   continent                     193 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 10.7+ KB


In [69]:
drinks.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   index                         193 non-null    int64  
 2   beer_servings                 193 non-null    int64  
 3   spirit_servings               193 non-null    int64  
 4   wine_servings                 193 non-null    int64  
 5   total_litres_of_pure_alcohol  193 non-null    float64
 6   continent                     193 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 32.0 KB


In [70]:
drinks.memory_usage(deep = True)    # in bytes

Index                             128
country                         12588
index                            1544
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [71]:
drinks.memory_usage(deep = True).sum()

32768

In [72]:
sorted(drinks.continent.unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [73]:
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes

country                           object
index                              int64
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [74]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [75]:
drinks.memory_usage(deep = True)

Index                             128
country                         12588
index                            1544
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [76]:
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [77]:
drinks['country']= drinks.country.astype('category')
drinks.dtypes

country                         category
index                              int64
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [78]:
drinks.memory_usage(deep = True)    # country column use more memory. why? because there are still 193 countries

Index                             128
country                         18094
index                            1544
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [79]:
df = pd.DataFrame({'ID': [100, 101, 102, 103], 'quality': ['good', 'very good', 'good', 'excellent']})
df.head()

Unnamed: 0,ID,quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [80]:
pd.__version__

'1.0.3'

In [83]:
df.sort_values('quality')

Unnamed: 0,ID,quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [84]:
df['quality'] = df.quality.astype('category', categories=['good', 'very good', 'excellent'], ordered = True)
df.head()

TypeError: astype() got an unexpected keyword argument 'categories'

In [None]:
df.sort_values('quality')

In [None]:
df.loc[df.quality > 'good']

##### Import Data from CSV

In [None]:
filepath = "./data/FAO_database.csv"
data = pd.read_csv(filepath, encoding = "ISO-8859-1")
type(data)

In [None]:
# preview the data
data.head()


In [None]:
data.shape

In [None]:
data.ndim

In [None]:
# show columns data type
data.dtypes

In [None]:
# change column 'Item Code' to str type
data['Item Code'].astype(str)

In [None]:
# show summary information
data['Y2013'].describe()   # float64 summary information

In [None]:
data['Area'].describe()   # obj summary information

In [None]:
data.describe()   # return numeric columns' summary info

##### Select Columns
1. data.column_name
2. data['column_name']
3. data.iloc[:,<column_number>]
4. data.loc[:, ['comumn_name', ...]]

In [None]:
# data.column_name
data.Area.head()

In [None]:
# data['column_name']
data[['Area', 'Item']].head()

In [None]:
# data['column_name'] get a Series, then can Series operation
ret = [data['Y2013'].sum(),data['Y2013'].mean(),data['Y2013'].median(),
      data['Y2013'].nunique(),data['Y2013'].max(),data['Y2013'].min(),]
ret

In [None]:
# data.iloc[...]
data.iloc[:,[0,1,3,5]]

In [None]:
data.iloc[:,[1,2]].head()

##### Select Rows
1. data.iloc[0:10, :]
2. data.loc['index', :]
3. conditional selection

In [None]:
# data.iloc[,]
data.iloc[[1,2],:].head()

In [None]:
# data.loc['index', ]
data.loc[2,:]        # select the row with index == 2

In [None]:
# conditional selection
data[ data['Area'] == 'Ireland'].head()

In [None]:
# the following two have the same results
data['latitude'] >= 0
data.loc[:, 'latitude'] >= 0

In [None]:
# same results
data[data['latitude'] >= 0]
data[data.loc[:,'latitude'] >= 0]

In [None]:
data.loc[data['latitude'] >= 0, 'Area':]

##### Iterate DataFrame

In [None]:
for index, row in data.iterrows():
    print(row[0])

##### Delete columns

In [None]:
data1 = data.drop("Area", axis = 1)    # axis = 1 for column
data1.head()

In [None]:
data2 = data.drop(columns = ['Area', "Area Code"])
data2.head()

In [None]:
# data.drop("Area", axis = 1, inplace = True)
# original 'data' object is *CHANGED* when inplace = True

##### Delete Rows

In [None]:
# drop() delete rows based on *INDEX NAME* , not numeric index
data.head(3)

In [None]:
data3 = data.drop([0,1], axis = 0)   # 0 and 1 are "INDEX NAME"
data3.head(3)

In [None]:
# inplace = True, axis = 0 also delete from 'original' data object

##### Rename Columns Name

In [None]:
data4 = data.rename(columns = {
    "Area": "place_name",
    "Y2013": "year_2013"
}, inplace = False)
data4.head(3)

In [None]:
# change columns name to lowercase and replace whitespace with '_'
data4 = data.rename(columns = lambda x: x.lower().replace(' ','_'))
data4.head(2)

In [None]:
# don't want row index name in output file, hence index = False
data4.to_csv("./data/output_FAO_database.csv", index = False, encoding = 'utf8')


In [None]:
# to debug 
# import xlsxwirter
# data4.to_csv("./data/output_FAO_database.xlsx", sheet_name = "Sheet 1", index = False)


In [None]:
ufo = pd.read_csv('./data/uforeports')
ufo.head()

In [None]:
ufo.columns

In [None]:
ufo.rename(columns = {'Colors Reported': 'Colors_Reported', 'Shape Reported': 'Shape_Reported'}, inplace = True)
ufo.head()

In [None]:
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols

In [None]:
ufo.head()

In [None]:
ufo = pd.read_csv('./data/uforeports', names = ufo_cols, header = 0)
ufo.head()

In [None]:
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns

##### Select a Pandas Series from DataFrame

In [None]:
ufo.head()

In [None]:
type(ufo['city'])

In [None]:
ufo.city + " " +ufo.state

In [None]:
ufo['location'] = ufo.city + ', ' + ufo.state
ufo.head()

##### pandas methods and attributes 

In [None]:
movies = pd.read_csv('./data/imdbratings')
movies.head()

In [None]:
movies.describe()

In [None]:
movies.shape

In [None]:
movies.dtypes

In [None]:
movies.describe(include = ['object'])

##### Summary Functions

In [None]:
import pandas as pd
pd.set_option('max_rows', 5)
import numpy as np
reviews = pd.read_csv("./data/winemag-data-130k-v2.csv", index_col = 0)
reviews.head()

In [None]:
reviews.price.dtype     # a specific column dtype
reviews.dtypes          # whole df dtypes

In [None]:
# numerical data type describe
reviews.points.describe()

In [None]:
# string data type describe
reviews.taster_name.describe()

In [None]:
reviews.info()   # of entries, columns, non-null, memory, columns dtype, etc

In [None]:
# to see mean of points
reviews.points.mean()

In [None]:
# to see a list of unique string values. Returns a ndarray dtype
reviews.taster_name.unique()
#len(reviews.taster_name.unique())


In [None]:
# to see a list of unique values and how often they occur in the dataset
reviews.taster_name.value_counts()

##### map

In [None]:
# demean the points
reviews_points_mean = reviews.points.mean()
reviews.points.map(lambda p: p - reviews_points_mean)  #not inplace


In [None]:
#  to debug
def remain_points(row):
    row.points = row.points - reviews_points_mean
    return row

reviews.apply(remain_points, axis = "columns")
reviews

In [None]:
reviews_points_mean = reviews.points.mean()
reviews2 = reviews.points - reviews_points_mean
reviews2

In [None]:
reviews.country + '-' + reviews.region_1

##### Groupby

In [None]:
# group by 'points' and count the occurance of each point
reviews.groupby("points").points.count()

In [None]:
# to get the cheapest wine in each point value category
reviews.groupby("points").price.min()

In [None]:
# to select the name of the first wine reivewed from each winery in the dataset
reviews.groupby("winery").apply(lambda df: df.title.iloc[0])

In [None]:
reviews.points.idxmin()

In [None]:
# to pick out the best wine by country and province
# pd.Series.idxmax() returns the index of a series's max value
reivews.groupby(["country", "province"]).apply(lambda df: df.loc[df.points.idxmax()])

In [None]:
# agg() lets you run a bunch of different functions on df simultaneously
reviews.groupby("country").price.agg([len, min, max])

##### multi-index

In [None]:
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len])
countries_reviewed

In [None]:
# to convert back to a regular index
countries_reviewed.reset_index()

In [None]:
countries_reviewed.sort_values(by = 'len', ascending = False)

In [None]:
countries_reviewed = countries_reviewed.reset_index()
countries_reviewed.sort_index()

##### Missing Data

In [None]:
# pd.isnull() and pd.notnull()
reviews[pd.isnull(reviews.country)]

In [None]:
# pd.fillna() to fill NaN 
reviews.region_2.fillna("Unknown")

In [None]:
# pd.replace()
reviews.taster_twitter_handle.replace("@kerinokeefe","@kerino")

In [None]:
melb_data= pd.read_csv("./data/melb_data.csv")
melb_data.head()

In [None]:
# to get the names of columns with missing value
cols_with_missing = [ col for col in melb_data.columns
                    if melb_data[col].isnull().any()]
cols_with_missing

##### 1. Drop columns with missing value

In [None]:
reduced_melb_data = melb_data.drop(cols_with_missing, axis = 1)
reduced_melb_data.shape

##### 2. use SimpleImputer to replace missing values with the mean value along each column

In [None]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()  # default strategy =mean()

melb_predictors = melb_data.drop('Price', axis = 1)      # predictors columns remain
X = melb_predictors.select_dtypes(exclude = 'object')    # only numeric columns remain

# Imputation
imputed_melb_data = pd.DataFrame(my_imputer.fit_transform(X))

# imputation removed column names, put them back
imputed_melb_data.columns = X.columns
imputed_melb_data.head()


##### 3. Impute the missing values, while also keeping track of which values were imputed

In [None]:
# make copy to avoid changing original data (when imputing)
X_plus = X.copy()

cols_with_missing = [ col for col in X.columns
                    if X[col].isnull().any()]

# make new columns indicating what will be imputed
for col in cols_with_missing:
    X_plus[col + '_was_missing'] = X_plus[col].isnull()
    
# Imputation
my_imputer = SimpleImputer()
imputed_X_plus = pd.DataFrame(my_imputer.fit_transform(X_plus))

# Put column names back
imputed_X_plus.columns = X_plus.columns
imputed_X_plus

##### Visualization

In [None]:
import matplotlib.pyplot as plt

data['latitude'].plot(kind = 'hist', bins = 100)
plt.xlabel('Latitude Value')
plt.show()

In [None]:
plot_data = data[data['Element'] == 'Food']
plot_data = plot_data.groupby('Area')['Y2013'].sum()
plot_data.sort_values()[-10:].plot(kind = 'bar')
plt.title("Top Ten Food Producers")
plt.ylabel("Food produced(tonnes)")
plt.show()

# How to handle missing data

In [None]:
ufo = pd.read_csv('./data/uforeports')
ufo.head()

In [None]:
ufo.isnull().tail()

In [None]:
ufo.notnull().tail()

In [None]:
ufo.isnull().sum()  # default axis = 0, column sum

In [None]:
ufo.isnull().sum(axis = 0)

In [None]:
pd.Series([True, False, True]).sum()

In [None]:
ufo[ufo.City.isnull()]    # only show the rows with null City

## Solution1: drop null value

In [None]:
ufo.shape

In [None]:
# Drop a row in which any of its columns has missing value.
ufo.dropna(how = 'any').shape    

In [None]:
# Drop a row in which all of its columns are missing.
ufo.dropna(how = 'all').shape

In [None]:
# drop a row in which either City or 'Shape Reported' column is missing.
ufo.dropna(subset = ['City', 'Shape Reported'], how = 'any').shape

In [None]:
ufo.dropna(subset = ['City', 'Shape Reported'], how = 'all').shape

In [None]:
ufo['Shape Reported'].value_counts()

In [None]:
ufo['Shape Reported'].value_counts(dropna= False)

## Solution 2: fill NaN

In [None]:
ufo['Shape Reported'].fillna(value = "VARIOUS", inplace = True)

In [None]:
ufo.isnull().sum()