# First look at Pandas

In [None]:
import pandas as pd
import numpy as np

Series and DataFrames
---

In [None]:
# A Series may hold values of different types; its dtype is then the generic `object`.
my_Series = pd.Series(data=[1, 'cat', 10.2, 'dog'])
my_Series

In [None]:
# No index was given, so the Series has default integer labels; label 1 is the second element ('cat').
my_Series[1]

In [None]:
# A Series with explicit string labels instead of the default integer index.
ages = pd.Series(
    data=[20, 53, 68],
    index=['John', 'Allen', 'Mary'],
)
ages

In [None]:
# Look a value up by its index label.
ages['Mary']

In [None]:
# Build a DataFrame column by column: each keyword becomes a column name,
# each list holds that column's values.
df = pd.DataFrame(data=dict(
    user=[1, 2, 3],
    age=[24, 54, 17],
    sex=['F', 'F', 'M'],
    occupation=['technician', 'musician', 'student'],
))

In [None]:
# Display the frame; no index was passed, so the rows carry default integer labels 0-2.
df

In [None]:
# Returns a new frame indexed by 'user'. The result is not assigned,
# so df itself is left unchanged.
df.set_index('user')

In [None]:
# Print a summary: index range, column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Row labels of the frame.
df.index

In [None]:
# Tuple of (number of rows, number of columns).
df.shape

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Summary statistics; by default only the numeric columns ('user', 'age') are included.
df.describe()

Manipulating the data
---

In [None]:
# 3x3 frame holding 0..8 row by row, with columns 'a', 'b', 'c'.
df = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    columns=list('abc'),
)

In [None]:
# Drop the row labelled 0. Returns a new frame; df is unchanged.
# Equivalent keyword form: df.drop(index=0)
df.drop(0, axis=0)

In [None]:
# A list of labels drops several rows at once (here rows 0 and 2).
df.drop([0,2], axis=0)

In [None]:
# axis=1 drops columns instead of rows. Equivalent keyword form: df.drop(columns=['b', 'c'])
df.drop(['b','c'], axis=1)

In [None]:
# Element-wise sum of two columns; values are aligned on the index labels.
df['a'] + df['b']

In [None]:
# Method form of the + operator; unlike the operator, add() also accepts a fill_value argument.
df['a'].add(df['b'])

In [None]:
# df.loc[0:1, :] is rows 0 and 1 (label slices include the end point).
# Row 2 has no counterpart in the second operand, so its result is NaN.
df.add(df.loc[0:1,:])

In [None]:
# fill_value=0 stands in for the row missing from the second operand,
# so row 2 keeps its original values instead of becoming NaN.
df.add(df.loc[0:1,:], fill_value=0)

In [None]:
# Two small frames with identical columns, used for the concatenation examples.
df1 = pd.DataFrame({'name': ['Mark', 'Kate'], 'age': [50, 46]})
df2 = pd.DataFrame({'name': ['Jon', 'David'], 'age': [3, 4]})

In [None]:
# Stack df2's rows under df1's (default axis=0). The original index labels are kept,
# so 0 and 1 each appear twice; pass ignore_index=True to get a fresh 0..n-1 index.
pd.concat([df1,df2])

In [None]:
# Single-column frame whose rows line up (by index) with those of df1.
df3 = pd.DataFrame({'occupation': ['writer', 'journalist']})

In [None]:
# axis=1 places the frames side by side, matching rows on their index labels.
pd.concat([df1, df3], axis=1)

Indexing, selecting and filtering
---

In [None]:
# Only this subset of the CSV's columns is loaded.
columns = [
    'Mountain',
    'Height (m)',
    'Range',
    'Coordinates',
    'Parent mountain',
    'First ascent',
    'Ascents bef. 2004',
    'Failed attempts bef. 2004',
]

# Load the DataFrame. Only the first 10 rows are read (the ten highest mountains).
df = pd.read_csv('Mountains.csv', usecols=columns, nrows=10)
df

In [None]:
# Use the mountain name as the row index. The result is assigned back rather than
# using inplace=True, and the membership check makes the cell safe to re-run:
# once 'Mountain' is the index it is no longer a column, so calling
# set_index('Mountain') a second time would raise KeyError.
if 'Mountain' in df.columns:
    df = df.set_index('Mountain')
df

In [None]:
# The index now holds the mountain names (set by set_index('Mountain')).
df.index

In [None]:
# 'Mountain' is no longer listed among the columns: it moved into the index.
df.columns

In [None]:
# Attribute-style column access. Only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute or method.
df.Range

In [None]:
# 'Height (m)' is not a valid identifier, so it cannot be written as df.<name>;
# getattr() performs the same attribute lookup with an arbitrary string.
getattr(df, 'Height (m)')

In [None]:
# Bracket access works for any column name and returns the column as a Series.
df['Height (m)']

In [None]:
# A list of column names returns a DataFrame with those columns, in the listed order.
df[['Height (m)', 'Range', 'Coordinates']]

In [None]:
# An integer slice inside [] selects rows by position: rows 2 through 7 (end excluded).
df[2:8]

In [None]:
# A label slice inside [] selects rows by index label; both end points are included.
# NOTE(review): assumes 'Lhotse' and 'Manaslu' are among the ten loaded rows — verify against Mountains.csv.
df['Lhotse':'Manaslu']

In [None]:
# iloc is purely positional: all rows, columns at positions 2-5 (end excluded).
df.iloc[:, 2:6]

In [None]:
# Every second row (starting with the first), columns from position 2 onwards.
df.iloc[::2, 2:]

In [None]:
# loc is label-based: all rows, columns from 'Height (m)' through 'First ascent' (end included).
df.loc[:, 'Height (m)':'First ascent']

In [None]:
# Same label range with a step of 2: every second column within the range.
df.loc[:, 'Height (m)':'First ascent':2]

In [None]:
# Element-wise comparison: yields a boolean Series, True where the parent is 'Mount Everest'.
df['Parent mountain'] == 'Mount Everest'

In [None]:
# A boolean Series inside [] keeps only the rows where it is True.
df[df['Parent mountain'] == 'Mount Everest']

In [None]:
# Mountains with Mount Everest as parent and a first ascent after 1955.
# Conditions are combined element-wise with & (not `and`); each comparison needs
# its own parentheses because & binds more tightly than == and >.
df[(df['Parent mountain'] == 'Mount Everest') & (df['First ascent'] > 1955)]

In [None]:
# The same row filter written with .loc; the trailing ':' selects all columns.
df.loc[(df['Parent mountain'] == 'Mount Everest') & (df['First ascent'] > 1955), :]

In [None]:
# .loc can filter rows and columns at once: boolean mask for the rows,
# label slice 'Height (m)' through 'Range' (end included) for the columns.
df.loc[(df['Parent mountain'] == 'Mount Everest') & (df['First ascent'] > 1955), 'Height (m)':'Range']

In [None]:
# Boolean masks work on the column axis too: True keeps the column at that position.
# The mask needs exactly one entry per column of df.
col_criteria = [
    True, False, False, False,
    True, True, False,
]
df.loc[df['Parent mountain'] == 'Mount Everest', col_criteria]

Views vs copies
---

In [None]:
# Fresh copy of the small users frame for the view-vs-copy examples.
df = pd.DataFrame(data=dict(
    user=[1, 2, 3],
    age=[24, 54, 17],
    sex=['F', 'F', 'M'],
    occupation=['technician', 'musician', 'student'],
))
df

In [None]:
# Boolean-mask selection returns a new DataFrame holding the matching rows.
df[df.sex=='F']

In [None]:
# Chained selection: filter the rows first, then take the 'sex' column of that result.
df[df.sex=='F'].sex

In [None]:
# Chained assignment (anti-pattern, shown on purpose): df[df.sex=='F'] is a temporary
# copy, so the assignment changes that copy and df itself is left untouched.
# Depending on the pandas version this emits a SettingWithCopyWarning.
df[df.sex=='F'].sex = 'Female'

In [None]:
# Correct form: a single .loc call selects rows and column together,
# so the assignment writes into df itself.
df.loc[df.sex=='F','sex'] = 'Female'
df

In [None]:
# Reset df to its original contents ('F'/'M') before the next example.
df = pd.DataFrame(data=dict(
    user=[1, 2, 3],
    age=[24, 54, 17],
    sex=['F', 'F', 'M'],
    occupation=['technician', 'musician', 'student'],
))

In [None]:
# df2 is derived from df through a boolean-mask selection (no explicit copy).
df2 = df.loc[df.sex=='F']

In [None]:
# Assign into the derived frame: rows labelled 0 through 1, column 'sex'.
# Shown on purpose: depending on the pandas version this emits a SettingWithCopyWarning,
# because it is ambiguous whether the parent frame df was meant to change as well.
df2.loc[0:1,'sex']='Female'

In [None]:
# .copy() makes df2 an explicitly independent frame.
df2 = df.loc[df.sex=='F'].copy()

In [None]:
# Same assignment as on the derived frame, but df2 is now an explicit copy:
# only df2 changes and df is unaffected.
df2.loc[0:1,'sex']='Female'

Applying functions
---

In [None]:
# Users frame again, with the original 'F'/'M' codes, for the map/replace examples.
df = pd.DataFrame(data=dict(
    user=[1, 2, 3],
    age=[24, 54, 17],
    sex=['F', 'F', 'M'],
    occupation=['technician', 'musician', 'student'],
))

In [None]:
# Series.map with a dict translates every value through the dict.
# Note: this cell overwrites the column, so re-running it without re-creating df
# turns every entry into NaN (the keys 'F'/'M' no longer match the stored values).
df['sex'] = df['sex'].map({'F': 'Female', 'M': 'Male'})
df

In [None]:
# Values that have no key in the dict ('Male') are mapped to NaN.
df['sex'].map({'Female': 1})

In [None]:
# replace() only touches matching values and leaves everything else ('Male') as it is.
# Note that the replacement here is the string '1', not the integer 1.
df['sex'].replace('Female', '1')

In [None]:
# 3x3 frame holding 0..8 row by row, used for the apply / applymap examples.
grid = np.arange(9).reshape(3, 3)
df2 = pd.DataFrame(grid, columns=list('abc'))
del grid  # keep the notebook namespace as the original cell left it

In [None]:
# axis=0: the function receives one column at a time, giving one sum per column.
df2.apply(sum, axis=0)

In [None]:
# axis=1: the function receives one row at a time, giving one sum per row.
df2.apply(sum, axis=1)

In [None]:
# Find the maximum entry in each row (axis=1 passes one row at a time to np.max).
df2.apply(np.max, axis = 1)

In [None]:
# Find the mean of each column (axis=0 passes one column at a time to np.mean).
df2.apply(np.mean, axis = 0)

In [None]:
def my_func(x):
    """Classify a number as 'Large' (> 5), 'Medium' (> 3) or 'Small' (otherwise)."""
    # Thresholds are checked from the largest down, so the first match wins.
    if x > 5:
        return 'Large'
    if x > 3:
        return 'Medium'
    return 'Small'

In [None]:
# Apply my_func to every single element of the frame.
# DataFrame.applymap was deprecated in pandas 2.1 and renamed to DataFrame.map;
# use the new name when it exists and fall back to applymap on older versions.
elementwise = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'
getattr(df2, elementwise)(my_func)

Sorting
---

In [None]:
# Small numeric frame for the sorting examples; column 'A' contains a tie (two 3s).
df = pd.DataFrame({
    'A': [3, 6, 1, 12, 3],
    'B': [0, 0, 7, 5, 6],
    'C': [10, 4, 5, 8, 2],
})
df

In [None]:
# Sort the rows by their index labels (ascending by default).
df.sort_index()

In [None]:
# Same, in descending label order.
df.sort_index(ascending=False)

In [None]:
# axis=1 sorts the columns by their labels instead of the rows: C, B, A.
df.sort_index(ascending=False, axis=1)

In [None]:
# Sort the values of a single column; every value keeps its original index label.
df['A'].sort_values()

In [None]:
# Sort whole rows by the values in column 'A'.
df.sort_values('A')

In [None]:
# Sort by 'A' first, then by 'C' among rows with equal 'A' (here the two rows with A == 3).
df.sort_values(['A','C'])

Grouping
---

In [None]:
# Seed NumPy's global random generator so that column 'C', and every mean
# computed from it, is identical on each run of the notebook.
np.random.seed(0)
df = pd.DataFrame({
    'A': ['dog', 'cat', 'dog', 'cat', 'dog', 'cat', 'dog', 'dog'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randint(10, size=8),  # eight integers drawn from 0-9
})

In [None]:
# Mean of column 'C' over all rows.
df['C'].mean()

In [None]:
# Mean of 'C' within each group of 'A' (one value for 'cat', one for 'dog').
df.groupby('A')['C'].mean()

In [None]:
# The same number as the 'dog' entry of the groupby result, computed by filtering the rows first.
df.loc[df['A']=='dog','C'].mean()

Missing values
---

In [None]:
# Seed NumPy's global random generator so the example frame is identical on each run.
np.random.seed(1)
# 3x3 frame of integers drawn from 0-9, with row labels 'a', 'c', 'e'.
df = pd.DataFrame(
    np.random.randint(10, size=(3, 3)),
    index=['a', 'c', 'e'],
    columns=['A', 'B', 'C'],
)

In [None]:
# reindex conforms df to the new row labels; 'b', 'd' and 'f' do not exist in df,
# so their rows are filled with NaN.
df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f'])

In [None]:
# Rows 'b', 'd' and 'f' are all NaN; the columns are now float because NaN is a float value.
df2

In [None]:
# Boolean frame of the same shape: True wherever a value is missing.
df2.isnull()

In [None]:
# Number of missing values per column (each True counts as 1).
df2.isnull().sum()

In [None]:
# Number of missing values per row.
df2.isnull().sum(axis=1)

In [None]:
# Keep only the rows in which column 'A' is missing.
df2[df2['A'].isnull()]

In [None]:
# Drop every row that contains at least one missing value. Returns a new frame; df2 is unchanged.
df2.dropna()

In [None]:
# Replace every missing value with 0. Returns a new frame; df2 is unchanged.
df2.fillna(value=0)