# pandas
## Python Data Analysis Library
https://pandas.pydata.org/  

Widely used library for data munging and preparation, for data analysis and modeling.

In [None]:
import numpy as np
import pandas as pd

In [None]:
from pandas import Series, DataFrame

## Series
One-dimensional array-like object containing a sequence of values, with an associated array of data labels, or index.

In [None]:
# A Series built from a plain list gets a default integer index (0..n-1).
obj = pd.Series(data=[4, 7, -5, 3])
obj

A pandas Series can be created from different data types, for example from a dict:

In [None]:
# WWCode member counts keyed by city. When a dict is passed to Series,
# its keys become the index labels and its values become the data.
sdata = {
    'Singapore': 1305,
    'Kuala Lumpur': 424,
    'Manila': 1162,
    'Taipei': 789,
}
obj3 = pd.Series(data=sdata)
obj3

In [None]:
# Re-index the member counts by an explicit label list: this fixes the order,
# leaves out labels that are not listed ('Taipei') and yields NaN for labels
# that have no entry in sdata ('Sydney').
# NOTE: despite its name, `countries` holds city names.
countries = ['Kuala Lumpur', 'Manila', 'Singapore', 'Sydney']
obj4 = pd.Series(data=sdata, index=countries)
obj4

## NaN
### Not a Number
NaN (Not a Number) is used to mark missing or NA values.

In [None]:
# Function form: element-wise boolean mask that is True where obj4 is missing (NaN).
pd.isnull(obj4)

In [None]:
# Method form of the same check; gives the same mask as pd.isnull(obj4).
obj4.isnull()

### Operations automatically align by index labels

In [None]:
# Values are matched by index label, not by position; a label present in only
# one of the two Series ('Taipei', 'Sydney') produces NaN in the result.
obj3 + obj4

## DataFrame
A table of data, similar to data frames in R.

In [None]:
# Column-oriented input: every key becomes a column and the equal-length
# lists supply the rows. The index defaults to 0..n-1.
data = {
    'city': [
        'Kuala Lumpur', 'Kuala Lumpur', 'Kuala Lumpur',
        'Singapore', 'Singapore', 'Singapore',
    ],
    'year': [2016, 2015, 2014, 2016, 2015, 2014],
    'pop': [1.79, 1.78, 1.74, 5.607, 5.535, 5.47],
}
df = pd.DataFrame(data=data)
df

In [None]:
# Show the first rows of the frame (head() returns five rows by default).
df.head()

### Columns and Indexing

In [None]:
# Choose the column order and the row labels explicitly. 'unemployment' is not
# a key of `data`, so that column is created and filled with NaN.
df2 = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=['year', 'city', 'pop', 'unemployment'],
)
df2

In [None]:
# Dict-style access: retrieve a single column as a Series (works for any column name).
df['city']

In [None]:
# Attribute-style access to the same kind of column lookup as df2['year'];
# only usable when the column name is a valid Python identifier that does
# not clash with an existing DataFrame attribute or method.
df2.year

In [None]:
# Confirm that selecting a single column yields a pandas Series.
type(df['pop'])

### Modify DataFrame values

In [None]:
# Select one row by its index label; the row comes back as a Series
# whose index is the column labels of df2.
df2.loc['three']

Generate new column

In [None]:
# Assigning to a column label that does not exist yet creates the column;
# a scalar value is broadcast to every row.
df2['gdp'] = 0
df2

In [None]:
# Overwrite 'gdp' with an array of floats 0.0..5.0; the array length (6)
# has to match the number of rows in df2.
df2['gdp'] = np.arange(6.)
df2

 * Assigning a column that doesn't exist will create a new column
 * The **del** keyword will delete a column 

In [None]:
# Column labels of df2, returned as a pandas Index object.
df2.columns

### Transpose

In [None]:
# Swap rows and columns; this returns a new object and leaves df2 unchanged.
df2.T

In [None]:
# Set the name attribute of the row axis and of the column axis.
# The names are chosen so they do not collide with existing column labels:
# 'year' and 'city' are already columns of df2, and an index that shares its
# name with a column makes label-based lookups such as
# df2.sort_values('year') ambiguous (pandas raises a ValueError).
df2.index.name = 'row'
df2.columns.name = 'field'
df2

## reindex

In [None]:
# Series with an explicit, deliberately unsorted label index.
obj = pd.Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj

In [None]:
# reindex returns a new Series arranged to the requested labels;
# 'e' has no value in obj, so it is introduced as NaN.
obj2 = obj.reindex(index=list('abcde'))
obj2

In [None]:
# DataFrame example for reindexing: a 3x3 grid of the integers 0..8 with
# row labels 'a', 'c', 'd' and station names as column labels.
df = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Orchard', 'Somerset', 'Dhoby Ghaut'],
)
df

In [None]:
# Reindex along the column axis: only 'Dhoby Ghaut' exists in df, so the
# other two requested columns come back filled with NaN.
stations = ['China Town', 'Clarke Quay', 'Dhoby Ghaut']
df.reindex(stations, axis='columns')

## drop

In [None]:
# Drop the row labelled 'a' (rows are the default axis); this returns a new
# DataFrame and leaves df itself untouched.
df.drop('a')

In [None]:
# Drop two columns; columns=[...] is the keyword spelling of passing the
# labels together with axis=1. df itself is not modified.
df.drop(columns=['Orchard', 'Somerset'])

In [None]:
# inplace=True mutates df directly and returns None, so the result is not
# assigned; row 'c' is gone from df after this call.
df.drop('c', inplace=True)
df

## Summarizing and Computing Descriptive Statistics

In [None]:
# Small frame with missing values, used to show how the summary
# statistics treat NaN.
df = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list('abcd'),
    columns=['one', 'two'],
)
df

In [None]:
# Sum of each column; NaN values are skipped by default.
df.sum()

In [None]:
# Mean across the columns of each row; with skipna=False any row that
# contains a NaN yields NaN instead of the mean of the remaining values.
df.mean(axis='columns', skipna=False)

In [None]:
# Index label at which each column reaches its maximum value.
df.idxmax()

In [None]:
# Running total down each column; NaN entries stay NaN in the output
# while the accumulation continues with the next valid value.
df.cumsum()

In [None]:
# Several summary statistics per numeric column in one call
# (count, mean, std, min, quartiles, max).
df.describe()

## Removing duplicates

In [None]:
# Stack df on top of its own cumulative sums to get a frame that contains
# repeated rows. DataFrame.append was removed in pandas 2.0, so pd.concat is
# used instead; like append, it keeps the original index labels, so each
# label appears twice in the result.
df = pd.concat([df, df.cumsum()])
df

In [None]:
# Boolean Series that is True for every row whose values repeat an earlier
# row; the first occurrence of each row is marked False.
df.duplicated()

In [None]:
# Return a new DataFrame with the repeated rows removed (the first
# occurrence is kept); df itself is not modified.
df.drop_duplicates()