In [37]:
import pandas as pd
import numpy as np

In [27]:
## Exporting to CSV

my_dict = { 'name' : ["a", "b", "c", "d", "e","f", "g"],
            'age' : [20,27, 35, 55, 18, 21, 35],
            'designation': ["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"]}

df = pd.DataFrame(my_dict)
df.to_csv('data/scratch_example.csv')

In [28]:
# Reading from csv
pd_df = pd.read_csv('data/scratch_example.csv')
pd_df

Unnamed: 0.1,Unnamed: 0,name,age,designation
0,0,a,20,VP
1,1,b,27,CEO
2,2,c,35,CFO
3,3,d,55,VP
4,4,e,18,VP
5,5,f,21,CEO
6,6,g,35,MD


In [29]:
df.to_csv("data/scratch_example.csv", index=False)
pd_df = pd.read_csv('data/scratch_example.csv')
pd_df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


#### .......BOOM BABY

In [30]:
# customizing headers
pd_df = pd.read_csv('data/scratch_example.csv',header=[3])
pd_df

Unnamed: 0,c,35,CFO
0,d,55,VP
1,e,18,VP
2,f,21,CEO
3,g,35,MD


In [31]:
# custom column names
pd_df = pd.read_csv('data/scratch_example.csv',names =['a','b','c','d'], header=1)
pd_df

ParserError: Too many columns specified: expected 4 and found 3

In [32]:
# write with custom separator
pd_df.to_csv('data/scratch_example.csv',index=False,sep=':')
pd_df = pd.read_csv('data/scratch_example.csv')
pd_df

Unnamed: 0,c:35:CFO
0,d:55:VP
1,e:18:VP
2,f:21:CEO
3,g:35:MD


In [33]:
pd_df = pd.read_csv('data/scratch_example.csv',sep=':')
pd_df

Unnamed: 0,c,35,CFO
0,d,55,VP
1,e,18,VP
2,f,21,CEO
3,g,35,MD


In [34]:
# if you need to count blank lines in a csv file...
pd_df = pd.read_csv('data/scratch_example.csv',sep=':', skip_blank_lines=False)


## Pandas Reshaping

In [35]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))

In [38]:
index = pd.MultiIndex.from_tuples(tuples,names = ['first','second'])
df = pd.DataFrame(np.random.randn(8,2),index=index, columns=['A','B'])
df2 = df[:4]

In [42]:
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.037593
               B    0.155955
       two     A    0.406719
               B   -1.539404
baz    one     A    0.747569
               B   -0.511952
       two     A   -0.640205
               B    0.850259
dtype: float64

In [43]:
type(stacked)

pandas.core.series.Series

In [53]:
print(stacked.unstack())
print()
print(stacked.unstack(1))
print()
print(stacked.unstack(0))


                     A         B
first second                    
bar   one    -0.037593  0.155955
      two     0.406719 -1.539404
baz   one     0.747569 -0.511952
      two    -0.640205  0.850259

second        one       two
first                      
bar   A -0.037593  0.406719
      B  0.155955 -1.539404
baz   A  0.747569 -0.640205
      B -0.511952  0.850259

first          bar       baz
second                      
one    A -0.037593  0.747569
       B  0.155955 -0.511952
two    A  0.406719 -0.640205
       B -1.539404  0.850259


## Pivot Tables

In [54]:
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})

In [55]:
pd.pivot_table(df,values='D',index=['A','B'],columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,1.376531,-0.13609
one,B,-1.272886,1.180911
one,C,-1.196007,0.557654
three,A,0.513894,
three,B,,1.401399
three,C,1.968757,
two,A,,0.265676
two,B,1.15191,
two,C,,-1.485631


Check out [this link](https://www.lumeer.io/pivot-table-complete-guide/) for more on pivot tables!

## Pandas Apply Functions!

In [57]:
def extract_city_name(df):
    """
    Chicago, IL -> Chicago for city_name column
    """
    df["city_name"] = df["city_and_code"].str.split(",").str.get(0)
    return df

def add_country_name(df, country_name=None):
    """
    Chicago -> Chicago-US for city_name column
    """
    col = 'city_name'
    df['city_and_country'] = df[col] + country_name
    return df


df_p = pd.DataFrame ({"city_and_code": ["Chicago, IL"]})

add_country_name(extract_city_name(df_p),country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


For the situation above, ```pandas``` encourages us to use the function ```pipe()```, as seen below.

In [60]:
df_p.pipe(extract_city_name)\
    .pipe(add_country_name, country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [61]:
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
    'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

In [62]:
df.apply(np.mean)

one     -0.021826
two     -0.374968
three    0.891083
dtype: float64

In [63]:
df.apply(np.mean,axis=1)

a    1.399362
b   -0.743216
c    0.065549
d    0.171086
dtype: float64

In [64]:
df.apply(lambda x: x.max() - x.min())

one      1.467775
two      4.180257
three    0.618136
dtype: float64

In [65]:
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,0.699307,2.099418,
b,-0.069162,0.01858,0.619659
c,-0.065479,-0.604249,1.435454
d,,-1.499871,2.673248


In [66]:
 df.apply(np.exp)

Unnamed: 0,one,two,three
a,2.012357,8.16142,
b,0.463723,0.124825,1.858295
c,1.003689,0.536425,2.260971
d,,0.408353,3.448002


In [68]:
def own_function(x):
        return x*x

def subtract_and_divide(x, sub, divide=1):
    return (x - sub) / divide

df.apply(subtract_and_divide, args=(5,3))

Unnamed: 0,one,two,three
a,-1.433564,-0.966861,
b,-1.922823,-2.36028,-1.460114
c,-1.665439,-1.874276,-1.394735
d,,-1.965207,-1.254068


In [69]:
def subtract(x, sub):
    return (x - sub)

df.apply(subtract, args=(5,))

Unnamed: 0,one,two,three
a,-4.300693,-2.900582,
b,-5.768468,-7.080839,-4.380341
c,-4.996317,-5.622829,-4.184206
d,,-5.895622,-3.762205
