In [2]:
import numpy as np
import pandas as pd

In [6]:
labels = ['a','b','c']
my_data = [10,20,30]
arr = np.array(my_data)
d = {'a':10,'b':20,'c':30}

In [8]:
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [9]:
pd.Series(data=my_data,index=labels)

a    10
b    20
c    30
dtype: int64

In [11]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [13]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [16]:
ser1 = pd.Series([1,2,3,4],['USA','Germany','USSR','Japan'])

In [18]:
ser1['USSR']

3

### DataFrame

In [20]:
from numpy.random import randn
np.random.seed(101)

In [22]:
df = pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])

In [25]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [28]:
df[['W','X']]

Unnamed: 0,W,X
A,0.302665,1.693723
B,-0.134841,0.390528
C,0.807706,0.07296
D,-0.497104,-0.75407
E,-0.116773,1.901755


In [30]:
df['new'] = df['W'] + df['X']

In [33]:
df.drop('new', axis=1, inplace=True)

ValueError: labels ['new'] not contained in axis

In [None]:
# Rows

In [35]:
df.loc['A']

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [38]:
df.iloc[2]

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [40]:
df.loc['B','Y']

0.16690463609281317

In [42]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905


In [45]:
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,,
B,,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,,,,0.484752
E,,1.901755,0.238127,1.996652


In [48]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
C,0.807706,0.07296,0.638787,0.329646


In [51]:
# all rows where Z is less than 0
df[df['Z'] < 0][['Y', 'Z']]

Unnamed: 0,Y,Z
A,-1.706086,-1.159119


In [54]:
# multiple conditions
df[(df['W'] > 0) & (df['Z'] < 0)]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119


In [56]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [61]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,W,X,Y,Z
0,0.302665,1.693723,-1.706086,-1.159119
1,-0.134841,0.390528,0.166905,0.184502
2,0.807706,0.07296,0.638787,0.329646
3,-0.497104,-0.75407,-0.943406,0.484752
4,-0.116773,1.901755,0.238127,1.996652


In [66]:
newid = 'CA NY WY OR CO'.split()

In [68]:
df['States'] = newid

In [70]:
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.302665,1.693723,-1.706086,-1.159119
NY,-0.134841,0.390528,0.166905,0.184502
WY,0.807706,0.07296,0.638787,0.329646
OR,-0.497104,-0.75407,-0.943406,0.484752
CO,-0.116773,1.901755,0.238127,1.996652


In [77]:
# multi-index
outside = 'G1 G1 G1 G2 G2 G2 G2'.split()
inside = [1,2,3,1,2,3]

hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [81]:
df = pd.DataFrame(randn(6,2), hier_index, ['A', 'B'])

In [87]:
df.loc['G1'].loc[1:2, 'A']

1    0.992573
2   -1.046780
Name: A, dtype: float64

In [89]:
#### MISSING DATA

In [92]:
d = {'A': [1, 2, np.nan],
     'B': [5, np.nan, np.nan],
     'C': [1, 2, 3]}
df = pd.DataFrame(d)

In [95]:
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [97]:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [100]:
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [102]:
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

In [103]:
#### GROUPBY

In [108]:
d = {'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
     'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
     'Sales': [200, 120, 340, 124, 243, 350]}
df = pd.DataFrame(d)

In [117]:
df.groupby(['Company']).max().loc['FB']

Person    Sarah
Sales       350
Name: FB, dtype: object

In [119]:
#### Operations

In [121]:
df = pd.DataFrame({'col1': [1, 2, 3, 4],
                   'col2': [444, 555, 666, 444],
                   'col3': ['abc', 'def', 'ghi', 'xyz']})

In [127]:
len(df['col2'].unique())
df['col2'].nunique()


3

In [129]:
df['col2'].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

In [132]:
times2 = lambda x: x*2
df['col1'].apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [139]:
df['col3'].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [146]:
df.sort_values(by='col2')

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


In [148]:
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [149]:
#### Data Input and Output

In [151]:
pd.read_???

SyntaxError: invalid syntax (<ipython-input-151-969c2a7062d8>, line 1)

In [None]:
df.to_???