# Operations

In [67]:
import pandas as pd
# Small demo frame for the cells below: two integer columns and one string column.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [68]:
df.head(3) # only the first 3 rows; with no argument head() returns 5

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi


In [69]:
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [70]:
df.tail(2)

Unnamed: 0,col1,col2,col3
2,3,666,ghi
3,4,444,xyz


### Info on Unique Values

In [71]:
df['col2'].unique() # distinct values of col2, returned as a NumPy array

array([444, 555, 666])

In [72]:
df['col2'].nunique() # number of unique values

3

In [73]:
df['col2'].value_counts() # how many times each distinct value occurs

444    2
555    1
666    1
Name: col2, dtype: int64

### Selecting Data

In [74]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [75]:
# Filter on two columns at once: keep rows where col1 > 2 AND col2 == 444.
# Each comparison yields a boolean Series; & combines them element-wise.
newdf = df[df['col1'].gt(2) & df['col2'].eq(444)]

In [76]:
newdf

Unnamed: 0,col1,col2,col3
3,4,444,xyz


### Applying Functions

In [77]:
def times2(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

In [78]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [79]:
df['col1'].apply(times2) # run times2 on each element of col1

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [80]:
df['col3'].apply(len) # built-in functions work too: length of each string in col3

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [81]:
df['col2'].sum()

2109

In [82]:
df['col2'].mean()

527.25

In [85]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [84]:
df['col1'].diff() # difference between current and previous rows

0    NaN
1    1.0
2    1.0
3    1.0
Name: col1, dtype: float64

In [86]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [87]:
df['col2'].diff(-1) # difference between current and next rows

0   -111.0
1   -111.0
2    222.0
3      NaN
Name: col2, dtype: float64

In [88]:
import numpy as np

np.exp(df['col1']) # NumPy functions operate element-wise on a Series and return a Series

0     2.718282
1     7.389056
2    20.085537
3    54.598150
Name: col1, dtype: float64

In [89]:
np.mean(df['col1'])

2.5

In [91]:
df['col1'].mean()

2.5

In [90]:
np.sin(df['col2'])

0   -0.860085
1    0.873283
2   -0.017642
3   -0.860085
Name: col2, dtype: float64

In [93]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [92]:
df['col2'].cumsum() # cumulative sum

0     444
1     999
2    1665
3    2109
Name: col2, dtype: int64

In [94]:
df['col1'].cumprod() # cumulative product

0     1
1     2
2     6
3    24
Name: col1, dtype: int64

**Permanently Removing a Column**

In [98]:
del df['col1'] # removes the column from df itself (in place); nothing is returned

In [99]:
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


**Get column and index names:**

In [100]:
df.columns

Index(['col2', 'col3'], dtype='object')

In [101]:
df.index

RangeIndex(start=0, stop=4, step=1)

**Sorting and Ordering a DataFrame:**

In [102]:
df

Unnamed: 0,col2,col3
0,444,abc
1,555,def
2,666,ghi
3,444,xyz


In [103]:
# Sort rows by col2 and rebind df to the sorted result. Plain assignment is
# used instead of inplace=True: it makes the state change explicit and keeps
# the call chainable. The original row labels travel with their rows, so the
# index is no longer in 0..3 order after sorting.
df = df.sort_values(by='col2')
df

Unnamed: 0,col2,col3
0,444,abc
3,444,xyz
1,555,def
2,666,ghi


**Checking for Null Values**

In [104]:
df.isnull()

Unnamed: 0,col2,col3
0,False,False
3,False,False
1,False,False
2,False,False


**Filling in NaN values with something else:**

In [105]:
import numpy as np

In [106]:
# Rebuild the demo frame with one missing value in col1 and one in col2
# so the NaN-handling calls below have something to act on.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, np.nan],
        'col2': [np.nan, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [107]:
# Drop every row that contains at least one NaN; the result is a new frame and df itself is left unchanged
df.dropna()

Unnamed: 0,col1,col2,col3
1,2.0,555.0,def
2,3.0,666.0,ghi


In [109]:
df.dropna(axis = 1) # axis=1 drops columns (rather than rows) that contain a NaN

Unnamed: 0,col3
0,abc
1,def
2,ghi
3,xyz


In [110]:
df

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [112]:
df.fillna(111) # replace every NaN with 111; df itself is not modified

Unnamed: 0,col1,col2,col3
0,1.0,111.0,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,111.0,444.0,xyz


In [113]:
df

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [114]:
# Per-column replacements: NaN in col1 becomes 4, NaN in col2 becomes 111.
values = {'col1': 4, 'col2': 111}
# Rebind df to the filled result instead of mutating it with inplace=True,
# so the state change is explicit and the call stays chainable.
df = df.fillna(value=values)

In [115]:
df

Unnamed: 0,col1,col2,col3
0,1.0,111.0,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,4.0,444.0,xyz
