In [72]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
titanic_train = pd.read_csv("https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv",
                           sep='\t')   
titanic_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#last 5 rows
titanic_train.tail(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
151,152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22.0,1,0,113776,66.6,C2,S
152,153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S
154,155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S
155,156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C


In [4]:
#data types: dtypes
titanic_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [19]:
#basic analysis using describe() function: it works only on numerical data
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,156.0,156.0,156.0,126.0,156.0,156.0,156.0
mean,78.5,0.346154,2.423077,28.141508,0.615385,0.397436,28.109587
std,45.177428,0.477275,0.795459,14.61388,1.056235,0.870146,39.401047
min,1.0,0.0,1.0,0.83,0.0,0.0,6.75
25%,39.75,0.0,2.0,19.0,0.0,0.0,8.00315
50%,78.5,0.0,3.0,26.0,0.0,0.0,14.4542
75%,117.25,1.0,3.0,35.0,1.0,0.0,30.37185
max,156.0,1.0,3.0,71.0,5.0,5.0,263.0


In [3]:
#describe function on a categorical variable

#to run on single variable
titanic_train['Name'].describe()

count                                    156
unique                                   156
top       Vander Planke, Miss. Augusta Maria
freq                                       1
Name: Name, dtype: object

In [4]:
'''
describe function on a categorical variable
To run on all categorical variables we follow below steps.
As seen below, Cabin has lots of missing values - since count=31.
As seen below, Ticket has lots of duplciate values - since count=156 and unique=145.

In below, freq corresponds to freq of 'top' value

'''
categorical_flag = titanic_train.dtypes == "object"
categorical_col_list = titanic_train.dtypes[categorical_flag].index
titanic_train[categorical_col_list].describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,156,156,156,31,155
unique,156,2,145,28,3
top,"Vander Planke, Miss. Augusta Maria",male,19950,C123,S
freq,1,100,2,2,110


In [6]:
#sub filtering on dtypes: list all object dtypes
titanic_train.dtypes[titanic_train.dtypes == "object"]

Name        object
Sex         object
Ticket      object
Cabin       object
Embarked    object
dtype: object

In [8]:
'''
Q: list only those columns whose dtype == object.
'''
#step 1: decide condition to filter dtype == object
condition = titanic_train.dtypes == "object"

#step 2: Select only those column who meet aboe condition
col_list = titanic_train.dtypes[condition].index

#step 3: select relevant data 
titanic_train[col_list].head(5)



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [9]:
#select specific columns : use double brackets [[]]
titanic_train[['Name','Sex','Ticket','Cabin','Embarked']].head(5)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [6]:
'''
Sorting Data: sorted() function - 
    It takes Series Object - it works only on Series Object.
    it returns a List object.
If more than one col pass, incorrect or no output recieved.
'''
sorted(titanic_train['Name'])[5:10:2]

['Andersson, Mr. August Edvard ("Wennerstrom")',
 'Andrew, Mr. Edgardo Samuel',
 'Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)']

In [27]:
#add new columns to data frame : Kushal : to do to-do
titanic_train['temp_col'] = titanic_train.apply(lambda x: 'Mrs' if 'mrs' in titanic_train['name'].lower() else 'Mr' if 'mr' in titanic_train['name'].lower() else 'Miss')

KeyError: ('name', u'occurred at index PassengerId')

In [21]:
titanic_train.columns

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')

In [5]:
'''
Delete a column -
1. del df['col_name']
2. df.drop("col_name",inplace=True,axis=1) # Notice the axis=1 option, axis = 0 is default, so one has to change it to 1
drop() function allows to delete based on row index (axis=0) and col name (axis=1).
Default axis=0. So, to drop a column, set axis=1.
inplace = True allows us to drop from DataFrame. Else it only returns a dataframe with dropped value, and no actual delete.

df = df.drop('col_name', inplace=True, axis =1)

'''

del titanic_train['PassengerId']

In [10]:
'''

Handling Categorical Data. 
Suppose we want to replace categorical value 0,1 in column "Survived" with proper description - 'died','survived'

rename_categories - Here pandas will map index-wise values 0 and 1 to Died and Survived when applying rename_categories() function.

'''
new_survived = pd.Categorical(titanic_train['Survived'])
print new_survived
print "----"
new_survived = new_survived.rename_categories(['Died','Survived'])
print new_survived


[0, 1, 1, 1, 0, ..., 1, 0, 0, 0, 0]
Length: 156
Categories (2, int64): [0, 1]
----
[Died, Survived, Survived, Survived, Died, ..., Survived, Died, Died, Died, Died]
Length: 156
Categories (2, object): [Died, Survived]


In [12]:
new_class = pd.Categorical(titanic_train['Pclass'])
print new_class
print "-----"
new_class = new_class.rename_categories(['Class1','Class2','Class3'])
print new_class

[3, 1, 3, 1, 3, ..., 1, 3, 3, 3, 1]
Length: 156
Categories (3, int64): [1, 2, 3]
-----
[Class3, Class1, Class3, Class1, Class3, ..., Class1, Class3, Class3, Class3, Class1]
Length: 156
Categories (3, object): [Class1, Class2, Class3]


In [18]:
char_cabin = titanic_train['Cabin'].astype(str)              #convert to string
char_cabin.head(5)

new_cabin = np.array([ cabin[0] for cabin in char_cabin ])   #take only 1st character. Convert to NUMPY ARRAY
new_cabin = pd.Categorical(new_cabin)                       
new_cabin.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2,0.012821
B,5,0.032051
C,10,0.064103
D,6,0.038462
E,3,0.019231
F,4,0.025641
G,1,0.00641
n,125,0.801282


## Pandas Series

In [20]:
'''
List = Python 
Series = Pandas
'''

dummy_vector = pd.Series([1,None,3,None,7,8])
dummy_vector.isnull()

0    False
1     True
2    False
3     True
4    False
5    False
dtype: bool

In [22]:
'''
Filter INDEXES with missing values of column 'Age'

np.where() - it returns INDEXES
'''
missing = np.where(titanic_train['Age'].isnull()==True)
missing

(array([  5,  17,  19,  26,  28,  29,  31,  32,  36,  42,  45,  46,  47,
         48,  55,  64,  65,  76,  77,  82,  87,  95, 101, 107, 109, 121,
        126, 128, 140, 154]),)

In [23]:
'''
To extract data at ROW LEVEL we have 3 functions loc(),iloc(),and ix().
loc() - it takes user definedINDEXES as input to extract data for only those indexes.
iloc() - it takes integer indexes for selecting ROWS. df.iloc[0:3]
ix() - same as loc() - But is a deprecated version - so AVOID using it.
'''
index =  np.where(titanic_train['Fare'] == max(titanic_train['Fare']) )
titanic_train.loc[index]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
27,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
88,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S


In [53]:
titanic_train.iloc[0:3]   #select top 3 ROWS using row index 0:3

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [57]:
from numpy.random import randn as rn
matrix_data = rn(5,4)  # this creates a 5x4 matrix with random values
print matrix_data

row_labels = ['A','B','C','D','E']
column_headings = ['W','X','Y','Z']

df = pd.DataFrame(matrix_data,row_labels,column_headings)

df

[[ 0.08577373  0.01699737 -0.68799801 -0.18329573]
 [-0.37644433  0.55598843  0.27039425 -0.78702187]
 [-0.66691833 -0.11009185  0.55638576 -0.3312932 ]
 [-1.80833305  0.58512243  0.32152192 -0.11277189]
 [-0.00225257 -0.58624766 -0.65417664 -1.69559056]]


Unnamed: 0,W,X,Y,Z
A,0.085774,0.016997,-0.687998,-0.183296
B,-0.376444,0.555988,0.270394,-0.787022
C,-0.666918,-0.110092,0.556386,-0.331293
D,-1.808333,0.585122,0.321522,-0.112772
E,-0.002253,-0.586248,-0.654177,-1.695591


In [58]:
#selection along specific rows and columns using loc() function
df.loc[ ('B','D'), ('W','Y')]

Unnamed: 0,W,Y
B,-0.376444,0.270394
D,-1.808333,0.321522


In [73]:
df.iloc[ [1,3],  [0,3] ]

Unnamed: 0,W,Z
B,-0.376444,-0.787022
D,-1.808333,-0.112772


In [55]:
'''
Convert numpy arrays to pandas series or dataframes
'''

labels = ['a', 'b', 'c']
my_data = [10,20,30]
arr = np.array(my_data)
d = {'a':10,'b':20,'c':30}

print labels
print my_data
print d

pd.Series(data = my_data, index = labels)

['a', 'b', 'c']
[10, 20, 30]
{'a': 10, 'c': 30, 'b': 20}


a    10
b    20
c    30
dtype: int64

In [28]:
print "\nHolding numerical data\n"
print pd.Series(arr)[1]


Holding numerical data

20


In [29]:
ser1 = pd.Series([1,2,3,4],index = [2,4,6,8])
ser2 = pd.Series([1,2,5,4],['CA', 'OR', 'NV', 'AZ'])
print ser1 
print ser2

2    1
4    2
6    3
8    4
dtype: int64
CA    1
OR    2
NV    5
AZ    4
dtype: int64


In [30]:
#creating data frame
pd.DataFrame(my_data)

Unnamed: 0,0
0,10
1,20
2,30


In [31]:
#creating data frame
pd.Series(my_data)

0    10
1    20
2    30
dtype: int64

In [32]:
'''
We can also apply Series to a python built-in functions
'''
print(pd.Series([type, sum, max]))

0              <type 'type'>
1    <built-in function sum>
2    <built-in function max>
dtype: object


In [33]:
ser1 = pd.Series([1,2,3,4],['CA', 'OR', 'CO', 'CA'])
ser2 = pd.Series([1,2,5,4],['CA', 'NV', 'AZ','OR'])
ser3 = ser1+ser2    #where no common indexes, output = NaN value.
ser3

AZ    NaN
CA    2.0
CA    5.0
CO    NaN
NV    NaN
OR    6.0
dtype: float64

In [34]:
ser3 = np.exp(ser1) + np.log10(ser2)
ser3

AZ          NaN
CA     2.718282
CA    54.598150
CO          NaN
NV          NaN
OR     7.991116
dtype: float64

In [38]:
ser1 = pd.Series([1,2,3,4],['CA', 'OR', 'CA', 'CA'])   #duplicate indexes 'CA'
ser2 = pd.Series([1,2,5,4],['CA', 'NV', 'AZ','OR'])
ser3 = ser1+ser2    #where no common indexes, output = NaN value.
ser3

AZ    NaN
CA    2.0
CA    4.0
CA    5.0
NV    NaN
OR    6.0
dtype: float64

In [41]:
'''
create a matrix using randn package in numpy
'''

from numpy.random import randn as rn
matrix_data = rn(5,4)  # this creates a 5x4 matrix with random values
print matrix_data

row_labels = ['A','B','C','D','E']
column_headings = ['W','X','Y','Z']

df = pd.DataFrame(matrix_data,row_labels,column_headings)

df

[[ 1.91368102  1.69869475  0.63906952 -0.46725589]
 [ 0.24441838  1.04285966  0.17548955 -0.19115052]
 [ 1.16413067  0.96595453 -0.05389812  1.23080251]
 [ 0.51898505  1.2756413   0.03123373  0.14952518]
 [ 0.13369552 -0.43991973  0.21314647 -1.31332602]]


Unnamed: 0,W,X,Y,Z
A,1.913681,1.698695,0.63907,-0.467256
B,0.244418,1.04286,0.17549,-0.191151
C,1.164131,0.965955,-0.053898,1.230803
D,0.518985,1.275641,0.031234,0.149525
E,0.133696,-0.43992,0.213146,-1.313326


In [49]:
# df.loc['B']
df.iloc[0:3] #select columns based on Indexes

Unnamed: 0,W,X,Y,Z
A,1.913681,1.698695,0.63907,-0.467256
B,0.244418,1.04286,0.17549,-0.191151
C,1.164131,0.965955,-0.053898,1.230803


In [66]:
df>0

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,False,True,True,False
C,False,False,True,False
D,False,True,True,False
E,False,False,False,False


In [74]:
df.loc[['B','D']]>0

Unnamed: 0,W,X,Y,Z
B,False,True,True,False
D,False,True,True,False


In [75]:
#select only positive values
df[df>0]

Unnamed: 0,W,X,Y,Z
A,0.085774,0.016997,,
B,,0.555988,0.270394,
C,,,0.556386,
D,,0.585122,0.321522,
E,,,,


In [79]:
'''
Numpy Matrix
'''
matrix_data = np.matrix('22,66,140;42,70,148;30,62,125;35,68,160;25,62,152')
row_labels = ['A','B','C','D','E']
column_headings = ['Age', 'Height', 'Weight']
print matrix_data

df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print df

[[ 22  66 140]
 [ 42  70 148]
 [ 30  62 125]
 [ 35  68 160]
 [ 25  62 152]]
   Age  Height  Weight
A   22      66     140
B   42      70     148
C   30      62     125
D   35      68     160
E   25      62     152


In [80]:
df[df['Height']>65]

Unnamed: 0,Age,Height,Weight
A,22,66,140
B,42,70,148
D,35,68,160


In [81]:
#combining 2 conditions
cond1 = df['Height']>65
cond2 = df['Weight']>145

df[cond1 & cond2]

Unnamed: 0,Age,Height,Weight
B,42,70,148
D,35,68,160


In [96]:
#reset index. It wont drop original index, and keep as column with name 'index'. 
# To drop index after reset, use option drop=True
df2 = df.reset_index()
print df2.columns
df2

Index([u'Age', u'Height', u'Weight'], dtype='object')


Unnamed: 0,Age,Height,Weight
0,22,66,140
1,42,70,148
2,30,62,125
3,35,68,160
4,25,62,152


In [97]:
# Set index
df2['Profession'] = "Student Teacher Engineer Doctor Nurse".split()
df2.set_index('Profession')

Unnamed: 0_level_0,Age,Height,Weight
Profession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Student,22,66,140
Teacher,42,70,148
Engineer,30,62,125
Doctor,35,68,160
Nurse,25,62,152


In [98]:
#multi-indexing - useful for multilevel data like JSON data.
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
print(hier_index)

'''
Here codes imply method to access values at different level when applied to dataframe.
eg - (G1,1) = (0,0), (G2,2) = (1,1)
'''
hier_index = pd.MultiIndex.from_tuples(hier_index)
print(hier_index)

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
MultiIndex(levels=[[u'G1', u'G2'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])


In [110]:
# df1 = pd.DataFrame(data=np.round(rn(6,3)), index= hier_index, columns= ['A','B','C'])
df1 = pd.DataFrame(data=rn(6,3), index= hier_index, columns= ['A','B','C'])
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
Outer,Inner,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G1,1,0.730322,-1.339815,-0.634968
G1,2,1.442215,1.118406,0.588516
G1,3,-2.58016,-2.012994,0.457762
G2,1,1.532634,0.76066,0.256329
G2,2,-0.99602,-0.429843,0.319275
G2,3,-1.119606,1.386931,-0.073815


In [111]:
#we can give names to indexes in a DataFrame.
df1.index.names=['Outer','Inner']
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
Outer,Inner,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G1,1,0.730322,-1.339815,-0.634968
G1,2,1.442215,1.118406,0.588516
G1,3,-2.58016,-2.012994,0.457762
G2,1,1.532634,0.76066,0.256329
G2,2,-0.99602,-0.429843,0.319275
G2,3,-1.119606,1.386931,-0.073815


In [114]:
#xs() - to get cross-section from Outer Level. If you see below, index 'G1' is not printed, but values are fetched for it.
df1.xs('G1')

Unnamed: 0_level_0,A,B,C
Inner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.730322,-1.339815,-0.634968
2,1.442215,1.118406,0.588516
3,-2.58016,-2.012994,0.457762


In [113]:
df1.xs(2,level='Inner') #level Inner not printed, but values fetched.

Unnamed: 0_level_0,A,B,C
Outer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1.442215,1.118406,0.588516
G2,-0.99602,-0.429843,0.319275


In [122]:
df1.xs(2,level='Inner')

Unnamed: 0_level_0,A,B,C
Outer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1.442215,1.118406,0.588516
G2,-0.99602,-0.429843,0.319275


In [None]:
'''
modin package = distributed processing in pandas done using this package.
    import modin.pandas as pd
'''

# Pandas Class 3: Nov 9

In [3]:
import pandas as pd
import numpy as np
import os
%matplotlib inline

In [4]:
df = pd.DataFrame({'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]})
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [5]:
df['States']="CA NV AZ".split()
df.set_index('States',inplace=True)
print(df)

          A    B  C
States             
CA      1.0  5.0  1
NV      2.0  NaN  2
AZ      NaN  NaN  3


In [6]:
'''
Dropping a Value.
'''
print "original df"
print df
print 
print "dropna with axis=0 -- implies drop rows with any NaN values"
print(df.dropna(axis=0))

print 
print "dropna with axis=0 and thresh=2 -- implies retain rows if they have 2 or more NON-NaN values"
print(df.dropna(axis=0,thresh=2))

print 
print "dropna with axis=1 -- implies drop Columns with any NaN values"
print(df.dropna(axis=1))



original df
          A    B  C
States             
CA      1.0  5.0  1
NV      2.0  NaN  2
AZ      NaN  NaN  3

dropna with axis=0 -- implies drop rows with any NaN values
          A    B  C
States             
CA      1.0  5.0  1

dropna with axis=0 and thresh=2 -- implies retain rows if they have 2 or more NON-NaN values
          A    B  C
States             
CA      1.0  5.0  1
NV      2.0  NaN  2

dropna with axis=1 -- implies drop Columns with any NaN values
        C
States   
CA      1
NV      2
AZ      3


In [7]:
'''
fillna()
'''
print df.fillna(value= df['A'].mean())

          A    B  C
States             
CA      1.0  5.0  1
NV      2.0  1.5  2
AZ      1.5  1.5  3


### GroupBy in Pandas

In [8]:
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(data)
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [13]:
'''
    groupBy() returns a group by dataframe object.
    It will show result only when we apply aggregate function to it like sum, count, mean, std etc.
    When we apply aggregate function, it performs aggregation only on numeric columns, and skips categorical ones.
'''
df_groupBy_company = df.groupby('Company')
print df_groupBy_company
df_groupBy_company.sum()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb8791ef150>


Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,593
GOOG,320
MSFT,464


In [21]:
'''
count function runs on categorical data as well as numerical data.
'''
df_groupBy_company.count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [14]:
# running describe() function on group by dataframe.
df_groupBy_company.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [17]:
#running loc() on groupbby describe() function 
df2 = df_groupBy_company.describe().loc['FB']
df2

Sales  count      2.000000
       mean     296.500000
       std       75.660426
       min      243.000000
       25%      269.750000
       50%      296.500000
       75%      323.250000
       max      350.000000
Name: FB, dtype: float64

In [20]:
print type(df2)

#now convert above pandas series object df2 into a pandas dataframe and do Transpose()
pd.DataFrame(df2).transpose()

<class 'pandas.core.series.Series'>


Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0


#### Merging two data frames

In [23]:
# Creating data frames
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [25]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7])
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [26]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8,9,10,11])
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [27]:
'''
Merge using concatenate()
'''

df_cat1 = pd.concat([df1,df2,df3], axis=0)  #axis=0 implies merge along rows.
df_cat1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [33]:
#we create a new dataframe df4 which has values of df2 but index of df1.
df4 = df2.set_index(df1.index)
df4

Unnamed: 0,A,B,C,D
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [34]:
'''
Concat data frames with same indexes
'''
df_cat2 =  pd.concat( [df1,df4,df3], axis=0)
df_cat2   #as seen below, duplicate indexes available, but internally different posn allocation when using iloc[]

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [35]:
df_cat2.loc[0]   #if one or more row has same row index, all matching values printed using loc[]

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
0,A4,B4,C4,D4


In [37]:
df_cat2.iloc[4]  #positional index is unique for each row, and is handled by pandas internally.

A    A4
B    B4
C    C4
D    D4
Name: 0, dtype: object

In [47]:
'''
If we want to ignore indexes of Dataframe and let pandas set its own indexes
'''
df5 = df3.set_index(df1.index)
print df5
df_cat5 = pd.concat( [df1,df2,df3],axis=0,ignore_index=True)
df_cat5

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [38]:
'''
Merge Concat axis=1 for DIFFERENT Indexes
'''

df_cat3 = pd.concat([df1,df2,df3], axis=1)
df_cat3   #non-matching indexes have value NaN

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


In [49]:
df_cat3.fillna(value=0,inplace=True)
df_cat3

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,0,0,0,0,0,0,0,0
1,A1,B1,C1,D1,0,0,0,0,0,0,0,0
2,A2,B2,C2,D2,0,0,0,0,0,0,0,0
3,A3,B3,C3,D3,0,0,0,0,0,0,0,0
4,0,0,0,0,A4,B4,C4,D4,0,0,0,0
5,0,0,0,0,A5,B5,C5,D5,0,0,0,0
6,0,0,0,0,A6,B6,C6,D6,0,0,0,0
7,0,0,0,0,A7,B7,C7,D7,0,0,0,0
8,0,0,0,0,0,0,0,0,A8,B8,C8,D8
9,0,0,0,0,0,0,0,0,A9,B9,C9,D9


In [39]:
df_cat3['A']   #when same col name is chosen, then all 3 shown.

Unnamed: 0,A,A.1,A.2
0,A0,,
1,A1,,
2,A2,,
3,A3,,
4,,A4,
5,,A5,
6,,A6,
7,,A7,
8,,,A8
9,,,A9


In [42]:
'''
Merge Concat axis=1 for SAME Indexes
'''

df_cat4 = pd.concat( [df1,df4,df5],axis=1)
df_cat4

     A    B    C    D
0   A8   B8   C8   D8
1   A9   B9   C9   D9
2  A10  B10  C10  D10
3  A11  B11  C11  D11


Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,A4,B4,C4,D4,A8,B8,C8,D8
1,A1,B1,C1,D1,A5,B5,C5,D5,A9,B9,C9,D9
2,A2,B2,C2,D2,A6,B6,C6,D6,A10,B10,C10,D10
3,A3,B3,C3,D3,A7,B7,C7,D7,A11,B11,C11,D11


In [52]:
'''
Merge using merge() function
'''
left = pd.DataFrame({'key': ['K0', 'K8', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})

print left
print "----------------------"
print right

    A   B key
0  A0  B0  K0
1  A1  B1  K8
2  A2  B2  K2
3  A3  B3  K3
----------------------
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3


In [53]:
merge1= pd.merge(left,right,how='inner',on='key')
merge1

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A2,B2,K2,C2,D2
2,A3,B3,K3,C3,D3


In [55]:
#merge on multiple keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})


print left
print "-------------"
print right
print "--------------"
pd.merge(left, right, on=['key1', 'key2'], how='left')

    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
-------------
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
--------------


Unnamed: 0,A,B,key1,key2,C,D
0,A0,B0,K0,K0,C0,D0
1,A1,B1,K0,K1,,
2,A2,B2,K1,K0,C1,D1
3,A2,B2,K1,K0,C2,D2
4,A3,B3,K2,K1,,


In [71]:
'''
Merge using join() function. 
It joins on row indexes. 
in case columns in 2 dataframes overlap, it throws error - soln: lsuffix='_left', rsuffix='_right'.
Or, use pd.concat() function
'''
print left
print "------"
print right
print "------"
# error: left.join(right,on=['key1','key2'],lsuffix='_left', rsuffix='_right')
left.join(right,lsuffix='_left', rsuffix='_right')

    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
------
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
------


Unnamed: 0,A,B,key1_left,key2_left,C,D,key1_right,key2_right
0,A0,B0,K0,K0,C0,D0,K0,K0
1,A1,B1,K0,K1,C1,D1,K1,K0
2,A2,B2,K1,K0,C2,D2,K1,K0
3,A3,B3,K2,K1,C3,D3,K2,K0


### Note - default join in merge() is inner and join() is left join.

Suppose we want to apply some filter operation on dataframe before merging.
This is possible by applying filter operations like

eg1 -
index =  np.where(titanic_train['Fare'] == max(titanic_train['Fare']) )

titanic_train.loc[index]

eg2 -

df[df>0]


# Apply Function in Pandas

## map() function - python list objects
## apply() function - pandas Series and Dataframe

DataFrame.apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds)

Parameters
1. func - Function to apply to each column or row.

2. axis -  Axis along which the function is applied:

   {0 or ‘index’, 1 or ‘columns’}, default 0

    0 or ‘index’: apply function to each column. Opposite of conventional understanding of axis=0
    
    1 or ‘columns’: apply function to each row. Opposite of conventional understanding of axis=1


3. rawbool, default False
  Determines if row or column is passed as a Series or ndarray object:

   False : passes each row or column as a Series to the function.

   True : the passed function will receive ndarray objects instead. If you are just applying a NumPy reduction function this will achieve much better performance.
   

4. result_type{‘expand’, ‘reduce’, ‘broadcast’, None}, default None -
These only act when axis=1 (columns):

     ‘expand’ : list-like results will be turned into columns.

    ‘reduce’ : returns a Series if possible rather than expanding list-like results. This is the opposite of ‘expand’.

    ‘broadcast’ : results will be broadcast to the original shape of the DataFrame, the original index and columns will be retained.
    
    
    
5. args - tuple
    Positional arguments to pass to func in addition to the array/series.


6. **kwds
    Additional keyword arguments to pass as keywords arguments to func.


Returns

Series or DataFrame based on result of applying func along the given axis of the DataFrame.    

In [None]:
some rules -

1. prefer using apply() on Series or single DataFrame object.

2. when using apply() on an entire DataFrame, function to be passed should be such that it is applicable to all columns in the DataFrame, i.e. conforms to their data type.


In [130]:
#eg 1 case A- apply() on Series or single DataFrame object with axis=0.


# Define a function
def testfunc(x):
    if (x> 500):
        return (10*np.log10(x))
    else:
        return (x/10)
    
df = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9,10],
                   'col2':[444,555,666,444,333,222,666,777,666,555],
                   'col3':'aaa bb c dd eeee fff gg h iii j'.split()})
print df    
print
df['FuncApplied'] = df['col2'].apply(lambda x : np.log(x))
df

   col1  col2  col3
0  1     444   aaa 
1  2     555   bb  
2  3     666   c   
3  4     444   dd  
4  5     333   eeee
5  6     222   fff 
6  7     666   gg  
7  8     777   h   
8  9     666   iii 
9  10    555   j   



Unnamed: 0,col1,col2,col3,FuncApplied
0,1,444,aaa,6.095825
1,2,555,bb,6.318968
2,3,666,c,6.50129
3,4,444,dd,6.095825
4,5,333,eeee,5.808142
5,6,222,fff,5.402677
6,7,666,gg,6.50129
7,8,777,h,6.65544
8,9,666,iii,6.50129
9,10,555,j,6.318968


In [137]:
#eg 1 case B- apply() on Series or single DataFrame object with axis=1.
'''
below code shall error out coz  there is no axis param for a Series.

Series.apply(func, convert_dtype=True, args=(), **kwds)
https://stackoverflow.com/questions/29155310/trouble-passing-in-lambda-to-apply-for-pandas-dataframe

'''

# Define a function
def testfunc(x):
    if (x> 500):
        return (10*np.log10(x))
    else:
        return (x/10)
    
df = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9,10],
                   'col2':[444,555,666,444,333,222,666,777,666,555],
                   'col3':'aaa bb c dd eeee fff gg h iii j'.split()})
print df    
print
df['FuncApplied'] = df['col2'].apply(lambda x : np.log(x),axis=1) #This shall ERROR OUT.
df;



   col1  col2  col3
0  1     444   aaa 
1  2     555   bb  
2  3     666   c   
3  4     444   dd  
4  5     333   eeee
5  6     222   fff 
6  7     666   gg  
7  8     777   h   
8  9     666   iii 
9  10    555   j   



TypeError: <lambda>() got an unexpected keyword argument 'axis'

In [161]:
'''
Eg 2 - case A - apply on whole dataframe with axis=0
axis = 0 or ‘index’: apply function to EACH COLUMN. Opposite of conventional understanding of axis=0

So, here we apply function to EACH COLUMN
'''
print df.apply(lambda x: str(x),axis=0)
print
print df.apply(lambda x: len(str(x) ),axis=0)
print 
print df.apply(lambda x: len(x),axis=0)

col1    0    1 \n1    2 \n2    3 \n3    4 \n4    5 \n5    6 \n6    7 \n7    8 \n8    9 \n9    10\nName: col1, dtype: object                    
col2    0    444\n1    555\n2    666\n3    444\n4    333\n5    222\n6    666\n7    777\n8    666\n9    555\nName: col2, dtype: object          
col3    0    aaa \n1    bb  \n2    c   \n3    dd  \n4    eeee\n5    fff \n6    gg  \n7    h   \n8    iii \n9    j   \nName: col3, dtype: object
dtype: object

col1    105
col2    115
col3    125
dtype: int64

col1    10
col2    10
col3    10
dtype: int64


In [160]:
'''
Eg 2 - case B - apply on whole dataframe with axis=1
axis = 1 or ‘columns’: apply function to EACH ROW. Opposite of conventional understanding of axis=1
So here we apply function to each ROW taking all its columns at a time.
'''
print df.apply(lambda x: str(x),axis=1)
print
print df.apply(lambda x: len(str(x) ),axis=1)
print 
print df.apply(lambda x: len(x),axis=1)

0    col1    1  \ncol2    444\ncol3    aaa\nName: 0, dtype: object   
1    col1    2  \ncol2    555\ncol3    bb \nName: 1, dtype: object   
2    col1    3  \ncol2    666\ncol3    c  \nName: 2, dtype: object   
3    col1    4  \ncol2    444\ncol3    dd \nName: 3, dtype: object   
4    col1    5   \ncol2    333 \ncol3    eeee\nName: 4, dtype: object
5    col1    6  \ncol2    222\ncol3    fff\nName: 5, dtype: object   
6    col1    7  \ncol2    666\ncol3    gg \nName: 6, dtype: object   
7    col1    8  \ncol2    777\ncol3    h  \nName: 7, dtype: object   
8    col1    9  \ncol2    666\ncol3    iii\nName: 8, dtype: object   
9    col1    10 \ncol2    555\ncol3    j  \nName: 9, dtype: object   
dtype: object

0    58
1    58
2    58
3    58
4    61
5    58
6    58
7    58
8    58
9    58
dtype: int64

0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    3
dtype: int64


In [169]:
'''
Eg 3 - use apply() on whole dataframe with a User Defined Function.
Note - we cant do axis=0 as it is individuall referring to ['col1 values']

'''
def myfunc(val_col):
    return  len(str(val_col['col1'] )) + len(str(val_col['col2']) ) + len(str(val_col['col3']) ) 

df.apply(myfunc,axis=1)


0    7
1    6
2    5
3    6
4    8
5    7
6    6
7    5
8    7
9    6
dtype: int64

In [202]:
def myfunc(val_col):
    x = []
    [x.append(i) for i in val_col]
#     print x
    a= '$'.join([str(elem) for elem in val_col]) 
    print a, type(a)
    return a
#     return pd.Series(x)


df = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9,10],
                   'col2':[444,555,666,444,333,222,666,777,666,555],
                   'col3':'aaa bb c dd eeee fff gg h iii j'.split()})

# del df['col4']
df['col4']=df.apply(myfunc,axis=1)
df

1$444$aaa <type 'str'>
2$555$bb <type 'str'>
3$666$c <type 'str'>
4$444$dd <type 'str'>
5$333$eeee <type 'str'>
6$222$fff <type 'str'>
7$666$gg <type 'str'>
8$777$h <type 'str'>
9$666$iii <type 'str'>
10$555$j <type 'str'>


Unnamed: 0,col1,col2,col3,col4
0,1,444,aaa,1$444$aaa
1,2,555,bb,2$555$bb
2,3,666,c,3$666$c
3,4,444,dd,4$444$dd
4,5,333,eeee,5$333$eeee
5,6,222,fff,6$222$fff
6,7,666,gg,7$666$gg
7,8,777,h,8$777$h
8,9,666,iii,9$666$iii
9,10,555,j,10$555$j


In [207]:
df['FuncApplied'] = df['col2'].apply(lambda x : np.log(x))
df['FuncApplied_min'] = df['FuncApplied'].min()
df['FuncApplied_max'] = df['FuncApplied'].max()
df['FuncApplied_sum'] = df['FuncApplied'].sum()
df['FuncApplied_mean'] = df['FuncApplied'].mean()
df['FuncApplied_std'] = df['FuncApplied'].std()

print df
print 
df.sum()

   col1  col2  col3        col4  FuncApplied  FuncApplied_sum  \
0  1     444   aaa   1$444$aaa   6.095825     62.199715         
1  2     555   bb    2$555$bb    6.318968     62.199715         
2  3     666   c     3$666$c     6.501290     62.199715         
3  4     444   dd    4$444$dd    6.095825     62.199715         
4  5     333   eeee  5$333$eeee  5.808142     62.199715         
5  6     222   fff   6$222$fff   5.402677     62.199715         
6  7     666   gg    7$666$gg    6.501290     62.199715         
7  8     777   h     8$777$h     6.655440     62.199715         
8  9     666   iii   9$666$iii   6.501290     62.199715         
9  10    555   j     10$555$j    6.318968     62.199715         

   FuncApplied_mean  FuncApplied_std  FuncApplied_min  FuncApplied_max  
0  6.219971          0.382252         5.402677         6.65544          
1  6.219971          0.382252         5.402677         6.65544          
2  6.219971          0.382252         5.402677         6.65544   

col1                55                                                                                 
col2                5328                                                                               
col3                aaabbcddeeeefffgghiiij                                                             
col4                1$444$aaa2$555$bb3$666$c4$444$dd5$333$eeee6$222$fff7$666$gg8$777$h9$666$iii10$555$j
FuncApplied         62.1997                                                                            
FuncApplied_sum     621.997                                                                            
FuncApplied_mean    62.1997                                                                            
FuncApplied_std     3.82252                                                                            
FuncApplied_min     54.0268                                                                            
FuncApplied_max     66.5544                                     

## Sorting in Pandas

#### df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'

In [208]:
df.sort_values(by='FuncApplied',ascending=False) #inplace=False by default

Unnamed: 0,col1,col2,col3,col4,FuncApplied,FuncApplied_sum,FuncApplied_mean,FuncApplied_std,FuncApplied_min,FuncApplied_max
7,8,777,h,8$777$h,6.65544,62.199715,6.219971,0.382252,5.402677,6.65544
2,3,666,c,3$666$c,6.50129,62.199715,6.219971,0.382252,5.402677,6.65544
6,7,666,gg,7$666$gg,6.50129,62.199715,6.219971,0.382252,5.402677,6.65544
8,9,666,iii,9$666$iii,6.50129,62.199715,6.219971,0.382252,5.402677,6.65544
1,2,555,bb,2$555$bb,6.318968,62.199715,6.219971,0.382252,5.402677,6.65544
9,10,555,j,10$555$j,6.318968,62.199715,6.219971,0.382252,5.402677,6.65544
0,1,444,aaa,1$444$aaa,6.095825,62.199715,6.219971,0.382252,5.402677,6.65544
3,4,444,dd,4$444$dd,6.095825,62.199715,6.219971,0.382252,5.402677,6.65544
4,5,333,eeee,5$333$eeee,5.808142,62.199715,6.219971,0.382252,5.402677,6.65544
5,6,222,fff,6$222$fff,5.402677,62.199715,6.219971,0.382252,5.402677,6.65544


In [210]:
#### Checking Null

df = pd.DataFrame({'col1':[1,2,3,np.nan],
                   'col2':[None,555,666,444],
                   'col3':['abc','def','ghi','xyz']})

print df
df.isnull()

   col1   col2 col3
0  1.0  NaN     abc
1  2.0   555.0  def
2  3.0   666.0  ghi
3 NaN    444.0  xyz


Unnamed: 0,col1,col2,col3
0,False,True,False
1,False,False,False
2,False,False,False
3,True,False,False


## End of Notebook for Pandas

In [199]:
del df['col4']
print df

   col1  col2  col3
0  1     444   aaa 
1  2     555   bb  
2  3     666   c   
3  4     444   dd  
4  5     333   eeee
5  6     222   fff 
6  7     666   gg  
7  8     777   h   
8  9     666   iii 
9  10    555   j   


In [141]:
sum??

In [73]:
# Define a function
def testfunc(x):
    if (x> 500):
        return (10*np.log10(x))
    else:
        return (x/10)

In [74]:
df = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9,10],
                   'col2':[444,555,666,444,333,222,666,777,666,555],
                   'col3':'aaa bb c dd eeee fff gg h iii j'.split()})
df

Unnamed: 0,col1,col2,col3
0,1,444,aaa
1,2,555,bb
2,3,666,c
3,4,444,dd
4,5,333,eeee
5,6,222,fff
6,7,666,gg
7,8,777,h
8,9,666,iii
9,10,555,j


In [75]:
df['FuncApplied'] = df['col2'].apply(lambda x : np.log(x))
print(df)

   col1  col2  col3  FuncApplied
0     1   444   aaa     6.095825
1     2   555    bb     6.318968
2     3   666     c     6.501290
3     4   444    dd     6.095825
4     5   333  eeee     5.808142
5     6   222   fff     5.402677
6     7   666    gg     6.501290
7     8   777     h     6.655440
8     9   666   iii     6.501290
9    10   555     j     6.318968


In [78]:
df['col2'].apply(lambda x: len(str(x)))

0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    3
Name: col2, dtype: int64

In [79]:
df['col2']

0    444
1    555
2    666
3    444
4    333
5    222
6    666
7    777
8    666
9    555
Name: col2, dtype: int64

In [83]:
df.apply(len)

col1           10
col2           10
col3           10
FuncApplied    10
dtype: int64

In [81]:
df

Unnamed: 0,col1,col2,col3,FuncApplied
0,1,444,aaa,6.095825
1,2,555,bb,6.318968
2,3,666,c,6.50129
3,4,444,dd,6.095825
4,5,333,eeee,5.808142
5,6,222,fff,5.402677
6,7,666,gg,6.50129
7,8,777,h,6.65544
8,9,666,iii,6.50129
9,10,555,j,6.318968


In [88]:
df.apply(lambda x: len(str(x)),axis=1)

0    114
1    114
2    114
3    114
4    114
5    114
6    114
7    114
8    114
9    114
dtype: int64

In [93]:
df.apply(lambda x: str(x),axis=1)

0    col1           1      \ncol2           444    \ncol3           aaa    \nFuncApplied    6.09582\nName: 0, dtype: object
1    col1           2      \ncol2           555    \ncol3           bb     \nFuncApplied    6.31897\nName: 1, dtype: object
2    col1           3      \ncol2           666    \ncol3           c      \nFuncApplied    6.50129\nName: 2, dtype: object
3    col1           4      \ncol2           444    \ncol3           dd     \nFuncApplied    6.09582\nName: 3, dtype: object
4    col1           5      \ncol2           333    \ncol3           eeee   \nFuncApplied    5.80814\nName: 4, dtype: object
5    col1           6      \ncol2           222    \ncol3           fff    \nFuncApplied    5.40268\nName: 5, dtype: object
6    col1           7      \ncol2           666    \ncol3           gg     \nFuncApplied    6.50129\nName: 6, dtype: object
7    col1           8      \ncol2           777    \ncol3           h      \nFuncApplied    6.65544\nName: 7, dtype: object
8    col

In [92]:
pd.set_option('display.max_colwidth', -1)

In [94]:
df.apply(lambda x: len(str(x)))

col1           105
col2           115
col3           125
FuncApplied    162
dtype: int64

In [95]:
df.apply(lambda x: str(x))

col1           0    1 \n1    2 \n2    3 \n3    4 \n4    5 \n5    6 \n6    7 \n7    8 \n8    9 \n9    10\nName: col1, dtype: object                                                         
col2           0    444\n1    555\n2    666\n3    444\n4    333\n5    222\n6    666\n7    777\n8    666\n9    555\nName: col2, dtype: object                                               
col3           0    aaa \n1    bb  \n2    c   \n3    dd  \n4    eeee\n5    fff \n6    gg  \n7    h   \n8    iii \n9    j   \nName: col3, dtype: object                                     
FuncApplied    0    6.09582\n1    6.31897\n2    6.50129\n3    6.09582\n4    5.80814\n5    5.40268\n6    6.50129\n7    6.65544\n8    6.50129\n9    6.31897\nName: FuncApplied, dtype: object
dtype: object

In [118]:
df.apply(lambda x: str(x),axis=1)

0    col1           1      \ncol2           444    \ncol3           aaa    \nFuncApplied    6.09582\nName: 0, dtype: object
1    col1           2      \ncol2           555    \ncol3           bb     \nFuncApplied    6.31897\nName: 1, dtype: object
2    col1           3      \ncol2           666    \ncol3           c      \nFuncApplied    6.50129\nName: 2, dtype: object
3    col1           4      \ncol2           444    \ncol3           dd     \nFuncApplied    6.09582\nName: 3, dtype: object
4    col1           5      \ncol2           333    \ncol3           eeee   \nFuncApplied    5.80814\nName: 4, dtype: object
5    col1           6      \ncol2           222    \ncol3           fff    \nFuncApplied    5.40268\nName: 5, dtype: object
6    col1           7      \ncol2           666    \ncol3           gg     \nFuncApplied    6.50129\nName: 6, dtype: object
7    col1           8      \ncol2           777    \ncol3           h      \nFuncApplied    6.65544\nName: 7, dtype: object
8    col

In [119]:
df.apply(lambda x: len(str(x)),axis=1)

0    114
1    114
2    114
3    114
4    114
5    114
6    114
7    114
8    114
9    114
dtype: int64

In [128]:
def myfunc(val_col):
    return  len(str(val_col['col1'] )) + len(str(val_col['col2']) ) + len(str(val_col['col3']) ) + len(str(val_col['FuncApplied'] ))

df.apply(myfunc,axis=1)

0    20
1    19
2    18
3    19
4    21
5    20
6    19
7    18
8    20
9    19
dtype: int64

In [125]:
df.apply(len,axis=1)

0    4
1    4
2    4
3    4
4    4
5    4
6    4
7    4
8    4
9    4
dtype: int64

In [84]:
df['col1'].apply(lambda x: len(str(x)))

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    2
Name: col1, dtype: int64

In [85]:
df['col2'].apply(lambda x: len(str(x)))

0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    3
Name: col2, dtype: int64

In [86]:
df['col3'].apply(lambda x: len(str(x)))

0    3
1    2
2    1
3    2
4    4
5    3
6    2
7    1
8    3
9    1
Name: col3, dtype: int64

In [87]:
df['FuncApplied'].apply(lambda x: len(str(x)))

0    13
1    13
2    13
3    13
4    13
5    13
6    13
7    13
8    13
9    13
Name: FuncApplied, dtype: int64

In [117]:
df['FuncApplied'].apply?

In [None]:
df['FuncApplied'].apply