## Pandas experience

### 1. Creating a DataFrame in various ways

In [1]:
import pandas as pd
import numpy as np

#### 1.1. Making df from dict where value is a list

In [2]:
# Column-oriented input: every key is a column name and its list holds that column's values.
dic = dict(
    a=[1, 2, 3],
    b=[4, np.nan, 5],
    c=['', 6, 7],
)
df = pd.DataFrame(data=dic)
# To use custom row labels instead of the default 0..n-1:
# df = pd.DataFrame(dic, index=[1, 2, 3])
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,3,5.0,7.0


#### 1.2. Making df from a list of dicts

In [3]:
# Row-oriented input: one dict per row; the keys become the column names.
dics = [
    dict(a=1, b=4, c=''),
    dict(a=2, b=np.nan, c=6),
    dict(a=3, b=5, c=7),
]
df = pd.DataFrame(data=dics)
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,3,5.0,7.0


#### 1.3. Making df from list of list

In [4]:
# Each inner list is one row of the frame; column names are passed separately.
list_of_list = [
    [1, 4, ''],
    [2, np.nan, 6],
    [3, 5, 7],
]
df = pd.DataFrame(
    data=list_of_list,
    columns=list('abc'),
    # index=[1, 2, 3],  # uncomment to use custom row labels
)
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,3,5.0,7.0


#### 1.4. Making df with a MultiIndex

In [5]:
dic = dict(a=[1, 2, 3], b=[4, np.nan, 5], c=['', 6, 7])
# Two-level row index: each row is labelled by an (i1, i2) pair.
df = pd.DataFrame(
    dic,
    index=pd.MultiIndex.from_tuples(
        [('d', 1), ('d', 2), ('e', 2)],
        names=['i1', 'i2'],  # level names belong to the MultiIndex, not to DataFrame()
    ),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
i1,i2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,1,4.0,
d,2,2,,6.0
e,2,3,5.0,7.0


### 2. Summary statistics of df

In [6]:
import pandas as pd
import seaborn as sns
import numpy as np

In [7]:
df = sns.load_dataset('iris')
df.shape

(150, 5)

In [8]:
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [9]:
df['species'].value_counts()

versicolor    50
virginica     50
setosa        50
Name: species, dtype: int64

In [10]:
len(df)

150

In [11]:
len(df) == df.shape[0]

True

In [12]:
df['species'].nunique()

3

In [13]:
df.describe(include='all')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,versicolor
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [14]:
# Summarise only the object-dtype (string/categorical-like) columns.
# The builtin `object` is used because the `np.object` alias was removed from NumPy.
df.describe(include=[object])

Unnamed: 0,species
count,150
unique,3
top,versicolor
freq,50


In [15]:
# Summarise every column except the object-dtype ones.
# The builtin `object` is used because the `np.object` alias was removed from NumPy.
df.describe(exclude=[object])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [16]:
df.describe(include=[np.number])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [17]:
df['petal_width'].sum()

179.90000000000003

In [18]:
df['petal_width'].count()

150

In [19]:
# numeric_only=True skips the string column 'species'; recent pandas raises on it otherwise.
df.median(numeric_only=True)

sepal_length    5.80
sepal_width     3.00
petal_length    4.35
petal_width     1.30
dtype: float64

In [20]:
# numeric_only=True skips the string column 'species'; recent pandas raises on it otherwise.
df.mean(numeric_only=True)

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [21]:
df['petal_width'].quantile([0.25,0.75])

0.25    0.3
0.75    1.8
Name: petal_width, dtype: float64

In [22]:
df.min()

sepal_length       4.3
sepal_width          2
petal_length         1
petal_width        0.1
species         setosa
dtype: object

In [23]:
df.max()

sepal_length          7.9
sepal_width           4.4
petal_length          6.9
petal_width           2.5
species         virginica
dtype: object

In [24]:
# numeric_only=True skips the string column 'species'; recent pandas raises on it otherwise.
df.var(numeric_only=True)

sepal_length    0.685694
sepal_width     0.189979
petal_length    3.116278
petal_width     0.581006
dtype: float64

In [25]:
# numeric_only=True skips the string column 'species'; recent pandas raises on it otherwise.
df.std(numeric_only=True)

sepal_length    0.828066
sepal_width     0.435866
petal_length    1.765298
petal_width     0.762238
dtype: float64

### 3. Access to subsets of df

In [26]:
dic =  {"a" : [1 ,2, 3],
        "b" : [4, np.nan, 5],
        "c" : ['', 6, 7]}
df = pd.DataFrame(dic)
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,3,5.0,7.0


In [27]:
df[df['a'] < 2]

Unnamed: 0,a,b,c
0,1,4.0,


In [28]:
df[df['c'] != 7]

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0


In [29]:
df['a'].isin([2, 3])

0    False
1     True
2     True
Name: a, dtype: bool

In [30]:
pd.isnull(df)

Unnamed: 0,a,b,c
0,False,False,False
1,False,True,False
2,False,False,False


In [31]:
df['b'].isnull().sum()   # count the null value in 'b' column

1

In [32]:
pd.notnull(df)

Unnamed: 0,a,b,c
0,True,True,True
1,True,False,True
2,True,True,True


In [33]:
df.notnull().sum()

a    3
b    2
c    3
dtype: int64

In [34]:
df['a'].notnull()

0    True
1    True
2    True
Name: a, dtype: bool

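> Logic: element-wise operators & (and), | (or), ~ (not), ^ (xor); reductions df.any() (any) and df.all() (all); missing-value test isna(). Python's own `and` / `or` / `not` keywords do not work element-wise on a Series or DataFrame.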

In [35]:
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,3,5.0,7.0


In [36]:
df.isna()

Unnamed: 0,a,b,c
0,False,False,False
1,False,True,False
2,False,False,False


In [37]:
# axis=0 gives one flag per column ("does this column contain any NaN?").
# axis=None would instead collapse the whole frame to a single bool.
df.isna().any(axis=0)

a    False
b     True
c    False
dtype: bool

In [38]:
# Combine the two conditions into one boolean mask, then filter once.
# Each comparison needs its own parentheses because | binds tighter than ==.
df[(df.b == 3) | (df.a == 5)]

Unnamed: 0,a,b,c


In [39]:
# sample(): return a random sample of rows (here 30% of them).
# random_state fixes the draw so the cell shows the same rows on every run.
df.sample(frac=0.3, random_state=0)

Unnamed: 0,a,b,c
0,1,4.0,


In [40]:
# Draw exactly two rows; random_state fixes the draw so re-running gives the same rows.
df.sample(n=2, random_state=0)

Unnamed: 0,a,b,c
2,3,5.0,7
1,2,,6


In [41]:
df.loc[[1, 2], ["a", "b"]]  # df.loc[row labels (a list), column labels (a list)]

Unnamed: 0,a,b
1,2,
2,3,5.0


In [42]:
df.iloc[-2:]  # negative positions count from the end (-1 is the last row), so -2: selects the last two rows

Unnamed: 0,a,b,c
1,2,,6
2,3,5.0,7


In [43]:
df.nlargest(1, 'a')

Unnamed: 0,a,b,c
2,3,5.0,7


In [44]:
df.nlargest(2, 'a')  # Return the first n rows with the largest values in columns, in descending order.

Unnamed: 0,a,b,c
2,3,5.0,7
1,2,,6


In [45]:
df.nsmallest(1, 'a')

Unnamed: 0,a,b,c
0,1,4.0,


In [46]:
# logical - combine
df.loc[(df['a'] >= 2) & (df['b'] <= 101)]

Unnamed: 0,a,b,c
2,3,5.0,7


In [47]:
#check for None 
df.loc[df['b'].isnull()].head(1)

Unnamed: 0,a,b,c
1,2,,6


In [48]:
df.loc[~df['b'].isnull()].head(1)

Unnamed: 0,a,b,c
0,1,4.0,


### 4. Access to columns of df

In [49]:
import pandas as pd
import seaborn as sns

In [50]:
df = sns.load_dataset("iris")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [51]:
columns = ['sepal_width','sepal_length','species']
df[columns].head(3)

Unnamed: 0,sepal_width,sepal_length,species
0,3.5,5.1,setosa
1,3.0,4.9,setosa
2,3.2,4.7,setosa


In [52]:
df.filter(regex='length$')  # all columns whose name ends with 'length'

Unnamed: 0,sepal_length,petal_length
0,5.1,1.4
1,4.9,1.4
2,4.7,1.3
3,4.6,1.5
4,5.0,1.4
5,5.4,1.7
6,4.6,1.4
7,5.0,1.5
8,4.4,1.4
9,4.9,1.5


In [53]:
df.filter(regex='^sepal' )  # all columns whose name starts with 'sepal'

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
5,5.4,3.9
6,4.6,3.4
7,5.0,3.4
8,4.4,2.9
9,4.9,3.1


In [54]:
df.filter(regex='^(?!species$).*' )  # except species

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [55]:
df.loc[2:3,'sepal_width':'petal_width']

Unnamed: 0,sepal_width,petal_length,petal_width
2,3.2,1.3,0.2
3,3.1,1.5,0.2


In [56]:
df.iloc[:3,[1,3]]

Unnamed: 0,sepal_width,petal_width
0,3.5,0.2
1,3.0,0.2
2,3.2,0.2


In [57]:
df.loc[df['sepal_length'] > 5, ['sepal_length','sepal_width']].head()

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
5,5.4,3.9
10,5.4,3.7
14,5.8,4.0
15,5.7,4.4


### 5. Handle missing data

In [58]:
import pandas as pd
import numpy as np

In [59]:
# Small frame with missing values: 'c' is entirely NaN, 'd' has none.
df = pd.DataFrame(
    data=[
        [np.nan, 9, np.nan, 0],
        [2, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 10],
    ],
    columns=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,a,b,c,d
0,,9.0,,0
1,2.0,4.0,,1
2,,,,10


In [60]:
df.dropna(axis=1, how='any')  # axis=1: column

Unnamed: 0,d
0,0
1,1
2,10


In [61]:
df.dropna(axis=0, how='all')  # axis=0: row

Unnamed: 0,a,b,c,d
0,,9.0,,0
1,2.0,4.0,,1
2,,,,10


In [62]:
df.fillna('filled_by_this_value')

Unnamed: 0,a,b,c,d
0,filled_by_this_value,9,filled_by_this_value,0
1,2,4,filled_by_this_value,1
2,filled_by_this_value,filled_by_this_value,filled_by_this_value,10


In [63]:
df = pd.DataFrame([[np.nan, 9, np.nan, 0], [2, 4, np.nan, 1],
                  [np.nan, np.nan, np.nan, 10]],
                  columns=list('abcd'))
df

Unnamed: 0,a,b,c,d
0,,9.0,,0
1,2.0,4.0,,1
2,,,,10


In [64]:
values = {'a': 0, 'b': 1, 'c': 'Added it', 'd': 3}
df.fillna(value=values)

Unnamed: 0,a,b,c,d
0,0.0,9.0,Added it,0
1,2.0,4.0,Added it,1
2,0.0,1.0,Added it,10


In [65]:
df.isnull().sum()

a    2
b    1
c    3
d    0
dtype: int64

In [66]:
df.notnull().sum()

a    1
b    2
c    0
d    3
dtype: int64

### 6. Add new column, delete columns

In [67]:
dics = [
    {'a': 10, 'b': 4, 'c': ''},
    {'a': 2, 'b': np.nan, 'c': 6},
    {'a': 3, 'b': 5, 'c': 7}
]
df = pd.DataFrame(dics)
df

Unnamed: 0,a,b,c
0,10,4.0,
1,2,,6.0
2,3,5.0,7.0


In [68]:
# df.assign?  # to read documentation of df.assign

In [69]:
df =df.assign(ln_a = lambda x: np.log(x['a'])) # df['ln_a'] = np.log(df['a'])
df

Unnamed: 0,a,b,c,ln_a
0,10,4.0,,2.302585
1,2,,6.0,0.693147
2,3,5.0,7.0,1.098612


In [70]:
# Quantile-based discretization: 2 quantile bins means the cut point is the median of 'a'.
# Values up to the median are labelled "good", values above it "bad".
_df = pd.qcut(df['a'], 2, labels=["good", "bad"])
_df

0     bad
1    good
2    good
Name: a, dtype: category
Categories (2, object): [good < bad]

In [71]:
# axis=0 reduces down the rows, giving one maximum per column.
# numeric_only=True skips column 'c', which mixes '' with numbers and cannot be compared.
df.max(axis=0, numeric_only=True)

a       10.000000
b        5.000000
ln_a     2.302585
dtype: float64

In [72]:
# axis=1 reduces across the columns, giving one maximum per row (indexed by row label).
# numeric_only=True skips column 'c', which mixes '' with numbers and cannot be compared.
df.max(axis=1, numeric_only=True)

0    10.0
1     2.0
2     5.0
dtype: float64

In [73]:
# axis=0 reduces down the rows, giving one minimum per column.
# numeric_only=True skips column 'c', which mixes '' with numbers and cannot be compared.
df.min(axis=0, numeric_only=True)

a       2.000000
b       4.000000
ln_a    0.693147
dtype: float64

In [74]:
df["ln_a"].clip(lower=0.5,upper=0.7)  # values < 0.5 become 0.5; values > 0.7 become 0.7

0    0.700000
1    0.693147
2    0.700000
Name: ln_a, dtype: float64

In [75]:
df["a"].abs()

0    10
1     2
2     3
Name: a, dtype: int64

In [76]:
# Delete using drop() 
df = df.drop(['a'], axis=1)
# Delete using del 
# del df['column_to_remove']
df

Unnamed: 0,b,c,ln_a
0,4.0,,2.302585
1,,6.0,0.693147
2,5.0,7.0,1.098612


### 7. Reshape data

In [77]:
import pandas as pd
import numpy as np

# NOTE(review): column 'c' mixes the string '6' (first row) with the integers 6 and 7 —
# presumably unintentional; confirm, since mixed types can affect how the column sorts.
dics = [
    {'a': 3, 'b': 4, 'c': '6'},
    {'a': 2, 'b': np.nan, 'c': 6},
    {'a': 3, 'b': 5, 'c': 7}
]
df = pd.DataFrame(dics)
df

Unnamed: 0,a,b,c
0,3,4.0,6
1,2,,6
2,3,5.0,7


In [78]:
# df.sort_values?

In [79]:
df.sort_values(['a', 'c'], ascending=[False, True], inplace=True)
df

Unnamed: 0,a,b,c
2,3,5.0,7
0,3,4.0,6
1,2,,6


In [80]:
df = df.rename(columns = {'b': 'BB','c':'CC'})
df

Unnamed: 0,a,BB,CC
2,3,5.0,7
0,3,4.0,6
1,2,,6


In [81]:
df.sort_index()

Unnamed: 0,a,BB,CC
0,3,4.0,6
1,2,,6
2,3,5.0,7


In [82]:
# df.reset_index?

In [83]:
df.drop(columns=['BB'], inplace=True)
df

Unnamed: 0,a,CC
2,3,7
0,3,6
1,2,6


In [84]:
# pd.melt? Unpivot: change column to row

In [85]:
# Dict-of-dicts input: outer keys are column names, inner keys (0, 1, 2) are the row labels.
df = pd.DataFrame({
    'A': dict(enumerate('abc')),
    'B': dict(enumerate([1, 3, 5])),
    'C': dict(enumerate([2, 4, 6])),
})
df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [86]:
# Unpivot. change column B to rows
pd.melt(df, id_vars=['A'], value_vars=['B'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5


In [87]:
df = pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [88]:
df.groupby("variable").sum()

Unnamed: 0_level_0,value
variable,Unnamed: 1_level_1
B,9
C,12


In [89]:
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [90]:
df.groupby(['A', 'variable'])['value'].sum()

A  variable
a  B           1
   C           2
b  B           3
   C           4
c  B           5
   C           6
Name: value, dtype: int64

In [91]:
#### count duplicated values in one or more columns
df_count = df.groupby(['A']).size().reset_index(name='count')
df_count

Unnamed: 0,A,count
0,a,2
1,b,2
2,c,2


In [92]:
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [93]:
# Pivot: turn the distinct values of one column ('variable') into separate columns
df.pivot(columns='variable', values='value')

variable,B,C
0,1.0,
1,3.0,
2,5.0,
3,,2.0
4,,4.0
5,,6.0


### 8. Concat

In [94]:
# First frame to stack: a text column and an integer column, two rows.
df1 = pd.DataFrame({'letter': ['a', 'b'], 'number': [1, 2]})
df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [95]:
# Second frame to stack: same columns as the first, different rows.
df2 = pd.DataFrame({'letter': ['c', 'd'], 'number': [3, 4]})
df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [96]:
df = pd.concat([df1, df2])
df

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


### 9. Merge dataframes

In [97]:
df1 = pd.DataFrame({"x1": ["A", "B", "C"], "x2": [9, 99, 999]})
df1

Unnamed: 0,x1,x2
0,A,9
1,B,99
2,C,999


In [98]:
df2 = pd.DataFrame({"x1": ["A", "B", "D"], "x3": ["Test1", "Test2", "Test3"]})
df2

Unnamed: 0,x1,x3
0,A,Test1
1,B,Test2
2,D,Test3


In [99]:
df = pd.merge(df1, df2, how='left', on='x1')  # left_on and right_on if not the same column names
df

Unnamed: 0,x1,x2,x3
0,A,9,Test1
1,B,99,Test2
2,C,999,


In [100]:
df = pd.merge(df1, df2, how='right', on='x1')  #
df

Unnamed: 0,x1,x2,x3
0,A,9.0,Test1
1,B,99.0,Test2
2,D,,Test3


In [101]:
df = pd.merge(df1, df2, how='inner', on='x1')  #
df

Unnamed: 0,x1,x2,x3
0,A,9,Test1
1,B,99,Test2


In [102]:
df = pd.merge(df1, df2, how='outer', on='x1')  #
df

Unnamed: 0,x1,x2,x3
0,A,9.0,Test1
1,B,99.0,Test2
2,C,999.0,
3,D,,Test3


In [103]:
df1.x1.isin(df2.x1)  # isin([a list])

0     True
1     True
2    False
Name: x1, dtype: bool

In [104]:
df1[df1.x1.isin(df2.x1)]

Unnamed: 0,x1,x2
0,A,9
1,B,99


In [105]:
df1[~df1.x1.isin(df2.x1)]

Unnamed: 0,x1,x2
2,C,999


### 10. Rank

In [106]:
dic =  {"a" : [1 , 2, 1],
        "b" : [4, np.nan, 5],
        "c" : ['', 6, 7]}
df = pd.DataFrame(dic)
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,1,5.0,7.0


In [107]:
df_rank = df["a"].rank(method='dense')  # method : {‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}, default ‘average’
df_rank

0    1.0
1    2.0
2    1.0
Name: a, dtype: float64

In [108]:
# All-integer frame with custom row labels 1..3, used for the cumulative operations below.
df2 = pd.DataFrame(
    data=[
        [4, 7, 10],
        [5, 11, 8],
        [6, 9, 12],
    ],
    index=[1, 2, 3],
    columns=list('abc'),
)
df2

Unnamed: 0,a,b,c
1,4,7,10
2,5,11,8
3,6,9,12


In [109]:
df2.cumsum()   # cumulative sum down each column

Unnamed: 0,a,b,c
1,4,7,10
2,9,18,18
3,15,27,30


In [110]:
df2.cummax()

Unnamed: 0,a,b,c
1,4,7,10
2,5,11,10
3,6,11,12


In [111]:
df2.cummin()

Unnamed: 0,a,b,c
1,4,7,10
2,4,7,8
3,4,7,8


In [112]:
df2.cumprod()

Unnamed: 0,a,b,c
1,4,7,10
2,20,77,80
3,120,693,960


In [113]:
df2['b'].shift(2)

1    NaN
2    NaN
3    7.0
Name: b, dtype: float64

### 11. Moving average

In [114]:
import pandas as pd
dic =  {"a" : [1 , 2, 3],
        "b" : [4, np.nan, 5],
        "c" : ['', 6, 7]}
df = pd.DataFrame(dic)
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,3,5.0,7.0


In [115]:
# Add column with Moving Average
df["ma"] = df.rolling(window=2)['a'].mean()
df

Unnamed: 0,a,b,c,ma
0,1,4.0,,
1,2,,6.0,1.5
2,3,5.0,7.0,2.5


In [116]:
# df.expanding?

### 12. set options

In [117]:
pd.set_option('display.max_rows', 1000000)

In [118]:
pd.reset_option('display.max_rows')

### 13. apply() function

In [119]:
import pandas as pd
df = pd.DataFrame({'ChemicalID': ['AA', 'BB', 'AA'],
                   'structureID': [ [{'k1': 'v1'}, {'k2':'v2'}], ['k99_v99'], [{'k3': 'v3'}] ],
                   'Other': ['Other1', 'Other2', 'Other1']
                  })
df

Unnamed: 0,ChemicalID,structureID,Other
0,AA,"[{'k1': 'v1'}, {'k2': 'v2'}]",Other1
1,BB,[k99_v99],Other2
2,AA,[{'k3': 'v3'}],Other1


In [120]:
# Merge the per-row lists of each ChemicalID into one flat list per group.
# An explicit flattening function replaces the builtin `sum`: pandas swaps builtins for
# its own reductions and recent versions warn that the builtin will be called directly,
# where `sum` would fail on lists (0 + list).
df_reduce = (
    df.groupby(['ChemicalID'])['structureID']
      .apply(lambda lists: [item for sub in lists for item in sub])
      .reset_index(name='structureID')
)
df_reduce

Unnamed: 0,ChemicalID,structureID
0,AA,"[{'k1': 'v1'}, {'k2': 'v2'}, {'k3': 'v3'}]"
1,BB,[k99_v99]


In [121]:
# Before merge delete a un-needed column
df.drop('structureID', axis=1, inplace=True)
print('df after delete 1 col = \n', df)

df after delete 1 col = 
   ChemicalID   Other
0         AA  Other1
1         BB  Other2
2         AA  Other1


In [122]:
cols_chosen = ['ChemicalID', 'Other']
print(cols_chosen)
# df_new = df.groupby(cols_chosen)['PDB_ID'].count()
# df.groupby('A')['C'].apply(lambda x: "{%s}" % ', '.join(x))
df1 = df.groupby(cols_chosen)['ChemicalID'].apply(lambda x: ''.join(x)).reset_index(name='PDB_ID')
df1

['ChemicalID', 'Other']


Unnamed: 0,ChemicalID,Other,PDB_ID
0,AA,Other1,AAAA
1,BB,Other2,BB


In [123]:
df = pd.DataFrame({'B': [1, 2, 3], 'C': [4, 5, 6]})
def color_negative_red(val, threshold=4):
    """
    Takes a scalar and returns a string with
    the css property `'color: red'` for values
    below `threshold`, `'color: black'` otherwise.

    Despite the function's name the cut-off is not zero: the default
    `threshold` of 4 keeps the comparison this function has always used.
    """
    color = 'red' if val < threshold else 'black'
    return 'color: %s' % color

In [124]:
# Apply the colour rule to every cell of the frame.
# Styler.applymap was renamed to Styler.map in newer pandas; use whichever this version has.
styler = df.style
elementwise = styler.map if hasattr(styler, 'map') else styler.applymap
s = elementwise(color_negative_red)
s

Unnamed: 0,B,C
0,1,4
1,2,5
2,3,6


### 14. Read/Write data

In [126]:
# Need to specify input and output file paths
# df = pd.read_csv('data_test.csv', header=None) # header included by default
# df.to_csv('out.csv',index=False)
# df = pd.read_excel('Excel.xlsx', sheet_name='Sheet1')
# df.to_excel('out.xlsx',sheet_name='NewSheet')
# data = pd.read_html('a_link') # return a list