## Pandas experience

### 1. Creating a DataFrame in various ways

In [None]:
import pandas as pd
import numpy as np

#### 1.1. Making df from dict where value is a list

In [None]:
# Column-oriented input: every key becomes a column name and its list
# supplies that column's values, top to bottom.
dic = dict(
    a=[1, 2, 3],
    b=[4, np.nan, 5],
    c=['', 6, 7],
)
df = pd.DataFrame(data=dic)
# To use custom row labels instead of the default 0..n-1:
# df = pd.DataFrame(dic, index=[1, 2, 3])
df

#### 1.2. Making df from list of dics

In [None]:
# Row-oriented input: one dict per row, keyed by column name.
dics = [
    dict(a=1, b=4, c=''),
    dict(a=2, b=np.nan, c=6),
    dict(a=3, b=5, c=7),
]
df = pd.DataFrame(data=dics)
df

#### 1.3. Making df from list of list

In [None]:
# Each inner list is one row of the frame; column names are given separately.
list_of_list = [[1, 4, ''], [2, np.nan, 6], [3, 5, 7]]
df = pd.DataFrame(
    data=list_of_list,
    columns=['a', 'b', 'c'],
    # index=[1, 2, 3],  # optional custom row labels
)
df

#### 1.4. Making df with a MultiIndex

In [None]:
# Same column data as before, but the rows are labelled by a two-level
# index: level 'i1' holds the letters, level 'i2' the numbers.
dic = dict(
    a=[1, 2, 3],
    b=[4, np.nan, 5],
    c=['', 6, 7],
)
df = pd.DataFrame(
    dic,
    index=pd.MultiIndex.from_tuples(
        [('d', 1), ('d', 2), ('e', 2)],
        names=['i1', 'i2'],
    ),
)
df

### 2. Summary statistics of df

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
df = sns.load_dataset('iris')
df.shape

In [None]:
df.head(10)

In [None]:
df['species'].value_counts()

In [None]:
len(df)

In [None]:
len(df) == df.shape[0]

In [None]:
df['species'].nunique()

In [None]:
df.describe(include='all')

In [None]:
# Summarise only the object (string/categorical-like) columns.
# The builtin `object` is used because the `np.object` alias has been
# removed from NumPy and raises AttributeError on current releases.
df.describe(include=[object])

In [None]:
# Summarise every column except the object (string) ones.
# The builtin `object` replaces the removed `np.object` alias.
df.describe(exclude=[object])

In [None]:
df.describe(include=[np.number])

In [None]:
df['petal_width'].sum()

In [None]:
df['petal_width'].count()

In [None]:
# Median of each numeric column. numeric_only=True skips the string column
# 'species', which would otherwise make the reduction raise on pandas >= 2.0.
df.median(numeric_only=True)

In [None]:
# Mean of each numeric column. numeric_only=True skips the string column
# 'species', which would otherwise make the reduction raise on pandas >= 2.0.
df.mean(numeric_only=True)

In [None]:
df['petal_width'].quantile([0.25,0.75])

In [None]:
df.min()

In [None]:
df.max()

In [None]:
# Variance of each numeric column. numeric_only=True skips the string
# column 'species', which would otherwise make the reduction raise on
# pandas >= 2.0.
df.var(numeric_only=True)

In [None]:
# Standard deviation of each numeric column. numeric_only=True skips the
# string column 'species', which would otherwise make the reduction raise
# on pandas >= 2.0.
df.std(numeric_only=True)

### 3. Access a subset of df

In [None]:
dic =  {"a" : [1 ,2, 3],
        "b" : [4, np.nan, 5],
        "c" : ['', 6, 7]}
df = pd.DataFrame(dic)
df

In [None]:
df[df['a'] < 2]

In [None]:
df[df['c'] != 7]

In [None]:
df['a'].isin([2, 3])

In [None]:
pd.isnull(df)

In [None]:
df['b'].isnull().sum()   # count the null value in 'b' column

In [None]:
pd.notnull(df)

In [None]:
df.notnull().sum()

In [None]:
df['a'].notnull()

> Logic: &, |, ~, ^, df.any(), df.all(), and, or, not, xor, any, all, isna()

In [None]:
df

In [None]:
df.isna()

In [None]:
df.isna().any(axis=None)

In [None]:
# Rows where b == 3 OR a == 5. The two boolean masks are combined first and
# the frame is filtered once; OR-ing two already-filtered frames would
# operate on the cell values instead of selecting rows.
# With the frame built at the top of this section no row matches, so the
# result is empty.
df[(df.b == 3) | (df.a == 5)]

In [None]:
df.sample(frac=0.3)   # sample(): Return a random sample of items from an axis of object.

In [None]:
df.sample(n=2)

In [None]:
df.loc[[1, 2], ["a", "b"]]  # df.loc[row labels: a list, column labels: a list]

In [None]:
df.iloc[-2:]  # last two rows; negative positions count from the end (-1 is the last row)

In [None]:
df.nlargest(1, 'a')

In [None]:
df.nlargest(2, 'a')  # Return the first n rows with the largest values in columns, in descending order.

In [None]:
df.nsmallest(1, 'a')

In [None]:
# logical - combine
df.loc[(df['a'] >= 2) & (df['b'] <= 101)]

In [None]:
#check for None 
df.loc[df['b'].isnull()].head(1)

In [None]:
df.loc[~df['b'].isnull()].head(1)

### 4. Access to columns of df

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
df = sns.load_dataset("iris")
df.head()

In [None]:
columns = ['sepal_width','sepal_length','species']
df[columns].head(3)

In [None]:
df.filter(regex='length$')  # all columns whose name ends with 'length'

In [None]:
df.filter(regex='^sepal' )  # all columns whose name begins with 'sepal'

In [None]:
df.filter(regex='^(?!species$).*' )  # except species

In [None]:
df.loc[2:3,'sepal_width':'petal_width']

In [None]:
df.iloc[:3,[1,3]]

In [None]:
df.loc[df['sepal_length'] > 5, ['sepal_length','sepal_width']].head()

### 5. Handle missing data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Small frame with missing values: column 'c' is entirely NaN and
# column 'd' has no NaN at all.
df = pd.DataFrame(
    data=[
        [np.nan, 9, np.nan, 0],
        [2, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 10],
    ],
    columns=['a', 'b', 'c', 'd'],
)
df

In [None]:
df.dropna(axis=1, how='any')  # axis=1: column

In [None]:
df.dropna(axis=0, how='all')  # axis=0: row

In [None]:
df.fillna('filled_by_this_value')

In [None]:
df = pd.DataFrame([[np.nan, 9, np.nan, 0], [2, 4, np.nan, 1],
                  [np.nan, np.nan, np.nan, 10]],
                  columns=list('abcd'))
df

In [None]:
values = {'a': 0, 'b': 1, 'c': 'Added it', 'd': 3}
df.fillna(value=values)

In [None]:
df.isnull().sum()

In [None]:
df.notnull().sum()

### 6. Add new column, delete columns

In [None]:
dics = [
    {'a': 10, 'b': 4, 'c': ''},
    {'a': 2, 'b': np.nan, 'c': 6},
    {'a': 3, 'b': 5, 'c': 7}
]
df = pd.DataFrame(dics)
df

In [None]:
# df.assign?  # to read documentation of df.assign

In [None]:
# Add the natural log of column 'a' as a new column 'ln_a'.
# assign() returns a new frame, so the result is bound back to df.
# In-place alternative: df['ln_a'] = np.log(df['a'])
df = df.assign(ln_a=np.log(df['a']))
df

In [None]:
_df = pd.qcut(df['a'], 2, labels=["good", "bad"])  # Quantile-based discretization function
                                                   # 2 quantile bins, split at the median of 'a':
                                                   # lower bin -> "good", upper bin -> "bad"
_df

In [None]:
# axis=0 reduces down the rows: one maximum per column.
# numeric_only=True skips column 'c', whose mix of '' and integers cannot
# be ordered and would otherwise raise a TypeError.
df.max(axis=0, numeric_only=True)

In [None]:
# axis=1 reduces across the columns: one maximum per row.
# numeric_only=True skips column 'c', whose mix of '' and integers cannot
# be compared with the numeric columns and would otherwise raise a TypeError.
df.max(axis=1, numeric_only=True)

In [None]:
# axis=0 reduces down the rows: one minimum per column.
# numeric_only=True skips column 'c', whose mix of '' and integers cannot
# be ordered and would otherwise raise a TypeError.
df.min(axis=0, numeric_only=True)

In [None]:
df["ln_a"].clip(lower=0.5,upper=0.7)  # <0.5 --> 0.5; >0.7 --> 0.7

In [None]:
df["a"].abs()

In [None]:
# Delete using drop() 
df = df.drop(['a'], axis=1)
# Delete using del 
# del df['column_to_remove']
df

### 7. Reshape data

In [9]:
import pandas as pd
import numpy as np

dics = [
    {'a': 3, 'b': 4, 'c': '6'},
    {'a': 2, 'b': np.nan, 'c': 6},
    {'a': 3, 'b': 5, 'c': 7}
]
df = pd.DataFrame(dics)
df

Unnamed: 0,a,b,c
0,3,4.0,6
1,2,,6
2,3,5.0,7


In [10]:
# df.sort_values?

In [12]:
df.sort_values(['a', 'c'], ascending=[False, True], inplace=True)
df

Unnamed: 0,a,b,c
2,3,5.0,7
0,3,4.0,6
1,2,,6


In [13]:
df = df.rename(columns = {'b': 'BB','c':'CC'})
df

Unnamed: 0,a,BB,CC
2,3,5.0,7
0,3,4.0,6
1,2,,6


In [14]:
df.sort_index()

Unnamed: 0,a,BB,CC
0,3,4.0,6
1,2,,6
2,3,5.0,7


In [15]:
# df.reset_index?

In [16]:
df.drop(columns=['BB'], inplace=True)
df

Unnamed: 0,a,CC
2,3,7
0,3,6
1,2,6


In [None]:
# pd.melt? Unpivot: change column to row

In [17]:
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})
df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [None]:
# Unpivot. change column B to rows
pd.melt(df, id_vars=['A'], value_vars=['B'])

In [18]:
df = pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [19]:
df.groupby("variable").sum()

Unnamed: 0_level_0,value
variable,Unnamed: 1_level_1
B,9
C,12


In [20]:
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [21]:
df.groupby(['A', 'variable'])['value'].sum()

A  variable
a  B           1
   C           2
b  B           3
   C           4
c  B           5
   C           6
Name: value, dtype: int64

In [22]:
#### count duplicated values in one or more columns
df_count = df.groupby(['A']).size().reset_index(name='count')
df_count

Unnamed: 0,A,count
0,a,2
1,b,2
2,c,2


In [23]:
df

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [24]:
# Pivot: change rows to columns
df.pivot(columns='variable', values='value')

variable,B,C
0,1.0,
1,3.0,
2,5.0,
3,,2.0
4,,4.0
5,,6.0


### 8. Concat

In [25]:
df1 = pd.DataFrame([['a', 1], ['b', 2]],
                   columns=['letter', 'number'])
df1

Unnamed: 0,letter,number
0,a,1
1,b,2


In [26]:
df2 = pd.DataFrame([['c', 3], ['d', 4]],
                   columns=['letter', 'number'])
df2

Unnamed: 0,letter,number
0,c,3
1,d,4


In [27]:
df = pd.concat([df1, df2])
df

Unnamed: 0,letter,number
0,a,1
1,b,2
0,c,3
1,d,4


### 9. Merge dataframes

In [28]:
df1 = pd.DataFrame({"x1": ["A", "B", "C"], "x2": [9, 99, 999]})
df1

Unnamed: 0,x1,x2
0,A,9
1,B,99
2,C,999


In [29]:
df2 = pd.DataFrame({"x1": ["A", "B", "D"], "x3": ["Test1", "Test2", "Test3"]})
df2

Unnamed: 0,x1,x3
0,A,Test1
1,B,Test2
2,D,Test3


In [30]:
df = pd.merge(df1, df2, how='left', on='x1')  # left_on and right_on if not the same column names
df

Unnamed: 0,x1,x2,x3
0,A,9,Test1
1,B,99,Test2
2,C,999,


In [31]:
df = pd.merge(df1, df2, how='right', on='x1')  #
df

Unnamed: 0,x1,x2,x3
0,A,9.0,Test1
1,B,99.0,Test2
2,D,,Test3


In [32]:
df = pd.merge(df1, df2, how='inner', on='x1')  #
df

Unnamed: 0,x1,x2,x3
0,A,9,Test1
1,B,99,Test2


In [33]:
df = pd.merge(df1, df2, how='outer', on='x1')  #
df

Unnamed: 0,x1,x2,x3
0,A,9.0,Test1
1,B,99.0,Test2
2,C,999.0,
3,D,,Test3


In [34]:
df1.x1.isin(df2.x1)  # isin([a list])

0     True
1     True
2    False
Name: x1, dtype: bool

In [35]:
df1[df1.x1.isin(df2.x1)]

Unnamed: 0,x1,x2
0,A,9
1,B,99


In [36]:
df1[~df1.x1.isin(df2.x1)]

Unnamed: 0,x1,x2
2,C,999


### 10. Rank

In [37]:
# Column 'a' contains a tie (two 1s), which makes it useful for ranking.
dic = dict(
    a=[1, 2, 1],
    b=[4, np.nan, 5],
    c=['', 6, 7],
)
df = pd.DataFrame(data=dic)
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,1,5.0,7.0


In [40]:
df_rank = df["a"].rank(method='dense')  # method : {‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}, default ‘average’
df_rank

0    1.0
1    2.0
2    1.0
Name: a, dtype: float64

In [41]:
# All-integer 3x3 frame with row labels 1..3, used for the cumulative
# operations below.
df2 = pd.DataFrame(
    data=[
        [4, 7, 10],
        [5, 11, 8],
        [6, 9, 12],
    ],
    columns=['a', 'b', 'c'],
    index=[1, 2, 3],
)
df2

Unnamed: 0,a,b,c
1,4,7,10
2,5,11,8
3,6,9,12


In [42]:
df2.cumsum()   # cumulative sum for each column

Unnamed: 0,a,b,c
1,4,7,10
2,9,18,18
3,15,27,30


In [43]:
df2.cummax()

Unnamed: 0,a,b,c
1,4,7,10
2,5,11,10
3,6,11,12


In [44]:
df2.cummin()

Unnamed: 0,a,b,c
1,4,7,10
2,4,7,8
3,4,7,8


In [45]:
df2.cumprod()

Unnamed: 0,a,b,c
1,4,7,10
2,20,77,80
3,120,693,960


In [46]:
df2['b'].shift(2)

1    NaN
2    NaN
3    7.0
Name: b, dtype: float64

### 11. Moving average

In [51]:
import pandas as pd
dic =  {"a" : [1 , 2, 3],
        "b" : [4, np.nan, 5],
        "c" : ['', 6, 7]}
df = pd.DataFrame(dic)
df

Unnamed: 0,a,b,c
0,1,4.0,
1,2,,6.0
2,3,5.0,7.0


In [52]:
# Moving average of 'a' over a window of 2 rows, stored as column 'ma'.
# The first row has no complete window, so its value is NaN.
df["ma"] = df['a'].rolling(window=2).mean()
df

Unnamed: 0,a,b,c,ma
0,1,4.0,,
1,2,,6.0,1.5
2,3,5.0,7.0,2.5


In [None]:
# df.expanding?

### 12. set options

In [54]:
pd.set_option('display.max_rows', 1000000)

In [55]:
pd.reset_option('display.max_rows')

### 13. apply() function

In [64]:
import pandas as pd
df = pd.DataFrame({'ChemicalID': ['AA', 'BB', 'AA'],
                   'structureID': [ [{'k1': 'v1'}, {'k2':'v2'}], ['k99_v99'], [{'k3': 'v3'}] ],
                   'Other': ['Other1', 'Other2', 'Other1']
                  })
df

Unnamed: 0,ChemicalID,structureID,Other
0,AA,"[{'k1': 'v1'}, {'k2': 'v2'}]",Other1
1,BB,[k99_v99],Other2
2,AA,[{'k3': 'v3'}],Other1


In [65]:
# Concatenate the per-row lists of every ChemicalID into one list per group.
# An explicit flattening function is used: passing the builtin `sum` only
# works because pandas substitutes its own sum for it, a substitution that
# newer pandas versions deprecate.
df_reduce = (
    df.groupby(['ChemicalID'])['structureID']
    .apply(lambda parts: [item for part in parts for item in part])
    .reset_index(name='structureID')
)
df_reduce

Unnamed: 0,ChemicalID,structureID
0,AA,"[{'k1': 'v1'}, {'k2': 'v2'}, {'k3': 'v3'}]"
1,BB,[k99_v99]


In [66]:
# Before merging, delete an unneeded column
df.drop('structureID', axis=1, inplace=True)
print('df after delete 1 col = \n', df)

df after delete 1 col = 
   ChemicalID   Other
0         AA  Other1
1         BB  Other2
2         AA  Other1


In [67]:
cols_chosen = ['ChemicalID', 'Other']
print(cols_chosen)
# df_new = df.groupby(cols_chosen)['PDB_ID'].count()
# df.groupby('A')['C'].apply(lambda x: "{%s}" % ', '.join(x))
df1 = df.groupby(cols_chosen)['ChemicalID'].apply(lambda x: ''.join(x)).reset_index(name='PDB_ID')
df1

['ChemicalID', 'Other']


Unnamed: 0,ChemicalID,Other,PDB_ID
0,AA,Other1,AAAA
1,BB,Other2,BB


In [68]:
df = pd.DataFrame({'B': [1, 2, 3], 'C': [4, 5, 6]})
def color_negative_red(val, threshold=4):
    """
    Return the CSS text-colour declaration for a single cell value.

    Despite its name, the function does not test for negative numbers:
    values strictly below `threshold` are coloured red, everything else
    black.

    Parameters
    ----------
    val : scalar
        The cell value; must support `<` against `threshold`.
    threshold : scalar, default 4
        Values strictly below this are coloured red.

    Returns
    -------
    str
        `'color: red'` or `'color: black'`.
    """
    color = 'red' if val < threshold else 'black'
    return 'color: %s' % color

In [69]:
# Apply the colouring function to every cell.
# Styler.applymap was renamed to Styler.map in newer pandas; use whichever
# this pandas version provides so the cell runs without a deprecation warning.
styler = df.style
apply_elementwise = getattr(styler, 'map', None) or styler.applymap
s = apply_elementwise(color_negative_red)
s

Unnamed: 0,B,C
0,1,4
1,2,5
2,3,6


### 14. Read/Write data

In [None]:
# header=None: the file has no header row, so columns are numbered 0..n-1
# (by default the first row would be used as the column names).
df = pd.read_csv('data_test.csv', header=None)
df.to_csv('out.csv', index=False)  # index=False: do not write the row index
# The keyword is `sheet_name`; the old `sheetname` spelling is not accepted
# by current pandas.
df = pd.read_excel('Excel.xlsx', sheet_name='Sheet1')
df.to_excel('out.xlsx', sheet_name='NewSheet')
data = pd.read_html('a_link')  # returns a list of DataFrames