### Python

#### 星号用法

In [1]:
# The single star * unpacks a sequence/collection into positional arguments.
# https://stackoverflow.com/questions/2921847/what-does-the-star-operator-mean
# NOTE: the helpers are deliberately not named `sum`, so the built-in sum()
# keeps working in every later cell, and each helper has its own name instead
# of the second definition silently replacing the first.
def add_two(a, b):
    """Return a + b."""
    return a + b

values = (1, 2)
s = add_two(*values)  # same as add_two(1, 2)
print(s)

# The double star ** does the same with a dictionary: its items are passed as keyword arguments.
values = {'a': 1, 'b': 2}
ss = add_two(**values)  # same as add_two(a=1, b=2)
print(ss)

def add_four(a, b, c, d):
    """Return a + b + c + d."""
    return a + b + c + d

# Both forms can be combined in a single call.
values1 = (1, 2)
values2 = {'c': 10, 'd': 15}
sss = add_four(*values1, **values2)  # same as add_four(1, 2, c=10, d=15)
print(sss)

3
3
28


#### 多个list的合并

In [2]:
from itertools import chain

L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# Option 1: nested comprehension - the outer loop walks the sub-lists,
# the inner loop walks the items of each sub-list.
list1 = [item for sublist in L for item in sublist]
print(list1)

# Option 2: itertools.chain yields the items of each sub-list one after another.
list2 = list(chain.from_iterable(L))
print(list2)

[1, 2, 3, 4, 5, 6, 7, 8, 9]
[1, 2, 3, 4, 5, 6, 7, 8, 9]


### Pandas

In [3]:
# Libraries used by the Pandas and Numpy sections of this notebook.
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
# Ignore every warning from here on, presumably to keep the cell output uncluttered.
# NOTE(review): this is a blanket filter, so it also hides pandas warnings
# (e.g. about chained assignment) - confirm that this is intended.
warnings.filterwarnings('ignore')

#### describe用法

In [4]:
# Load seaborn's example dataset named "tips".
# NOTE(review): load_dataset may need network access on first use - verify for offline runs.
tips = sns.load_dataset("tips")
# include='all' summarises every column: the result has the categorical rows
# (unique/top/freq) as well as the numeric rows (mean/std/min/quantiles),
# with NaN wherever a statistic does not apply to a column.
tips.describe(include='all')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
count,244.0,244.0,244,244,244,244,244.0
unique,,,2,2,4,2,
top,,,Male,No,Sat,Dinner,
freq,,,157,151,87,176,
mean,19.785943,2.998279,,,,,2.569672
std,8.902412,1.383638,,,,,0.9511
min,3.07,1.0,,,,,1.0
25%,13.3475,2.0,,,,,2.0
50%,17.795,2.9,,,,,2.0
75%,24.1275,3.5625,,,,,3.0


#### 改变dataframe中列的名字

In [5]:
# Small frame whose columns are renamed in two different ways.
d = {
    'one': [1, 2, 3, 4, 5],
    'two': [9, 8, 7, 6, 5],
    'three': ['a', 'b', 'c', 'd', 'e'],
}
df = pd.DataFrame(d)

# Option 1: edit a list copy of the column labels and assign the whole list back.
names = df.columns.tolist()
position = names.index('two')
names[position] = 'new_name'
df.columns = names

print(df)

# Option 2: rename() takes an {old: new} mapping and returns the renamed frame.
df = df.rename(columns={'three': 'new_name2'})

   one  new_name three
0    1         9     a
1    2         8     b
2    3         7     c
3    4         6     d
4    5         5     e


#### dataframe的赋值和copy

In [6]:
# Plain assignment only binds a second name to the same DataFrame object,
# whereas copy() produces a separate frame.
print(df)
df1 = df         # alias of df
df2 = df.copy()  # separate copy
# Dropping in place through the alias also removes the column from df;
# df2 still has it.
df1.drop(columns='one', inplace=True)
for frame in (df1, df2, df):
    print(frame)

   one  new_name new_name2
0    1         9         a
1    2         8         b
2    3         7         c
3    4         6         d
4    5         5         e
   new_name new_name2
0         9         a
1         8         b
2         7         c
3         6         d
4         5         e
   one  new_name new_name2
0    1         9         a
1    2         8         b
2    3         7         c
3    4         6         d
4    5         5         e
   new_name new_name2
0         9         a
1         8         b
2         7         c
3         6         d
4         5         e


#### 改变dataframe中column的顺序

In [7]:
# Example frame with four integer columns, written column by column.
df = pd.DataFrame({
    "col1": [1, 2, 3],
    "col2": [5, 6, 7],
    "col3": [1, 2, 3],
    "col4": [1, 2, 3],
})
df

Unnamed: 0,col1,col2,col3,col4
0,1,5,1,1
1,2,6,2,2
2,3,7,3,3


In [8]:
# Rotate the column labels one step to the right: the last label moves to the front.
original_order = df.columns.tolist()
cols = original_order[-1:] + original_order[:-1]

In [9]:
# Select the columns in the order given by cols and rebind df to the result.
df = df.loc[:, cols]
df

Unnamed: 0,col4,col1,col2,col3
0,1,1,5,1
1,2,2,6,2
2,3,3,7,3


#### 选择dataframe的不连续列

In [10]:
# https://stackoverflow.com/questions/50143469/pandas-dataframe-select-multiple-discontinuous-columns-slices?noredirect=1&lq=1
rows = [[1, 5, 1, 1], [2, 6, 2, 2], [3, 7, 3, 3]]
df = pd.DataFrame(rows, columns=["col1", "col2", "col3", "col4"])

# np.r_ joins the slice 0:2 and the single position 3 into one index array (0, 1, 3),
# so iloc picks the first two columns plus the fourth.
wanted_positions = np.r_[0:2, 3]
df.iloc[:, wanted_positions]

Unnamed: 0,col1,col2,col4
0,1,5,1
1,2,6,2
2,3,7,3


#### 计算dataframe某列和其他列的相关系数

In [11]:
df = pd.DataFrame(
    [[1, 4, 3, 8], [2, 5, 2, 7], [3, 6, 10, 6]],
    columns=["col1", "col2", "col3", "col4"],
)
# Pairwise correlation between all columns.
corr_matrix = df.corr()
print(corr_matrix)
# The 'col1' column of that matrix is col1's correlation with every column.
corr = corr_matrix['col1']
corr

          col1      col2      col3      col4
col1  1.000000  1.000000  0.802955 -1.000000
col2  1.000000  1.000000  0.802955 -1.000000
col3  0.802955  0.802955  1.000000 -0.802955
col4 -1.000000 -1.000000 -0.802955  1.000000


col1    1.000000
col2    1.000000
col3    0.802955
col4   -1.000000
Name: col1, dtype: float64

#### 按条件筛选行（对条件取反即可删除满足条件的行）

In [12]:
c = [1, 2]
df = pd.DataFrame(
    [[1, 5, 1, 1], [2, 6, 2, 2], [3, 7, 3, 3]],
    columns=["col1", "col2", "col3", "col4"],
)
# True for the rows whose col1 value appears in c.
in_c = df['col1'].isin(c)
print(df)
# Indexing with the mask keeps the matching rows; df[~in_c] would drop them instead.
df[in_c]

   col1  col2  col3  col4
0     1     5     1     1
1     2     6     2     2
2     3     7     3     3


Unnamed: 0,col1,col2,col3,col4
0,1,5,1,1
1,2,6,2,2


#### 按列合并两个dataframe

In [13]:
# Two frames that share the key columns 'a' and 'b';
# df2 contains one key pair, (1, 4), that df1 does not have.
df1 = pd.DataFrame({
    'a': [1, 1, 1, 2, 2, 2],
    'b': [1, 2, 3, 1, 2, 3],
    'c': [8, 8, 8, 8, 8, 8],
    'd': [9, 9, 9, 9, 9, 9],
})
print(df1)
df2 = pd.DataFrame({
    'a': [1, 1, 1, 1, 2, 2, 2],
    'b': [1, 2, 3, 4, 1, 2, 3],
    'e': [8, 8, 8, 8, 8, 8, 8],
    'f': [9, 9, 9, 9, 9, 9, 9],
})
print(df2)

# Outer join on both keys: a key pair found in either frame is kept, and the
# columns coming from the frame that lacks it are filled with NaN.
t = df1.merge(df2, on=['a', 'b'], how='outer')
print(t)

   a  b  c  d
0  1  1  8  9
1  1  2  8  9
2  1  3  8  9
3  2  1  8  9
4  2  2  8  9
5  2  3  8  9
   a  b  e  f
0  1  1  8  9
1  1  2  8  9
2  1  3  8  9
3  1  4  8  9
4  2  1  8  9
5  2  2  8  9
6  2  3  8  9
   a  b    c    d  e  f
0  1  1  8.0  9.0  8  9
1  1  2  8.0  9.0  8  9
2  1  3  8.0  9.0  8  9
3  2  1  8.0  9.0  8  9
4  2  2  8.0  9.0  8  9
5  2  3  8.0  9.0  8  9
6  1  4  NaN  NaN  8  9


#### 改变dataframe中某个元素的值

In [14]:
# Fresh example frame for the element-assignment demo.
column_names = ["col1", "col2", "col3", "col4"]
df = pd.DataFrame(data=[[1, 5, 1, 1], [2, 6, 2, 2], [3, 7, 3, 3]], columns=column_names)
df

Unnamed: 0,col1,col2,col3,col4
0,1,5,1,1
1,2,6,2,2
2,3,7,3,3


In [15]:
# This does not work: the value is assigned through two chained indexing steps,
# df[mask] first and then ['col2'], and df itself is left unchanged
# (the first print below still shows col2 == 5 in the first row).
# NOTE(review): pandas normally warns about this pattern; presumably the warning
# is hidden by the warnings filter set earlier in this notebook - verify.
df[df['col1']==1]['col2'] = 999
print(df)

# This works: a single .loc call selects the rows and the column together,
# so the assignment is written into df.
df.loc[df['col1']==1, 'col2'] = 999
print(df)

   col1  col2  col3  col4
0     1     5     1     1
1     2     6     2     2
2     3     7     3     3
   col1  col2  col3  col4
0     1   999     1     1
1     2     6     2     2
2     3     7     3     3


#### 找出行列出现频率最高的值

In [16]:
df = pd.DataFrame(
    [[1, 5, 1, 1], [1, 6, 2, 2], [3, 7, 3, 3]],
    columns=["col1", "col2", "col3", "col4"],
)
print(df)

# value_counts() tallies each distinct value; idxmax() returns the value with the largest tally.
col1_counts = df['col1'].value_counts()
print(col1_counts.idxmax())
# The same works for a row once iloc has pulled it out as a Series.
first_row_counts = df.iloc[0].value_counts()
print(first_row_counts.idxmax())

   col1  col2  col3  col4
0     1     5     1     1
1     1     6     2     2
2     3     7     3     3
1
1


### Numpy

#### 找出值为NA的位置

In [17]:
# Use a seeded generator so that the printed matrix is the same on every run
# (the unseeded np.random.random produced different numbers each time).
SEED = 0
rng = np.random.default_rng(SEED)
x = rng.random((5, 3))
# Plant two NaN values at known positions.
x[0, 1] = np.nan
x[4, 2] = np.nan
print(x)

# np.where on the boolean NaN mask returns a tuple (row_indices, column_indices);
# here that is rows [0, 4] and columns [1, 2].
y = np.where(np.isnan(x))
# Indexing with that tuple picks exactly the NaN entries.
x[y]

[[0.73881668        nan 0.40006361]
 [0.63644186 0.8834226  0.96823432]
 [0.86137703 0.77400244 0.60982026]
 [0.48235478 0.68150567 0.83684891]
 [0.95784686 0.02054287        nan]]


array([nan, nan])

#### axis的方向

In [18]:
# A 2x3 grid holding 0..5, wrapped in a DataFrame with named columns.
columns = ['col1', 'col2', 'col3']
d = np.arange(6).reshape((2, 3))
df = pd.DataFrame(data=d, columns=columns)
df

Unnamed: 0,col1,col2,col3
0,0,1,2
1,3,4,5


In [19]:
# axis=0 works down the rows (one result per column);
# axis=1 works across the columns (one result per row).
x = df.sum(axis=0)
print(x)
y = df.sum(axis=1)
print(y)
m = df.mean(axis=0)
print(m)
n = df.mean(axis=1)
print(n)

col1    3
col2    5
col3    7
dtype: int64
0     3
1    12
dtype: int64
col1    1.5
col2    2.5
col3    3.5
dtype: float64
0    1.0
1    4.0
dtype: float64


In [20]:
# With axis=1 the label names a column, so the result lacks 'col1';
# without inplace=True, df itself is left as it was.
df.drop(labels='col1', axis=1)

Unnamed: 0,col2,col3
0,1,2
1,4,5


In [21]:
# However, an axis value must stay below the number of dimensions of the array:
# a 1-D array only has axis 0.
d = np.array([1, 2, 3])
np.mean(d, axis=0)

2.0

In [22]:
# A more involved example: the numbers 0..17 arranged as 2 blocks of 3x3.
data = np.arange(18).reshape((2, 3, 3))
data

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]]])

In [23]:
# Reducing along an axis removes that axis from the shape of the result.
# For each axis in turn, print the sum and then the mean along it.
for axis in (0, 1, 2):
    print(data.sum(axis=axis))
    print(data.mean(axis=axis))

[[ 9 11 13]
 [15 17 19]
 [21 23 25]]
[[ 4.5  5.5  6.5]
 [ 7.5  8.5  9.5]
 [10.5 11.5 12.5]]
[[ 9 12 15]
 [36 39 42]]
[[ 3.  4.  5.]
 [12. 13. 14.]]
[[ 3 12 21]
 [30 39 48]]
[[ 1.  4.  7.]
 [10. 13. 16.]]


#### np.newaxis使用

In [24]:
# np.newaxis can be used interchangeably with None; it inserts a new axis into a numpy.ndarray.
x = np.arange(9)
y = x[:, np.newaxis]   # new trailing axis  -> column shape (9, 1)
z = x[:, None]         # None gives the same result as np.newaxis
zz = x[np.newaxis, :]  # new leading axis   -> row shape (1, 9)

print(x)
print(x.shape)
print(y.shape)
print(y)
print(z.shape)
print(zz)

[0 1 2 3 4 5 6 7 8]
(9,)
(9, 1)
[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]]
(9, 1)
[[0 1 2 3 4 5 6 7 8]]
