### Python

#### 星号用法

In [1]:
# The single star * unpacks the sequence/collection into positional arguments
# https://stackoverflow.com/questions/2921847/what-does-the-star-operator-mean
def sum(a, b):
    return a + b

values = (1, 2)
s = sum(*values) # ==> sum(1, 2)
print(s)

# The double star ** does the same, only using a dictionary and thus named arguments:
values = { 'a': 1, 'b': 2 }
ss = sum(**values) 
print(ss)

def sum(a, b, c, d):
    return a + b + c + d

values1 = (1, 2)
values2 = { 'c': 10, 'd': 15 }
sss = sum(*values1, **values2)
print(sss)

3
3
28


#### 多个list的合并

In [2]:
L=[[1,2,3],[4,5,6],[7,8,9]]
list1 = [ll for l in L for ll in l]
print(list1)

from itertools import chain
list2 = list(chain(*L))
print(list2)

[1, 2, 3, 4, 5, 6, 7, 8, 9]
[1, 2, 3, 4, 5, 6, 7, 8, 9]


### Pandas

In [3]:
import pandas as pd
import numpy as np

#### 改变dataframe中列的名字

In [4]:
d = {
        'one': [1, 2, 3, 4, 5],
        'two': [9, 8, 7, 6, 5],
        'three': ['a', 'b', 'c', 'd', 'e']
    }
df = pd.DataFrame(d)

names = df.columns.tolist()
names[names.index('two')] = 'new_name'
df.columns = names

print(df)

   one  new_name three
0    1         9     a
1    2         8     b
2    3         7     c
3    4         6     d
4    5         5     e


#### dataframe的赋值和copy

In [5]:
print(df)
df1 = df
df2 = df.copy()
df1.drop('one', axis=1, inplace=True)
print(df1)
print(df2)
print(df)

   one  new_name three
0    1         9     a
1    2         8     b
2    3         7     c
3    4         6     d
4    5         5     e
   new_name three
0         9     a
1         8     b
2         7     c
3         6     d
4         5     e
   one  new_name three
0    1         9     a
1    2         8     b
2    3         7     c
3    4         6     d
4    5         5     e
   new_name three
0         9     a
1         8     b
2         7     c
3         6     d
4         5     e


#### 对axis=0和axis=1的理解

In [6]:
df = pd.DataFrame([[1, 5, 1, 1], [2, 6, 2, 2], [3, 7, 3, 3]], columns=["col1", "col2", "col3", "col4"])
print(df)
print(df.mean(axis=1))
print(df.drop('col4', axis=1))

# 理解axis有问题，df.mean其实是在每一行上取所有列的均值，而不是保留每一列的均值。
# 也许简单的来记就是axis=0代表往跨行（down)，而axis=1代表跨列（across)

# 换句话说:
# 使用0值表示沿着每一列或行标签\索引值向下执行方法
# 使用1值表示沿着每一行或者列标签模向执行对应的方法

   col1  col2  col3  col4
0     1     5     1     1
1     2     6     2     2
2     3     7     3     3
0    2.0
1    3.0
2    4.0
dtype: float64
   col1  col2  col3
0     1     5     1
1     2     6     2
2     3     7     3


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

scaler1 = StandardScaler()
print(scaler1.fit(df))
print(scaler1.mean_)
print(scaler1.transform(df))

StandardScaler(copy=True, with_mean=True, with_std=True)
[2. 6. 2. 2.]
[[-1.22474487 -1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487  1.22474487]]


In [8]:
scaler2 = scale(df, axis=0)  # axis=0 by default
print(scaler2)
scaler2 = scale(df, axis=1)
print(scaler2)

[[-1.22474487 -1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487  1.22474487]]
[[-0.57735027  1.73205081 -0.57735027 -0.57735027]
 [-0.57735027  1.73205081 -0.57735027 -0.57735027]
 [-0.57735027  1.73205081 -0.57735027 -0.57735027]]


#### 改变dataframe中column的顺序

In [9]:
df = pd.DataFrame([[1, 5, 1, 1], [2, 6, 2, 2], [3, 7, 3, 3]], columns=["col1", "col2", "col3", "col4"])
df

Unnamed: 0,col1,col2,col3,col4
0,1,5,1,1
1,2,6,2,2
2,3,7,3,3


In [10]:
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]

In [11]:
df = df[cols]
df

Unnamed: 0,col4,col1,col2,col3
0,1,1,5,1
1,2,2,6,2
2,3,3,7,3


#### 选择dataframe的不连续列

In [12]:
# https://stackoverflow.com/questions/50143469/pandas-dataframe-select-multiple-discontinuous-columns-slices?noredirect=1&lq=1
df = pd.DataFrame([[1, 5, 1, 1], [2, 6, 2, 2], [3, 7, 3, 3]], columns=["col1", "col2", "col3", "col4"])

df.iloc[:, np.r_[0:2, 3]]

Unnamed: 0,col1,col2,col4
0,1,5,1
1,2,6,2
2,3,7,3


#### 计算dataframe某列和其他列的相关系数

In [13]:
df = pd.DataFrame([[1, 4, 3, 8], [2, 5, 2, 7], [3, 6, 10, 6]], columns=["col1", "col2", "col3", "col4"])
print(df.corr())
corr = df.corr()['col1']
corr

          col1      col2      col3      col4
col1  1.000000  1.000000  0.802955 -1.000000
col2  1.000000  1.000000  0.802955 -1.000000
col3  0.802955  0.802955  1.000000 -0.802955
col4 -1.000000 -1.000000 -0.802955  1.000000


col1    1.000000
col2    1.000000
col3    0.802955
col4   -1.000000
Name: col1, dtype: float64

#### 删除满足条件的行

In [14]:
df = pd.DataFrame([[1, 5, 1, 1], [2, 6, 2, 2], [3, 7, 3, 3]], columns=["col1", "col2", "col3", "col4"])
c = [1, 2]
print(df)
df[df['col1'].isin(c)]

   col1  col2  col3  col4
0     1     5     1     1
1     2     6     2     2
2     3     7     3     3


Unnamed: 0,col1,col2,col3,col4
0,1,5,1,1
1,2,6,2,2


#### 按列合并两个dataframe

In [15]:
df1 = pd.DataFrame({'a': [1, 1, 1, 2, 2, 2],
                    'b': [1, 2, 3, 1, 2, 3],
                    'c': [8, 8, 8, 8, 8, 8],
                    'd': [9, 9, 9, 9, 9, 9]})
print(df1)
df2 = pd.DataFrame({'a': [1, 1, 1, 1, 2, 2, 2],
                    'b': [1, 2, 3, 4, 1, 2, 3],
                    'e': [8, 8, 8, 8, 8, 8, 8],
                    'f': [9, 9, 9, 9, 9, 9, 9]})
print(df2)

t = pd.merge(df1, df2, on=['a', 'b'], how='outer')
print(t)

   a  b  c  d
0  1  1  8  9
1  1  2  8  9
2  1  3  8  9
3  2  1  8  9
4  2  2  8  9
5  2  3  8  9
   a  b  e  f
0  1  1  8  9
1  1  2  8  9
2  1  3  8  9
3  1  4  8  9
4  2  1  8  9
5  2  2  8  9
6  2  3  8  9
   a  b    c    d  e  f
0  1  1  8.0  9.0  8  9
1  1  2  8.0  9.0  8  9
2  1  3  8.0  9.0  8  9
3  2  1  8.0  9.0  8  9
4  2  2  8.0  9.0  8  9
5  2  3  8.0  9.0  8  9
6  1  4  NaN  NaN  8  9


### Numpy

#### 找出值为NA的位置

In [16]:
x = np.random.random((5,3))
x[0,1] = np.nan
x[4,2] = np.nan
print(x)

y = np.where(np.isnan(x))
x[y]

[[0.15542912        nan 0.50654312]
 [0.63508231 0.68601154 0.12920556]
 [0.02636353 0.53063566 0.51646592]
 [0.69764116 0.58714407 0.02450646]
 [0.10740189 0.42796334        nan]]


array([nan, nan])

#### np.newaxis使用

In [17]:
# np.newaxis 在使用和功能上等价于 None，为numpy.ndarray（多维数组）增加一个轴
x = np.arange(9)
print(x)
print(x.shape)
y = x[:, np.newaxis]
print(y.shape)
print(y)
z = x[:, None]
print(z.shape)
zz = x[np.newaxis, :]
print(zz)

[0 1 2 3 4 5 6 7 8]
(9,)
(9, 1)
[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]]
(9, 1)
[[0 1 2 3 4 5 6 7 8]]
