Pandas

[pandas官方文档](https://pandas.pydata.org/docs/user_guide/index.html)

In [6]:
import numpy as np
import pandas as pd
import json

# Series和DataFrame

## Series

In [4]:
# A Series built from a plain list receives a default integer index (0, 1, 2).
sr = pd.Series(data=[1, 2, 3])
sr

0    1
1    2
2    3
dtype: int64

### Series操作

In [42]:
sr[0]  # an integer key on the default integer index returns the scalar stored under label 0

1

In [41]:
sr[[0,1]]  # a list of labels returns a Series holding just those entries

0    1
1    2
dtype: int64

## DataFrame

In [8]:
# Each inner list becomes one row; with no column names given, the columns are labelled 0 and 1.
df1 = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]])
df1

Unnamed: 0,0,1
0,1,2
1,3,4
2,5,6


In [9]:
df1[0] # plain [] on a DataFrame looks up a column label. Frames normally define named columns, so integer labels like this are uncommon

0    1
1    3
2    5
Name: 0, dtype: int64

In [10]:
df1[[0,1]]# a list inside [] is interpreted as column labels as well

Unnamed: 0,0,1
0,1,2
1,3,4
2,5,6


In [11]:
# Same construction as before, this time naming the columns explicitly.
df2 = pd.DataFrame(data=[[1, 2], [3, 4]], columns=["a", "b"])
df2

Unnamed: 0,a,b
0,1,2
1,3,4


In [12]:
df2.index  # row labels; a RangeIndex here because no index was supplied

RangeIndex(start=0, stop=2, step=1)

In [13]:
df2.columns  # column labels

Index(['a', 'b'], dtype='object')

In [14]:
df2.values # the data as a two-dimensional array

array([[1, 2],
       [3, 4]])

In [15]:
df2.dtypes  # dtype of every column

a    int64
b    int64
dtype: object

### DataFrame操作

In [16]:
#df2[0] # [] looks up column labels, so an integer key only works when the columns themselves are integers

In [86]:
df2['b'] # select column b; a single label returns a Series

0    2
1    4
Name: b, dtype: int64

In [87]:
df2[['a','b']] # select two columns

Unnamed: 0,a,b
0,1,2
1,3,4


In [17]:
df2[[True,False]] # boolean indexing filters rows, not columns

Unnamed: 0,a,b
0,1,2


In [56]:
df2[0:1] # a slice selects rows 0 up to 1, with 1 excluded

Unnamed: 0,a,b
0,1,2


In [18]:
### Double brackets keep the original dimensionality, single brackets reduce it - but check whether what sits inside the brackets is itself list-like.

df2[ df2['a']>2 ]

df2[ (df2['a']>0) & (df2['b']>0) ]# when combining two conditions, wrap each one in parentheses

df2[ ~(df2['a']>2)] # negation needs the parentheses as well

Unnamed: 0,a,b
0,1,2


In [19]:
# A frame with both column names and explicit string row labels.
df3 = pd.DataFrame(data=[[1, 2], [3, 4]], index=["A", "B"], columns=["a", "b"])
df3

Unnamed: 0,a,b
A,1,2
B,3,4


In [20]:
df3['A':'B'] # slicing with non-integer labels includes both endpoints

Unnamed: 0,a,b
A,1,2
B,3,4


In [89]:
## df[[col1]] and df[col1] both yield a single column, but they differ in kind: the former is still a DataFrame, the latter is a Series

# 数据存储与读取

## CSV

### read_csv()

In [41]:
# Load the sample CSV over HTTP; 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/350_0af71498f139412cb602f67a18371098.csv",encoding='utf-8-sig')
#df.info()
df.head(1)

Unnamed: 0,csv_batch_id,project_id,lang,file_id,session_id,person_info_id,audio_url,origin_duration,pre_content_text,pre_audio_type,pre_regions
0,350,3532,Urdu,18060742,115928,3374,appen://3532_Ake__URD_IND/Staging/AMR_DATA/Ake...,5.4,تم کو عضو کا استعمال کب سے جانتے ہو,TYKY,"[{""start"": 1.405, ""end"": 4.805}]"


In [36]:
# index_col turns the file_id column into the row index.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/350_0af71498f139412cb602f67a18371098.csv",
                 index_col = "file_id",
                 encoding='utf-8-sig')
df.loc[18060742]  # loc by index
# a column used as the index no longer appears among the output fields
df.loc[18060742].to_dict()

{'csv_batch_id': 350,
 'project_id': 3532,
 'lang': 'Urdu',
 'session_id': 115928,
 'person_info_id': 3374,
 'audio_url': 'appen://3532_Ake__URD_IND/Staging/AMR_DATA/Ake_0Urdu_URD_IND_URD24071_20220111-060628/Ake_0Urdu_URD_IND_URD24071_20220111-060628_0056_RDTYKY000480_01.wav',
 'origin_duration': 5.4,
 'pre_content_text': 'تم کو عضو کا استعمال کب سے جانتے ہو',
 'pre_audio_type': 'TYKY',
 'pre_regions': '[{"start": 1.405, "end": 4.805}]'}

In [40]:
# usecols restricts the load to the listed columns.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/350_0af71498f139412cb602f67a18371098.csv",
                 usecols= ["project_id","lang"],
                 encoding='utf-8-sig')
df.head(1)

Unnamed: 0,project_id,lang
0,3532,Urdu


### 扩展dtypes

#### 默认

In [186]:
# With no dtype argument, pandas infers a dtype for every column.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/350_0af71498f139412cb602f67a18371098.csv",
                 encoding='utf-8-sig')
df.dtypes

csv_batch_id          int64
project_id            int64
lang                 object
file_id               int64
session_id            int64
person_info_id        int64
audio_url            object
origin_duration     float64
pre_content_text     object
pre_audio_type       object
pre_regions          object
dtype: object

In [42]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df ## pandas infers the dtypes automatically; they can also be specified explicitly
df.dtypes

text_content    object
flag             int64
full_text       object
dtype: object

#### 根据值做可能转换

In [59]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df =df.convert_dtypes()
df.dtypes # the object columns were converted to string

text_content    string
flag             Int64
full_text       string
dtype: object

#### 读取文件的时候指定dtype

In [56]:
# Request dtypes at read time: Python str for flag, the pandas "string" dtype for text_content.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv",
                dtype={"flag":str,"text_content":"string"})
df
# df.dtypes ## note: text_content gets the "string" dtype, while flag (requested as str) is reported as object

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [57]:
df.dtypes ## note: text_content has the "string" dtype, while flag (requested as str) is reported as object

text_content    string
flag            object
full_text       object
dtype: object

In [58]:
df["text_content"]  # under the string dtype, missing values display as <NA>

0                              this is a test line one
1    this is a test line two, this is a test line t...
2                                                 <NA>
3                                                 <NA>
Name: text_content, dtype: string

### to_csv()

In [246]:
# Write the frame to a local CSV file.
df.to_csv(
    "test_01.csv",
    sep=",",# the default separator
    encoding='utf-8-sig', # the default is 'utf-8'
    index=False # defaults to True; usually worth passing
)

## Excel

### read_excel()

In [None]:
# Reference call listing commonly used pd.read_excel parameters.
# `io` is a placeholder: bind it to a path, URL or file-like object before running this cell.
pd.read_excel(
    io,  # path or URL of the workbook
    sheet_name=0,  # default: first sheet. int, str, list or None; a list or None (= all sheets) returns a dict
    header=0,  # default: first row holds the column labels; pass header=None when the sheet has no header row
    index_col=None,  # default: no column is used as the index
    usecols=None,  # default: all columns. Accepts "A:C", "A,C", [0, 2], ["AAA", "CCC"] or lambda i: i == "AAA"
    skiprows=None,  # rows to skip: skiprows=1 skips that many rows, skiprows=[0, 2] skips those row numbers
    names=["a", "b", "c"],  # column names to use; meant for header=None, otherwise the first row is replaced
    dtype={"a": "str"},  # per-column dtypes; note they are given as strings
    parse_dates=False,  # True attempts to parse dates; also accepts [0, 1], [[0, 1, 2]] or ["a", "b"]
    date_parser=None,  # custom date-parsing function; None uses the built-in parser (deprecated in pandas 2.0)
    na_values=None,  # extra values to treat as missing, e.g. "a", "NA" or ["NA", "#NA", "0", 0]
    converters=None,  # per-column converter functions, e.g. {"a": lambda i: i + 1}
    mangle_dupe_cols=True,  # duplicate column names are allowed and renamed; False is not supported (removed in pandas 2.0)
)

In [11]:
# excel_df = pd.read_excel("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/email_update.xlsx")
# excel_df.head()
# Reading Excel files requires xlrd and xlwt to be installed
# NOTE(review): recent pandas versions read .xlsx through openpyxl - confirm against the installed version

In [12]:
#df = pd.read_excel("test.xlsx", sheet_name=["Sheet1", "Sheet3"])  several sheets can be fetched in one call

### to_excel()

## SQL

In [None]:
# Open a PostgreSQL connection through psycopg2.
# NOTE(review): the RDS_DATA_DB_* names are not defined in the visible cells - presumably configuration
# loaded elsewhere; confirm, and keep the credentials out of the notebook itself.
import psycopg2
conn = psycopg2.connect(host=RDS_DATA_DB_HOST, user=RDS_DATA_DB_USER,
                            password=RDS_DATA_DB_PSW, database=RDS_DATA_DB_NAME,
                                port=RDS_DATA_DB_PORT)

### read_sql()

In [None]:
# Load the result of the query held in `sql` into a DataFrame.
# The connection is closed in a finally clause so it is released even when the query raises.
try:
    df = pd.read_sql(sql, conn)
finally:
    conn.close()
#df.to_xx


### to_sql()

In [None]:
# Build an engine from the connection settings and append the frame to a table.
# NOTE(review): create_engine and NullPool are not imported in the visible cells - presumably they come
# from sqlalchemy / sqlalchemy.pool; confirm.
engine = create_engine(
        f'postgresql+psycopg2://{RDS_DATA_DB_USER}:{RDS_DATA_DB_PSW}@{RDS_DATA_DB_HOST}:{RDS_DATA_DB_PORT}/{RDS_DATA_DB_NAME}',
        poolclass=NullPool,
        connect_args={'connect_timeout': 30}
    )

df.to_sql('nums', con=engine,index=False,if_exists="append") # 'nums' is the table name; con is the engine created above
### In real work prefer if_exists="append"; avoid "replace" and "fail"

## json

### read_json

In [6]:
# "records" layout: a JSON array holding one object per row. The string below is a sample of that shape.
'''
[
  {
    "project_id": "531",
    "ss_folder": "Ake_P15924_ARA_EGY_QN377998_20210725-101610",
    "person_id": "QN377998",
    "audio_url": "appen://531_Ake__ARA_EYG/.../xx.wav",
    "audio_type": "SZSJ",
    "lang": "Arabic(Egypt)",
    "valid_duration": 3.986,
    "content_text": "لقد إشتريت ثلاث طائرات وست سفن حربية.",
    "audio_start": "1.548",
    "audio_end": "5.534",
    "pid": 741693,
    "age": 19,
    "gender": "M",
    "submit_date": "20220124",
    "pack_audio_name": "ArabicEgypt_abqfer_M_19_0949.wav",
    "is_pack": 1,
    "pack_path": "/appen-delivery/wooey_rs/ake/1e09bae1ecdd4360b94c762846075c02"
  },
  ...
]
'''
# Read a JSON document of that shape straight from a URL.
df1 = pd.read_json("http://projecteng.oss-cn-shanghai.aliyuncs.com/0_ProjectData/data_bak/0f207f4f16e34c4caf7b576ab8e535b7.json")
df1.head(1)

Unnamed: 0,project_id,ss_folder,person_id,audio_url,audio_type,lang,valid_duration,content_text,audio_start,audio_end,pid,age,gender,submit_date,pack_audio_name,is_pack,pack_path
0,531,Ake_P15924_ARA_EGY_QN377998_20210725-101610,QN377998,appen://531_Ake__ARA_EYG/Staging/AMR_DATA/Ake_...,SZSJ,Arabic(Egypt),3.986,لقد إشتريت ثلاث طائرات وست سفن حربية.,1.548,5.534,741693,19,M,20220124,/appen-delivery/wooey_rs/ake/1e09bae1ecdd4360b...,1,/appen-delivery/wooey_rs/ake/1e09bae1ecdd4360b...


In [None]:
# "split" layout: separate "columns", "index" and "data" entries
'''
{
    "columns":["col 1","col 2"],
    "index":["row 1","row 2"],
    "data":[["a","b"],["c","d"]]
}
'''

In [None]:
# "index" layout: one entry per row label, each mapping column name to value
'''
{ 
    "row 1":{"col 1":"a","col 2":"b"},
    "row 2":{"col 1":"c","col 2":"d"}
}
'''

### to_json

In [None]:
import json

In [33]:
# Small labelled frame reused by the to_json examples that follow.
df = pd.DataFrame(
    data=[["a", "b"], ["c", "d"]],
    columns=["col 1", "col 2"],
    index=["row 1", "row 2"],
)
df.to_json(orient="split")

'{"columns":["col 1","col 2"],"index":["row 1","row 2"],"data":[["a","b"],["c","d"]]}'

In [32]:
df.to_json(orient="records")  # a list with one object per row; the row labels are not included

'[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

In [31]:
df.to_json(orient="index")  # keyed by row label, each value maps column name to cell

'{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'

In [36]:
df.to_json(orient="columns")  # keyed by column name, each value maps row label to cell

'{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'

In [37]:
df.to_json(orient="values")  # only the nested list of cell values, no labels

'[["a","b"],["c","d"]]'

In [38]:
df.to_json(orient="table")  # a "schema" section describing the fields plus the "data" records

'{"schema":{"fields":[{"name":"index","type":"string"},{"name":"col 1","type":"string"},{"name":"col 2","type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},"data":[{"index":"row 1","col 1":"a","col 2":"b"},{"index":"row 2","col 1":"c","col 2":"d"}]}'

### json_normalize

In [12]:
# Records with inconsistent nesting: "name" is a dict in two of them and a plain string in the third.
data = [
    dict(id=1, name=dict(first="Coleen", last="Volk")),
    dict(name=dict(given="Mark", family="Regner")),
    dict(id=2, name="Faye Raker"),
]
## Every field becomes a column automatically; nested keys are joined with "."
pd.json_normalize(data)

Unnamed: 0,id,name.first,name.last,name.given,name.family,name
0,1.0,Coleen,Volk,,,
1,,,,Mark,Regner,
2,2.0,,,,,Faye Raker


In [16]:
# Every record carries a nested "fitness" dict; the second record has no "id".
data = [
    dict(id=1, name="Cole Volk", fitness=dict(height=130, weight=60)),
    dict(name="Mark Reg", fitness=dict(height=130, weight=60)),
    dict(id=2, name="Faye Raker", fitness=dict(height=130, weight=60)),
]
pd.json_normalize(data, max_level=0)  # max_level=0 parses only the first level, so "fitness" stays a dict

Unnamed: 0,id,name,fitness
0,1.0,Cole Volk,"{'height': 130, 'weight': 60}"
1,,Mark Reg,"{'height': 130, 'weight': 60}"
2,2.0,Faye Raker,"{'height': 130, 'weight': 60}"


In [20]:
# Two states, each with top-level fields, a nested "info" dict and a list of county records.
data = [
    {
        "state": "Florida",
        "shortname": "FL",
        "info": {"governor": "Rick Scott"},
        "counties": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
            {"name": "Palm Beach", "population": 60000},
        ],
    },
    {
        "state": "Ohio",
        "shortname": "OH",
        "info": {"governor": "John Kasich"},
        "counties": [
            {"name": "Summit", "population": 1234},
            {"name": "Cuyahoga", "population": 1337},
        ],
    },
]
# pd.json_normalize(data)

## Use the "counties" list as the rows; the listed meta fields are repeated on every county row.
pd.json_normalize(
    data,
    record_path="counties",
    meta=["state", "shortname", ["info", "governor"]],
)

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


In [26]:
# The records under "A" are plain lists, so the resulting columns are positions prefixed with "Prefix.".
data = [{"A": [[1, 2], [3, 4]]}]
pd.json_normalize(data, record_path="A", record_prefix="Prefix.")

Unnamed: 0,Prefix.0,Prefix.1
0,1,2
1,3,4


## Table

### read_table()

## to_list

In [296]:
# Series.tolist() converts the Series into a plain Python list.
sr = pd.Series(data=[1, 2, 3])
sr_lst = sr.tolist()
print(type(sr), type(sr_lst))

<class 'pandas.core.series.Series'> <class 'list'>


In [299]:
df2 = pd.DataFrame(data=[[1, 2], [3, 4]], columns=["a", "b"])
# df2.tolist()
# a DataFrame has no to_list method

## to_dict

In [302]:
df2  # the frame converted by the to_dict examples below

Unnamed: 0,a,b
0,1,2
1,3,4


In [312]:
df2.to_dict() # column names are the outer keys; each value is that column converted to an {index: value} dict

{'a': {0: 1, 1: 3}, 'b': {0: 2, 1: 4}}

In [311]:
type(df2.to_dict()["a"])  # each column's entry is a plain dict

dict

In [318]:
df2.to_dict(orient="list")  # column name -> list of that column's values

{'a': [1, 3], 'b': [2, 4]}

In [313]:
df2.to_dict(orient="dict")  # the default behaviour

{'a': {0: 1, 1: 3}, 'b': {0: 2, 1: 4}}

In [317]:
df2.to_dict(orient="series")  # column name -> that column as a Series

{'a': 0    1
 1    3
 Name: a, dtype: int64,
 'b': 0    2
 1    4
 Name: b, dtype: int64}

In [316]:
df2.to_dict(orient="records")   # commonly used: one dict per row

[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]

In [320]:
df2.to_dict(orient="index")   # the outer keys are the index labels

{0: {'a': 1, 'b': 2}, 1: {'a': 3, 'b': 4}}

# 常用

## isnull(isna)和notnull(notna)

In [248]:
pd.isnull == pd.isna  # the two names compare equal

True

In [249]:
pd.notnull == pd.notna  # the two names compare equal

True

In [250]:
pd.isnull(np.nan)
# same as pd.isnull(np.NaN) (the np.NaN alias was removed in NumPy 2.0)

True

In [101]:
pd.isnull(pd.NA)  # pd.NA counts as missing

True

In [102]:
pd.isnull(pd.NaT) # pd.NaT is the pandas marker for a missing datetime

True

In [103]:
pd.isnull(None)  # Python's None counts as missing too

True

### 从数据结构加载

In [251]:
# With the "string" dtype the trailing None is reported as missing by isnull().
sr = pd.Series(["a", "b", 1, None], dtype="string")
sr.isnull()

0    False
1    False
2    False
3     True
dtype: bool

In [253]:
sr[3] is pd.NA # True 

# Do not test for NA with ==: the comparison itself evaluates to <NA>, not to a bool
print("na", sr[3] == pd.NA,type(sr[3] == pd.NA)) #na <NA> <class 'pandas._libs.missing.NAType'>
pd.isnull(sr[3]) # True

na <NA> <class 'pandas._libs.missing.NAType'>


True

In [127]:
# Same data with dtype="str"; the scalar check must target sr2 (the Series built here), not sr.
sr2 = pd.Series(["a","b",1,None],dtype="str")
sr2 # dtype object
pd.isnull(sr2[3]) # True

True

In [123]:
sr2.isnull()  # still works

0    False
1    False
2    False
3     True
dtype: bool

In [209]:
# Another "string"-dtype Series ending in None, used by the next cell.
sr3 = pd.Series(data=["a", "b", 1, None], dtype="string")

In [None]:
sr3.isnull() # None is recognised by isnull as well

0    False
1    False
2    False
3     True
dtype: bool

In [178]:
# No dtype given: the mixed values are stored with dtype object.
sr4 = pd.Series(data=["a", "b", 1, None])
sr4

0       a
1       b
2       1
3    None
dtype: object

In [185]:
# With mixed str/int values convert_dtypes leaves the dtype as object (see the output below).
sr4 = sr4.convert_dtypes()
sr4

0       a
1       b
2       1
3    None
dtype: object

In [48]:
# Starts out as dtype object ...
sr5 = pd.Series(data=["a", "b", "c", None])
# ... and is converted to string by convert_dtypes
sr5 = sr5.convert_dtypes()
#type(sr5[3])

### 时间类型

In [139]:
# A datetime Series: the None entry is displayed as NaT.
st1 = pd.Series(data=[pd.Timestamp("2018-09-08"), None], dtype="datetime64[ns]")
st1

0   2018-09-08
1          NaT
dtype: datetime64[ns]

In [140]:
st1.isnull() # isnull applies to pd.NaT values as well

0    False
1     True
dtype: bool

### 从csv加载数据

#### 读取的时候指定

In [49]:
# Request "str" for text_content while reading.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv",dtype={"text_content":"str"})
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [227]:
df.isnull()  # the empty text_content cells are still reported as missing

Unnamed: 0,text_content,flag,full_text
0,False,False,False
1,False,False,False
2,True,False,False
3,True,False,False


In [50]:
# Request the pandas "string" dtype for text_content while reading.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv",dtype={"text_content":"string"})
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [229]:
df.isnull()  # the empty text_content cells are reported as missing here as well

Unnamed: 0,text_content,flag,full_text
0,False,False,False
1,False,False,False
2,True,False,False
3,True,False,False


#### astype事后转换 string和str存在差异

In [230]:
# Convert after loading instead of at read time.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df["text_content"] = df["text_content"].astype("str")

In [233]:
df.isnull() # astype("str") force-converts the values to strings

Unnamed: 0,text_content,flag,full_text
0,False,False,False
1,False,False,False
2,True,False,False
3,True,False,False


In [4]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df["text_content"] = df["text_content"].astype("string") # converting after loading casts the column to the string dtype
df

#type(df["text_content"][3])

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [55]:
df.isnull()  # the string dtype keeps the null values
#type(df["text_content"][3])  # pandas._libs.missing.NAType

Unnamed: 0,text_content,flag,full_text
0,False,False,False
1,False,False,False
2,True,False,False
3,True,False,False


In [254]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df["text_content"] = df["text_content"].astype("string") # on older pandas versions this may raise; there, cast to "str" first and then to "string"

In [9]:
df["text_content"] = df["text_content"].astype("str").astype("string") # going through "str" first turns the missing entries into literal text (the value below is the string '<NA>')
#type(df["text_content"][3])
df["text_content"][3]

'<NA>'

## count和len

### count

In [268]:
# count() tallies the non-missing values; the expected results are shown in the comments below.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df
# df.count().to_dict()  # {'text_content': 2, 'flag': 4, 'full_text': 4}
# df["text_content"].count() # 2

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [270]:
# count() on the same data stored three ways: as built, cast to "string", cast to "str".
sr1 = pd.Series(["a", "b", "c", None])
print(f"sr1 {sr1.count()}")

sr2 = pd.Series(["a", "b", "c", None]).astype("string")
print(f"sr2 {sr2.count()}")

sr3 = pd.Series(["a", "b", "c", None]).astype("str")
print(f"sr3 {sr3.count()}")

sr1 3
sr2 3
sr3 4


### len

In [271]:
# len() reports the number of entries, missing or not, for the same three variants.
sr4 = pd.Series(["a", "b", "c", None])
print(f"sr4 {len(sr4)}")

sr5 = pd.Series(["a", "b", "c", None]).astype("string")
print(f"sr5 {len(sr5)}")

sr6 = pd.Series(["a", "b", "c", None]).astype("str")
print(f"sr6 {len(sr6)}")

sr4 4
sr5 4
sr6 4


## loc 和 iloc

In [276]:
sr = pd.Series(data=["a", "b", "c", None])
sr.loc[0]   # lookup by label
sr.iloc[0]  # lookup by position

'a'

In [5]:
# Sample frame used by the loc / iloc examples below.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [297]:
df.loc[0]  # the row whose index label is 0

text_content    this is a test line one
flag                                  0
full_text                           abc
Name: 0, dtype: object

In [298]:
df.iloc[0]  # the row at position 0

text_content    this is a test line one
flag                                  0
full_text                           abc
Name: 0, dtype: object

In [8]:
df.loc[:,"text_content"] # select one column; with several arguments don't forget the comma. Equivalent to df["text_content"]

0                              this is a test line one
1    this is a test line two, this is a test line t...
2                                                  NaN
3                                                  NaN
Name: text_content, dtype: object

In [302]:
#df.loc[:,0]  # with loc the first argument must be index labels and the second column labels; otherwise a KeyError is raised

In [307]:
df.iloc[:,0] # iloc accepts integer positions, but this is best avoided

0                              this is a test line one
1    this is a test line two, this is a test line t...
2                                                  NaN
3                                                  NaN
Name: text_content, dtype: object

In [9]:
df.loc[:,["text_content","flag"]] # a list selects several columns at once (label slices include both endpoints); equivalent to df[["text_content","flag"]]

Unnamed: 0,text_content,flag
0,this is a test line one,0
1,"this is a test line two, this is a test line t...",2
2,,3
3,,4


In [315]:
df.loc[[0,2]] # the second argument may be omitted

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
2,,3,dd


In [318]:
df.loc[0:2] # a range passed to loc includes both endpoints

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd


In [320]:
df.iloc[0:2] # iloc excludes the right-hand position

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg


In [37]:
# Note: slicing does not renumber the index, as the cells below show
df= pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_filter.csv")
df

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
2,c,text3,True
3,d,text4,True
4,e,text5,True
5,f,text6,True
6,g,text7,False
7,h,text8,True
8,i,text9,True


In [45]:
df.loc[2:6] = 1 # modifies the original values directly
df

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
2,1,1,1
3,1,1,1
4,1,1,1
5,1,1,1
6,1,1,1
7,h,text8,True
8,i,text9,True


In [46]:
df2 = df.loc[2:6]
df2 # note that the index is 2-6

Unnamed: 0,id,test_text,test_flag
2,1,1,1
3,1,1,1
4,1,1,1
5,1,1,1
6,1,1,1


In [58]:
# Iterate over the sliced frame: idx is the index label of each row, row holds that row's values.
for idx,row in df2.iterrows():
    print(idx,row)

2
3
4
5
6


## 增加数据

### 赋值增加一列数据

#### 直接赋值

In [14]:
# Four ways of adding a column by plain assignment.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df["add_col1"] = 1  # a scalar fills every row
df["add_col2"] = ["a","b","c","d"]  # a list supplies one value per row
df["add_col3"] = pd.Series(["1","2","3","4"])  # a Series
df["add_col4"]  = pd.NA  # an all-missing column
df

Unnamed: 0,text_content,flag,full_text,add_col1,add_col2,add_col3,add_col4
0,this is a test line one,0,abc,1,a,1,
1,"this is a test line two, this is a test line t...",2,efg,1,b,2,
2,,3,dd,1,c,3,
3,,4,cc,1,d,4,


#### 通过loc赋值

In [16]:
# Assigning through loc with a column label that does not exist yet creates the column.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.loc[:, "add_col5"] = pd.NA
df

Unnamed: 0,text_content,flag,full_text,add_col5
0,this is a test line one,0,abc,
1,"this is a test line two, this is a test line t...",2,efg,
2,,3,dd,
3,,4,cc,


#### 通过insert

In [20]:
# insert places the new column at the given position; both calls use position 1,
# so add_col7 ends up in front of add_col6 (see the output below).
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.insert(1,"add_col6",pd.NA)
df.insert(1,"add_col7",[1,3,4,6])
df

Unnamed: 0,text_content,add_col7,add_col6,flag,full_text
0,this is a test line one,1,,0,abc
1,"this is a test line two, this is a test line t...",3,,2,efg
2,,4,,3,dd
3,,6,,4,cc


### 添加一行数据

In [26]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.loc[4]= ["text","flag","full_text"]  # an index label that does not exist yet adds a new row
df.loc[3] = pd.NA  #! note: an existing index label has its data overwritten
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,,
4,text,flag,full_text


In [35]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df["text_content"][2] = "cover text"  # assigning directly like this triggers SettingWithCopyWarning
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,cover text,3,dd
3,,4,cc


In [34]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.loc[2,"text_content"] = "cover text"  # no warning this way; note that an index label is not necessarily the row's position
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,cover text,3,dd
3,,4,cc


### append增加一行或多行

#### append字典

In [66]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement.
# Like append, concat returns a new frame, so remember to assign the result.
task_df = pd.DataFrame()
task_df = pd.concat([task_df, pd.DataFrame([{"id": 1, "text": "test1"}])], ignore_index=True)
task_df

Unnamed: 0,id,text
0,1.0,test1


In [68]:
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
task_df = pd.DataFrame()
task_df = pd.concat([task_df, pd.DataFrame([{"id": 1, "text": "test1"}])], ignore_index=True)
# pandas extends the columns automatically; rows that lack the new column get a missing value
task_df = pd.concat(
    [task_df, pd.DataFrame([{"id": 2, "text": "test2", "add_col1": "x"}])],
    ignore_index=True,
)
task_df

Unnamed: 0,id,text,add_col1
0,1.0,test1,
1,2.0,test2,x


#### append列表

In [75]:
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
task_df = pd.DataFrame()
# a list of records can be added in one go
task_df = pd.concat([task_df, pd.DataFrame([{"id": 1, "text": "test1"}, {"id": 2, "text": "test2"}])])
# note: without ignore_index the index labels repeat (0, 1, 0, 1)
task_df = pd.concat([task_df, pd.DataFrame([{"id": 3, "text": "test1"}, {"id": 4, "text": "test2"}])])
task_df

Unnamed: 0,id,text
0,1,test1
1,2,test2
0,3,test1
1,4,test2


In [78]:
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
task_df = pd.DataFrame()
task_df = pd.concat(
    [task_df, pd.DataFrame([{"id": 1, "text": "test1"}, {"id": 2, "text": "test2"}])],
    ignore_index=True,
)
# pass ignore_index=True when the index labels should not repeat
task_df = pd.concat(
    [task_df, pd.DataFrame([{"id": 3, "text": "test1"}, {"id": 4, "text": "test2"}])],
    ignore_index=True,
)
task_df

Unnamed: 0,id,text
0,1,test1
1,2,test2
2,3,test1
3,4,test2


#### append sr 或者df

In [81]:
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
task_df = pd.DataFrame()
sr = pd.Series([1, 2, 3, 4])
# to_frame().T turns the Series into a one-row frame whose columns are the Series labels,
# which is how append used to add a Series as a row.
task_df = pd.concat([task_df, sr.to_frame().T], ignore_index=True)
task_df

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0


In [84]:
df1 = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df2 = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")

# pd.concat replaces DataFrame.append (removed in pandas 2.0): stack both frames in a single call
# and renumber the rows 0..n-1.
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc
4,this is a test line one,0,abc
5,"this is a test line two, this is a test line t...",2,efg
6,,3,dd
7,,4,cc


## 修改列名和数据

In [85]:
# Fresh copy of the sample frame for the renaming examples.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


### 直接赋值columns(不推荐)

In [87]:
df.columns = ["col1","col2","col3"] # takes effect immediately
df

Unnamed: 0,col1,col2,col3
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


### rename修改

In [93]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.rename(columns={"text_content":"col1","f":"col2","full_text":"col3"}) # a key that matches no column ("f") does not raise an error!

Unnamed: 0,col1,flag,col3
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [92]:
df # the original frame is unchanged

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [95]:
df.rename(columns={"text_content":"col1","f":"col2","full_text":"col3"},inplace=True) # a key that matches no column does not raise an error!
df ## to rename in place, pass inplace=True; otherwise the original df is not modified

Unnamed: 0,col1,flag,col3
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


### rename修改行索引

In [100]:
df.rename({0:"line1",1:"line2"},axis=0,inplace=True) # 1. the mapping is passed as the first argument, not as columns=; 2. axis=0
df

Unnamed: 0,col1,flag,col3
line1,this is a test line one,0,abc
line2,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [103]:
df["col1"]["line1"]  # chained lookup: the column first, then the row label

'this is a test line one'

In [102]:
df.loc["line1","col1"]  # the same cell through loc[row label, column label]

'this is a test line one'

### 使用loc修改数据

In [104]:
# Overwrite a single cell, addressed by row label and column label.
df.loc["line1","col1"] = "update_text"
df

Unnamed: 0,col1,flag,col3
line1,update_text,0,abc
line2,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


## 删除数据

In [111]:
# Fresh copy of the sample frame for the drop examples.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [114]:
# Remove the "flag" column in place; axis=1 makes drop look the label up among the columns.
df.drop("flag", axis=1, inplace=True)

In [115]:
df  # the frame after the in-place drop: "flag" is gone

Unnamed: 0,text_content,full_text
0,this is a test line one,abc
1,"this is a test line two, this is a test line t...",efg
2,,dd
3,,cc


In [116]:
# 也可以删除多列
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.drop(["flag","full_text"],axis=1,inplace=True)
df

Unnamed: 0,text_content
0,this is a test line one
1,"this is a test line two, this is a test line t..."
2,
3,


In [118]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.drop(columns=["flag","full_text"],inplace=True)  # when columns= is passed, axis is not needed

In [127]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.drop([0,2],inplace=True) # delete rows by index label, in place
# df.drop(index=[0,2],inplace=True) # same result as above
# df.drop(labels=[0,2],inplace=True) # same result as above
df

Unnamed: 0,text_content,flag,full_text
1,"this is a test line two, this is a test line t...",2,efg
3,,4,cc


In [126]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.drop(index=[0,2],inplace=True) # delete rows by index label, in place
# df.drop(index=[0,2],inplace=True) # raises an error when a label is not found
df

Unnamed: 0,text_content,flag,full_text
1,"this is a test line two, this is a test line t...",2,efg
3,,4,cc


In [145]:
# After renaming two row labels the index mixes strings and integers; drop takes either kind of label.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.rename({0:"line1",1:"line2"},axis=0,inplace=True)
df.drop(index =["line1",2],inplace=True)
df

Unnamed: 0,text_content,flag,full_text
line2,"this is a test line two, this is a test line t...",2,efg
3,,4,cc


In [146]:
# Drop rows by condition: collect the index labels of the rows with flag > 2 and pass them to drop.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.drop(index=df[df["flag"]>2].index,inplace=True)
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg


## 缺失值

In [150]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.dropna(inplace=True) # inplace=True modifies df itself; by default a row is dropped as soon as it has any missing value
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg


In [152]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
df.dropna(inplace=True,how="all") # a row is dropped only when all of its values are missing
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,,3,dd
3,,4,cc


In [154]:
# Add an all-missing column so the fillna example below has more to fill.
df["col1"] = pd.NA
df

Unnamed: 0,text_content,flag,full_text,col1
0,this is a test line one,0,abc,
1,"this is a test line two, this is a test line t...",2,efg,
2,,3,dd,
3,,4,cc,


In [157]:
df.fillna(0,inplace=True) # inplace=True modifies df itself
df

Unnamed: 0,text_content,flag,full_text,col1
0,this is a test line one,0,abc,0
1,"this is a test line two, this is a test line t...",2,efg,0
2,0,3,dd,0
3,0,4,cc,0


In [160]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/string_test.csv")
# Fill a single column by assigning the result back. Calling fillna(..., inplace=True) on df[col]
# is chained assignment: it acts on an intermediate object and does not update df under Copy-on-Write.
df["text_content"] = df["text_content"].fillna("empty")
df

Unnamed: 0,text_content,flag,full_text
0,this is a test line one,0,abc
1,"this is a test line two, this is a test line t...",2,efg
2,empty,3,dd
3,empty,4,cc


## 重复值

### duplicated

In [27]:
# Sample frame containing duplicate rows.
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
df

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
2,a,text1,True
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True


In [7]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
df.duplicated() # reports, for each row, whether it is a complete duplicate of an earlier row

0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [10]:
df.duplicated(keep="last")  # the last occurrence is left unflagged; the earlier ones are marked True

0      True
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [11]:
df.duplicated(subset=["test_text"])  # only the test_text column is compared

0     False
1     False
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [15]:
df.duplicated(keep=False) # keep=False flags every occurrence of a duplicated row, so filtering on this mask keeps none of them
df.index  # only this last expression is displayed

Int64Index([0, 1, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

In [None]:
df.duplicated(keep="last")  # flags the earlier occurrences and leaves the last one unflagged

In [28]:
df  # the frame used by the drop_duplicates example below

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
2,a,text1,True
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True


In [29]:
# Find the rows that drop_duplicates removed: deduplicate on test_text, then select the rows
# of df whose index label is absent from the deduplicated frame.
df_dup = df.drop_duplicates(subset="test_text")
df_dup.index

df[~df.index.isin(df_dup.index)]

Unnamed: 0,id,test_text,test_flag
2,a,text1,True
3,c,text1,True


### unique

In [9]:
df["test_text"].unique()  # the distinct values of the column, returned as an array

array(['text1', 'text2', 'text4', 'text5', 'text6', 'text7', 'text8',
       'text9', 'text10'], dtype=object)

### drop_duplicates

In [14]:
df.drop_duplicates()
#df # the row with index 2 is removed

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True
10,j,text10,


In [180]:
df.drop_duplicates("test_flag")# Pass inplace=True to modify df itself instead of returning a new frame

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
10,j,text10,


In [182]:
df.drop_duplicates(["test_flag","test_text"])

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True
10,j,text10,


In [184]:
df.drop_duplicates(["test_flag","test_text"],keep="last") # keep chooses which duplicate survives, e.g. "first" or "last"

Unnamed: 0,id,test_text,test_flag
1,b,text2,False
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True
10,j,text10,


## 索引重置

### reindex

In [187]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
# Keep only the last row of every (test_flag, test_text) pair.
# Reassigning the result (rather than inplace=True) keeps the step chainable.
df = df.drop_duplicates(["test_flag","test_text"],keep="last")
df


Unnamed: 0,id,test_text,test_flag
1,b,text2,False
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True
10,j,text10,


In [208]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
df = df.reindex(["b1","b2",1])
df # Labels that are absent from the original index ("b1", "b2") come back as all-NaN rows

Unnamed: 0,id,test_text,test_flag
b1,,,
b2,,,
1,b,text2,False


In [211]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
df

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
2,a,text1,True
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True


In [217]:
# reindex picks and orders rows by label, a bit like rebuilding the frame from a slice.
df = df.reindex(list(range(1, 12)))
df

Unnamed: 0,id,test_text,test_flag
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,
7,,,
8,,,
9,,,
10,,,


### reset_index

In [196]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
# Keep only the last row of every (test_flag, test_text) pair.
# Reassigning the result (rather than inplace=True) keeps the step chainable.
df = df.drop_duplicates(["test_flag","test_text"],keep="last")
# reset_index returns a new frame by default; the previous index is kept as an "index" column.
df.reset_index()

Unnamed: 0,index,id,test_text,test_flag
0,1,b,text2,False
1,3,c,text1,True
2,4,d,text4,True
3,5,e,text5,True
4,6,f,text6,True
5,7,g,text7,False
6,8,h,text8,True
7,9,i,text9,True
8,10,j,text10,


In [198]:
df.reset_index(drop=True,inplace=False) # drop=True discards the old index instead of keeping it as a column (it is kept by default); note that inplace defaults to False

Unnamed: 0,id,test_text,test_flag
0,b,text2,False
1,c,text1,True
2,d,text4,True
3,e,text5,True
4,f,text6,True
5,g,text7,False
6,h,text8,True
7,i,text9,True
8,j,text10,


### set_index

In [224]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
df = df.set_index("test_text")
df

Unnamed: 0_level_0,id,test_flag
test_text,Unnamed: 1_level_1,Unnamed: 2_level_1
text1,a,True
text2,b,False
text1,a,True
text1,c,True
text4,d,True
text5,e,True
text6,f,True
text7,g,False
text8,h,True
text9,i,True


In [227]:
df.loc["text1"]

Unnamed: 0_level_0,id,test_flag
test_text,Unnamed: 1_level_1,Unnamed: 2_level_1
text1,a,True
text1,a,True
text1,c,True


In [238]:
# Build a multi-level row index from several columns
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
df = df.set_index(["test_text","id"])

In [241]:
df.loc["text1"].loc["a"] # Select "text1" on the first index level, then "a" on the remaining id level

Unnamed: 0_level_0,test_flag
id,Unnamed: 1_level_1
a,True
a,True


## 数据排序

In [242]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_dup.csv")
df

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
1,b,text2,False
2,a,text1,True
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True


In [247]:
# Sort rows by id. Reassigning the result (rather than inplace=True) keeps the call chainable.
df = df.sort_values(by="id")
df

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
2,a,text1,True
1,b,text2,False
3,c,text1,True
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True
9,i,text9,True


In [249]:
# Sort by test_text first and break ties with id.
# Reassigning the result (rather than inplace=True) keeps the call chainable.
df = df.sort_values(by=["test_text","id"])
df

Unnamed: 0,id,test_text,test_flag
0,a,text1,True
2,a,text1,True
3,c,text1,True
10,j,text10,
1,b,text2,False
4,d,text4,True
5,e,text5,True
6,f,text6,True
7,g,text7,False
8,h,text8,True


In [253]:
# ascending defaults to True; False sorts every key in descending order.
# Reassigning the result (rather than inplace=True) keeps the call chainable.
df = df.sort_values(by=["test_text","id"],ascending=False)
df

Unnamed: 0,id,test_text,test_flag
9,i,text9,True
8,h,text8,True
7,g,text7,False
6,f,text6,True
5,e,text5,True
4,d,text4,True
1,b,text2,False
10,j,text10,
3,c,text1,True
0,a,text1,True


# 数据计算

## 计算函数

In [262]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_cal.csv")
df

Unnamed: 0,id,test_text,test_col
0,a,text1,1.2
1,b,text2,1.333
2,a,text1,1.4
3,c,text1,1.888888
4,d,text4,1.6666
5,e,text5,1.7777
6,f,text6,1.222
7,g,text7,1.0
8,h,text8,1.0
9,i,text9,1.0


In [263]:
df["test_col"].sum()

13.488188

In [265]:
df["test_col"].max()

1.888888

In [270]:
df["test_col"] = df["test_col"].round(decimals=2)
df

Unnamed: 0,id,test_text,test_col
0,a,text1,1.2
1,b,text2,1.33
2,a,text1,1.4
3,c,text1,1.89
4,d,text4,1.67
5,e,text5,1.78
6,f,text6,1.22
7,g,text7,1.0
8,h,text8,1.0
9,i,text9,1.0


## groupby

In [283]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_cal.csv")
df.groupby(by=["test_text"])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f4defd13a90>

In [284]:
# numeric_only=True sums just the numeric columns. Without it, pandas >= 2.0
# also "sums" the string column `id` by concatenating its values.
df.groupby(by=["test_text"]).sum(numeric_only=True)

Unnamed: 0_level_0,test_col,col2
test_text,Unnamed: 1_level_1,Unnamed: 2_level_1
text1,4.488888,13
text10,0.0,1
text2,1.333,1
text4,1.6666,1
text5,1.7777,1
text6,1.222,1
text7,1.0,1
text8,1.0,1
text9,1.0,1


In [285]:
df.groupby(by=["test_text","id"]).sum() # Grouping by two keys gives a two-level (test_text, id) row index

Unnamed: 0_level_0,Unnamed: 1_level_0,test_col,col2
test_text,id,Unnamed: 2_level_1,Unnamed: 3_level_1
text1,a,2.6,12
text1,c,1.888888,1
text10,j,0.0,1
text2,b,1.333,1
text4,d,1.6666,1
text5,e,1.7777,1
text6,f,1.222,1
text7,g,1.0,1
text8,h,1.0,1
text9,i,1.0,1


In [287]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
# Grouping by the scalar "test_text" keeps the key a plain label; a one-element
# list makes pandas >= 2.0 yield 1-tuples such as ('text1',) instead.
for gp,newdf in df.groupby(by="test_text"):
    print(gp,type(newdf))

text1 <class 'pandas.core.frame.DataFrame'>
text10 <class 'pandas.core.frame.DataFrame'>
text2 <class 'pandas.core.frame.DataFrame'>
text4 <class 'pandas.core.frame.DataFrame'>
text5 <class 'pandas.core.frame.DataFrame'>
text6 <class 'pandas.core.frame.DataFrame'>
text7 <class 'pandas.core.frame.DataFrame'>
text8 <class 'pandas.core.frame.DataFrame'>
text9 <class 'pandas.core.frame.DataFrame'>


In [295]:
# agg: apply several aggregations to each group at once.
# The numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops columns an aggregation cannot handle, so "mean" over the string column
# `id` would raise.
df.groupby(by=["test_text"])[["test_col","col2"]].agg(["mean","sum"])

Unnamed: 0_level_0,test_col,test_col,col2,col2
Unnamed: 0_level_1,mean,sum,mean,sum
test_text,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
text1,1.496296,4.488888,4.333333,13
text10,,0.0,1.0,1
text2,1.333,1.333,1.0,1
text4,1.6666,1.6666,1.0,1
text5,1.7777,1.7777,1.0,1
text6,1.222,1.222,1.0,1
text7,1.0,1.0,1.0,1
text8,1.0,1.0,1.0,1
text9,1.0,1.0,1.0,1


## iterrows

In [322]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_cal.csv")
for idx,row in df.iterrows():
    print(idx,type(row))

0 <class 'pandas.core.series.Series'>
1 <class 'pandas.core.series.Series'>
2 <class 'pandas.core.series.Series'>
3 <class 'pandas.core.series.Series'>
4 <class 'pandas.core.series.Series'>
5 <class 'pandas.core.series.Series'>
6 <class 'pandas.core.series.Series'>
7 <class 'pandas.core.series.Series'>
8 <class 'pandas.core.series.Series'>
9 <class 'pandas.core.series.Series'>
10 <class 'pandas.core.series.Series'>


## apply

In [323]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_cal.csv")
df

Unnamed: 0,id,test_text,test_col,col2
0,a,text1,1.2,11
1,b,text2,1.333,1
2,a,text1,1.4,1
3,c,text1,1.888888,1
4,d,text4,1.6666,1
5,e,text5,1.7777,1
6,f,text6,1.222,1
7,g,text7,1.0,1
8,h,text8,1.0,1
9,i,text9,1.0,1


In [325]:
# Series.apply calls the lambda once per element; here it doubles every test_col value.
# NOTE(review): the cell overwrites test_col, so each re-run doubles the values again.
df["test_col"] = df["test_col"].apply(lambda x : x*2)
df

Unnamed: 0,id,test_text,test_col,col2
0,a,text1,4.8,11
1,b,text2,5.332,1
2,a,text1,5.6,1
3,c,text1,7.555552,1
4,d,text4,6.6664,1
5,e,text5,7.1108,1
6,f,text6,4.888,1
7,g,text7,4.0,1
8,h,text8,4.0,1
9,i,text9,4.0,1


In [353]:
df =  pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_cal.csv")

def cal1(row):
    """Return the product of a row's ``test_col`` and ``col2`` values."""
    test_col, col2 = row["test_col"], row["col2"]
    return test_col * col2

# axis=1 hands each row to the function as a Series.
df["new_col"] = df.apply(cal1,axis=1)
# col2 is overwritten only after new_col has been computed from its original values.
df["col2"] = df.apply(lambda row: row["test_col"] + row["col2"]  ,axis=1)
df

Unnamed: 0,id,test_text,test_col,col2,new_col
0,a,text1,1.2,12.2,13.2
1,b,text2,1.333,2.333,1.333
2,a,text1,1.4,2.4,1.4
3,c,text1,1.888888,2.888888,1.888888
4,d,text4,1.6666,2.6666,1.6666
5,e,text5,1.7777,2.7777,1.7777
6,f,text6,1.222,2.222,1.222
7,g,text7,1.0,2.0,1.0
8,h,text8,1.0,2.0,1.0
9,i,text9,1.0,2.0,1.0


In [370]:
df = pd.read_csv("http://appen-pe.oss-cn-shanghai.aliyuncs.com/example_data/pandas_kt/test_cal.csv")
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger SettingWithCopyWarning.
df = df[df["test_col"]>1.2].copy()
df["col2"] = df.apply(lambda row: row["test_col"] + row["col2"]  ,axis=1)
df

Unnamed: 0,id,test_text,test_col,col2
1,b,text2,1.333,2.333
2,a,text1,1.4,2.4
3,c,text1,1.888888,2.888888
4,d,text4,1.6666,2.6666
5,e,text5,1.7777,2.7777
6,f,text6,1.222,2.222


## merge

In [377]:
# First (left) frame of the merge examples: uid -> name.
df1 = pd.DataFrame({
    "uid": ["u1", "u3"],
    "name": ["xiaoming", "xiaohong"],
})
df1

Unnamed: 0,uid,name
0,u1,xiaoming
1,u3,xiaohong


In [378]:
# Second (right) frame of the merge examples: uid -> score.
df2 = pd.DataFrame({
    "uid": ["u1", "u2"],
    "score": [1, 2],
})
df2

Unnamed: 0,uid,score
0,u1,1
1,u2,2


In [388]:
df = pd.merge(df1,df2)
df

Unnamed: 0,uid,name,score
0,u1,xiaoming,1


In [382]:
df = pd.merge(df1,df2,on="uid",how="left")
df

Unnamed: 0,uid,name,score
0,u1,xiaoming,1.0
1,u3,xiaohong,


In [383]:
df = pd.merge(df1,df2,on="uid",how="right")
df

Unnamed: 0,uid,name,score
0,u1,xiaoming,1
1,u2,,2


In [384]:
df = pd.merge(df1,df2,on="uid",how="inner") # how defaults to "inner"
df

Unnamed: 0,uid,name,score
0,u1,xiaoming,1


In [385]:
df = pd.merge(df1,df2,on="uid",how="outer")
df

Unnamed: 0,uid,name,score
0,u1,xiaoming,1.0
1,u3,xiaohong,
2,u2,,2.0


In [411]:
df1.merge(df2,how="right") # df2 (the argument) acts as the right table; no on= is specified here

Unnamed: 0,uid,name,score
0,u1,xiaoming,1
1,u2,,2


In [None]:
## When the key columns are named differently in the two frames, name each one with left_on / right_on
df = pd.merge(df1,df2,left_on="uid",right_on="uid",how="outer")

In [389]:
# Users with a gender column, used to show merges on overlapping column names.
df3 = pd.DataFrame({
    "uid": ["u1", "u3"],
    "name": ["xiaoming", "xiaohong"],
    "gender": ["M", "M"],
})
df3

Unnamed: 0,uid,name,gender
0,u1,xiaoming,M
1,u3,xiaohong,M


In [390]:
# Scores with a gender column that also exists in df3.
df4 = pd.DataFrame({
    "uid": ["u1", "u2"],
    "score": [1, 2],
    "gender": ["F", "M"],
})
df4

Unnamed: 0,uid,score,gender
0,u1,1,F
1,u2,2,M


In [395]:
pd.merge(df3,df4) # Without on=, merge joins on every column the two frames share (here uid and gender)

Unnamed: 0,uid,name,gender,score


In [396]:
pd.merge(df3,df4,left_on="uid",right_on="uid")

Unnamed: 0,uid,name,gender_x,score,gender_y
0,u1,xiaoming,M,1,F


In [404]:
## To join on the row index instead of a column
pd.merge(df3,df4,left_index=True,right_index=True)  # pass xx_index=True for a side that should be joined by its index

Unnamed: 0,uid_x,name,gender_x,uid_y,score,gender_y
0,u1,xiaoming,M,u1,1,F
1,u3,xiaohong,M,u2,2,M


In [407]:
# Scores carrying an explicit idx column; idx value 1 appears twice.
df5 = pd.DataFrame({
    "uid": ["u1", "u2", "u8"],
    "score": [1, 2, 99],
    "gender": ["F", "M", "M"],
    "idx": [0, 1, 1],
})
df5

Unnamed: 0,uid,score,gender,idx
0,u1,1,F,0
1,u2,2,M,1
2,u8,99,M,1


In [410]:
## Join the left frame's index against a column of the right frame
pd.merge(df3,df5,left_index=True,right_on="idx")  # each side needs its own key: either xx_index=True or xx_on=...

### Beware of one-to-many matches: the result can contain more rows than the left frame

Unnamed: 0,uid_x,name,gender_x,uid_y,score,gender_y,idx
0,u1,xiaoming,M,u1,1,F,0
1,u3,xiaohong,M,u2,2,M,1
2,u3,xiaohong,M,u8,99,M,1


## concat

In [412]:
# First frame of the concat examples: uid -> name.
dfc1 = pd.DataFrame({
    "uid": ["u1", "u3"],
    "name": ["xiaoming", "xiaohong"],
})
dfc1

Unnamed: 0,uid,name
0,u1,xiaoming
1,u3,xiaohong


In [417]:
# Second frame of the concat examples: uid -> score, one row longer than dfc1.
dfc2 = pd.DataFrame({
    "uid": ["u1", "u2", "u8"],
    "score": [1, 2, 99],
})
dfc2

Unnamed: 0,uid,score
0,u1,1
1,u2,2
2,u8,99


In [428]:
pd.concat([dfc1,dfc2]) # Stacks the frames vertically (row-wise) by default

Unnamed: 0,uid,name,score
0,u1,xiaoming,
1,u3,xiaohong,
0,u1,,1.0
1,u2,,2.0
2,u8,,99.0


In [430]:
pd.concat([dfc1,dfc2],axis=1) # axis=1 places the frames side by side, aligned on the index

Unnamed: 0,uid,name,uid.1,score
0,u1,xiaoming,u1,1
1,u3,xiaohong,u2,2
2,,,u8,99


In [429]:
pd.concat([dfc1,dfc2],axis=1,join="inner") # join accepts only "inner" or "outer" (default "outer"); it is typically specified only when axis=1

Unnamed: 0,uid,name,uid.1,score
0,u1,xiaoming,u1,1
1,u3,xiaohong,u2,2


In [None]:
pd.concat([dfc1,dfc2]) # Stacks the frames vertically (row-wise) by default

## df.where

In [None]:
# df = df.where(df.notnull(), None)  replaces missing values with None
# Note: to hold None, the columns must first be cast to object dtype