In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# pandas series

In [2]:
s1 = pd.Series([1, 2, 3, 4])
s1

0    1
1    2
2    3
3    4
dtype: int64

In [3]:
s1.values

array([1, 2, 3, 4], dtype=int64)

In [4]:
s1.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
s2 = pd.Series(np.arange(10))
s2

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [6]:
s3 = pd.Series({'1':1, '2':2, '3':3})
s3

1    1
2    2
3    3
dtype: int64

In [7]:
s3.index

Index(['1', '2', '3'], dtype='object')

In [8]:
s3.values

array([1, 2, 3], dtype=int64)

In [9]:
s4 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s4

a    1
b    2
c    3
d    4
dtype: int64

In [10]:
s4.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [11]:
s4['a']

1

In [12]:
s4[s4>2]

c    3
d    4
dtype: int64

In [13]:
s5 = s4.to_dict()
s5

{'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [15]:
index_1 = (['a', 'b', 'c', 'd', 'e'])
s6 = pd.Series(s5, index=index_1)
s6

a    1.0
b    2.0
c    3.0
d    4.0
e    NaN
dtype: float64

In [16]:
pd.isnull(s6)

a    False
b    False
c    False
d    False
e     True
dtype: bool

In [17]:
pd.notnull(s6)

a     True
b     True
c     True
d     True
e    False
dtype: bool

In [18]:
s6.name = 'demo'
s6

a    1.0
b    2.0
c    3.0
d    4.0
e    NaN
Name: demo, dtype: float64

In [19]:
s6.index.name = 'index_demo'
s6.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object', name='index_demo')

# pandas dataframe

In [21]:
import webbrowser
link = 'https:/www.tiobe.com/tiobe-index/'
webbrowser.open(link)

True

In [26]:
df = pd.read_clipboard()
df

Unnamed: 0,Mar 2021,Mar 2020,Change,Programming Language,Ratings,Change.1
0,1,2,change,C,15.33%,-1.00%
1,2,1,change,Java,10.45%,-7.33%
2,3,3,,Python,10.31%,+0.20%
3,4,4,,C++,6.52%,-0.27%
4,5,5,,C#,4.97%,-0.35%
5,6,6,,Visual Basic,4.85%,-0.40%
6,7,7,,JavaScript,2.11%,+0.06%
7,8,8,,PHP,2.07%,+0.05%
8,9,12,change,Assembly language,1.97%,+0.72%
9,10,9,change,SQL,1.87%,+0.03%


In [27]:
type(df)

pandas.core.frame.DataFrame

In [28]:
df.columns

Index(['Mar 2021', 'Mar 2020', 'Change', 'Programming Language', 'Ratings',
       'Change.1'],
      dtype='object')

In [29]:
df.Ratings

0    15.33%
1    10.45%
2    10.31%
3     6.52%
4     4.97%
5     4.85%
6     2.11%
7     2.07%
8     1.97%
9     1.87%
Name: Ratings, dtype: object

In [31]:
df["Mar 2020"]

0     2
1     1
2     3
3     4
4     5
5     6
6     7
7     8
8    12
9     9
Name: Mar 2020, dtype: int64

In [30]:
df_new = DataFrame(df, columns=["Programming Language", "Mar 2021"])
df_new

Unnamed: 0,Programming Language,Mar 2021
0,C,1
1,Java,2
2,Python,3
3,C++,4
4,C#,5
5,Visual Basic,6
6,JavaScript,7
7,PHP,8
8,Assembly language,9
9,SQL,10


In [33]:
df_new_2 = DataFrame(df, columns=["Programming Language", "Mar 2021", "Mar 2020"])
df_new_2

Unnamed: 0,Programming Language,Mar 2021,Mar 2020
0,C,1,2
1,Java,2,1
2,Python,3,3
3,C++,4,4
4,C#,5,5
5,Visual Basic,6,6
6,JavaScript,7,7
7,PHP,8,8
8,Assembly language,9,12
9,SQL,10,9


In [35]:
df_new_2["Mar 2020"] = range(1,11)
df_new_2

Unnamed: 0,Programming Language,Mar 2021,Mar 2020
0,C,1,1
1,Java,2,2
2,Python,3,3
3,C++,4,4
4,C#,5,5
5,Visual Basic,6,6
6,JavaScript,7,7
7,PHP,8,8
8,Assembly language,9,9
9,SQL,10,10


In [36]:
df_new_2["Mar 2020"] = np.arange(1,11)
df_new_2

Unnamed: 0,Programming Language,Mar 2021,Mar 2020
0,C,1,1
1,Java,2,2
2,Python,3,3
3,C++,4,4
4,C#,5,5
5,Visual Basic,6,6
6,JavaScript,7,7
7,PHP,8,8
8,Assembly language,9,9
9,SQL,10,10


In [37]:
df_new_2["Mar 2020"] = pd.Series(np.arange(1,11))
df_new_2

Unnamed: 0,Programming Language,Mar 2021,Mar 2020
0,C,1,1
1,Java,2,2
2,Python,3,3
3,C++,4,4
4,C#,5,5
5,Visual Basic,6,6
6,JavaScript,7,7
7,PHP,8,8
8,Assembly language,9,9
9,SQL,10,10


In [38]:
df_new_2["Mar 2020"] = pd.Series([100, 200], index=[1, 2])
df_new_2

Unnamed: 0,Programming Language,Mar 2021,Mar 2020
0,C,1,
1,Java,2,100.0
2,Python,3,200.0
3,C++,4,
4,C#,5,
5,Visual Basic,6,
6,JavaScript,7,
7,PHP,8,
8,Assembly language,9,
9,SQL,10,


# 深入理解series和dataframe

In [41]:
data = {
    'Country': ['China', 'India', 'Brazil'],
    'Capital': ['Beijing', 'New Delhi', 'Brasilia'],
    'Population': [1432732201, 1303171635, 207847528]}

In [2]:
data = {
    'Country':['Bel', 'Ind', 'Bra'],
    'Capital':['Bru', 'New', 'Bra'],
    'Popualtion':[111, 222, 333]}

In [3]:
s1 = pd.Series(data['Country'])
s1

0    Bel
1    Ind
2    Bra
dtype: object

In [4]:
s1.values

array(['Bel', 'Ind', 'Bra'], dtype=object)

In [5]:
s1.index

RangeIndex(start=0, stop=3, step=1)

In [6]:
df1 = pd.DataFrame(data)
df1

Unnamed: 0,Country,Capital,Popualtion
0,Bel,Bru,111
1,Ind,New,222
2,Bra,Bra,333


In [8]:
cou = df1['Country']
type(cou)

pandas.core.series.Series

In [12]:
for row in df1.iteritems():
    print(row[0]),print('****'), print(row[1])

Country
****
0    Bel
1    Ind
2    Bra
Name: Country, dtype: object
Capital
****
0    Bru
1    New
2    Bra
Name: Capital, dtype: object
Popualtion
****
0    111
1    222
2    333
Name: Popualtion, dtype: int64


In [13]:
data

{'Country': ['Bel', 'Ind', 'Bra'],
 'Capital': ['Bru', 'New', 'Bra'],
 'Popualtion': [111, 222, 333]}

In [18]:
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Popualtion'])

df_new = pd.DataFrame([s1, s2, s3], index=['Country', 'Capital', 'Popualtion']).T
df_new

Unnamed: 0,Country,Capital,Popualtion
0,Bel,Bru,111
1,Ind,New,222
2,Bra,Bra,333


In [17]:
df1

Unnamed: 0,Country,Capital,Popualtion
0,Bel,Bru,111
1,Ind,New,222
2,Bra,Bra,333


In [21]:
link = "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html"
webbrowser.open(link)

True

In [23]:
df1 = pd.read_clipboard()
df1

Unnamed: 0,开始标签,元素内容,结束标签
0,<p>,This is a paragraph,</p>
1,"<a href=""default.htm"" >",This is a link,</a>
2,<br />,,


In [24]:
df1.to_clipboard()

In [27]:
df1.to_csv('df1.csv', index=False)

In [31]:
!dir

 驱动器 D 中的卷是 Data
 卷的序列号是 18EC-1B72

 D:\StudyForConda\jupyter\demo_project\demo\just kidding 的目录

2021/03/08  20:19    <DIR>          .
2021/03/08  20:19    <DIR>          ..
2021/03/08  20:16    <DIR>          .ipynb_checkpoints
2020/10/11  17:09            32,882 01 bool索引和缺失数据的处理.ipynb
2020/10/13  23:08            58,543 02 数据合并之join.ipynb
2020/10/14  22:10             3,283 03 时间序列.ipynb
2021/03/08  20:18               129 df1.csv
2020/10/24  14:29            19,265 ndarray 多维数组(N Dimension Array).ipynb
2020/10/16  23:27            11,744 Untitled.ipynb
2020/10/16  23:17             3,048 Untitled1.ipynb
2020/10/16  23:35            10,701 Untitled2.ipynb
2020/10/11  13:47            24,482 Untitled3.ipynb
2021/03/08  20:19            12,601 Untitled4.ipynb
2020/10/17  11:05             2,211 归并排序.ipynb
2020/10/24  20:11            32,676 数据分析工具Pandas.ipynb
              12 个文件        211,565 字节
               3 个目录 228,645,076,992 可用字节


In [28]:
!more df1.csv

寮�濮嬫爣绛?鍏冪礌鍐呭��,缁撴潫鏍囩��
<p>,This is a paragraph,</p>
"<a href=""default.htm"" >",This is a link,</a>
<br />,,


In [33]:
df2 = pd.read_csv('df1.csv')
df2

Unnamed: 0,开始标签,元素内容,结束标签
0,<p>,This is a paragraph,</p>
1,"<a href=""default.htm"" >",This is a link,</a>
2,<br />,,


In [37]:
imdb = pd.read_csv("D:/资料/机器学习/数据资料/imdb/movie_metadata.csv")
imdb.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [36]:
imdb.shape

(5043, 28)

In [38]:
sub_df = imdb[['director_name', 'movie_title', 'imdb_score']]
sub_df.head()

Unnamed: 0,director_name,movie_title,imdb_score
0,James Cameron,Avatar,7.9
1,Gore Verbinski,Pirates of the Caribbean: At World's End,7.1
2,Sam Mendes,Spectre,6.8
3,Christopher Nolan,The Dark Knight Rises,8.5
4,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,7.1


In [40]:
sub_df.iloc[10:20,:]

Unnamed: 0,director_name,movie_title,imdb_score
10,Zack Snyder,Batman v Superman: Dawn of Justice,6.9
11,Bryan Singer,Superman Returns,6.1
12,Marc Forster,Quantum of Solace,6.7
13,Gore Verbinski,Pirates of the Caribbean: Dead Man's Chest,7.3
14,Gore Verbinski,The Lone Ranger,6.5
15,Zack Snyder,Man of Steel,7.2
16,Andrew Adamson,The Chronicles of Narnia: Prince Caspian,6.6
17,Joss Whedon,The Avengers,8.1
18,Rob Marshall,Pirates of the Caribbean: On Stranger Tides,6.7
19,Barry Sonnenfeld,Men in Black 3,6.8


In [43]:
tem_df = sub_df.iloc[10:20,:2]
tem_df

Unnamed: 0,director_name,movie_title
10,Zack Snyder,Batman v Superman: Dawn of Justice
11,Bryan Singer,Superman Returns
12,Marc Forster,Quantum of Solace
13,Gore Verbinski,Pirates of the Caribbean: Dead Man's Chest
14,Gore Verbinski,The Lone Ranger
15,Zack Snyder,Man of Steel
16,Andrew Adamson,The Chronicles of Narnia: Prince Caspian
17,Joss Whedon,The Avengers
18,Rob Marshall,Pirates of the Caribbean: On Stranger Tides
19,Barry Sonnenfeld,Men in Black 3


In [48]:
tem_df.iloc[2:4,:]

Unnamed: 0,director_name,movie_title
12,Marc Forster,Quantum of Solace
13,Gore Verbinski,Pirates of the Caribbean: Dead Man's Chest


In [47]:
tem_df.loc[15:17,:]

Unnamed: 0,director_name,movie_title
15,Zack Snyder,Man of Steel
16,Andrew Adamson,The Chronicles of Narnia: Prince Caspian
17,Joss Whedon,The Avengers


# reindexing

In [2]:
s1 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])

In [3]:
s1

A    1
B    2
C    3
D    4
dtype: int64

In [4]:
s1.reindex(index=['A', 'B', 'C', 'D', 'E'])

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
dtype: float64

In [5]:
s1.reindex(index=['A', 'B', 'C', 'D', 'E'], fill_value=10)

A     1
B     2
C     3
D     4
E    10
dtype: int64

# NaN-means not a number

In [2]:
n = np.nan

In [3]:
type(n)

float

In [4]:
m = 1
m + n

nan

In [7]:
s1 = Series([1, 2, np.nan, 3, 4], index=['A', 'B', 'C', 'D', 'E'])
s1

A    1.0
B    2.0
C    NaN
D    3.0
E    4.0
dtype: float64

In [8]:
s1.dropna()

A    1.0
B    2.0
D    3.0
E    4.0
dtype: float64

In [10]:
dataframe = DataFrame([[1, 2, 3],[np.nan, 5, 6],[7, np.nan, 9],[np.nan, np.nan, np.nan]])
dataframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [11]:
dataframe.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,True,False,False
2,False,True,False
3,True,True,True


In [12]:
dataframe.notnull()

Unnamed: 0,0,1,2
0,True,True,True
1,False,True,True
2,True,False,True
3,False,False,False


In [13]:
dataframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [14]:
df1 = dataframe.dropna(axis=0)
df1

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [15]:
df1 = dataframe.dropna(axis=1)
df1

0
1
2
3


In [16]:
df1 = dataframe.dropna(axis=0, how='any')
df1

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [18]:
df1 = dataframe.dropna(axis=0, how='all')
df1

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


# 多级index

In [20]:
s1 = Series(np.random.randn(6), index=[['1', '1', '1', '2', '2', '2'],['a', 'b', 'c', 'a', 'b', 'c']])
s1

1  a   -1.434306
   b    0.395669
   c    0.127020
2  a    0.468637
   b    1.374423
   c    1.250490
dtype: float64

In [21]:
s1['1']

a   -1.434306
b    0.395669
c    0.127020
dtype: float64

In [22]:
s1['2']

a    0.468637
b    1.374423
c    1.250490
dtype: float64

In [24]:
s1['1']['a']

-1.434305633184424

In [25]:
s1[:,'a']

1   -1.434306
2    0.468637
dtype: float64

In [27]:
df1 = s1.unstack()
df1

Unnamed: 0,a,b,c
1,-1.434306,0.395669,0.12702
2,0.468637,1.374423,1.25049


In [28]:
df2 = DataFrame([s1['1'], s1['2']])
df2

Unnamed: 0,a,b,c
0,-1.434306,0.395669,0.12702
1,0.468637,1.374423,1.25049


In [29]:
df3 = DataFrame(df1)
df3

Unnamed: 0,a,b,c
1,-1.434306,0.395669,0.12702
2,0.468637,1.374423,1.25049


In [31]:
s2 = df1.T.unstack()
s2

1  a   -1.434306
   b    0.395669
   c    0.127020
2  a    0.468637
   b    1.374423
   c    1.250490
dtype: float64

In [34]:
df = DataFrame(np.arange(16).reshape(4,-1), 
               index=[['a', 'a', 'b', 'b'],[1, 2, 1, 2]],
               columns=[['bj', 'bj', 'sh', 'gz'], [8, 9, 8, 8]])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,bj,bj,sh,gz
Unnamed: 0_level_1,Unnamed: 1_level_1,8,9,8,8
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [35]:
df['bj']

Unnamed: 0,Unnamed: 1,8,9
a,1,0,1
a,2,4,5
b,1,8,9
b,2,12,13


In [39]:
df['bj'][8]

a  1     0
   2     4
b  1     8
   2    12
Name: 8, dtype: int32

# mapping和replace

In [46]:
df1 = DataFrame({"城市":["北京","上海","广州"], "人口":[3000, 2000, 1000]}, index=["A", "B", "C"])
df1

Unnamed: 0,城市,人口
A,北京,3000
B,上海,2000
C,广州,1000


In [47]:
df1["GDP"] = Series([3000, 2000, 1000])
df1

Unnamed: 0,城市,人口,GDP
A,北京,3000,
B,上海,2000,
C,广州,1000,


In [48]:
gdp_map = {"北京":3000, "上海":2000, "广州":1000}
gdp_map

{'北京': 3000, '上海': 2000, '广州': 1000}

In [49]:
df1 = DataFrame({"城市":["北京","上海","广州"], "人口":[3000, 2000, 1000]}, index=["A", "B", "C"])


In [50]:
df1

Unnamed: 0,城市,人口
A,北京,3000
B,上海,2000
C,广州,1000


In [51]:
df1["GDP"] = df1["城市"].map(gdp_map)
df1

Unnamed: 0,城市,人口,GDP
A,北京,3000,3000
B,上海,2000,2000
C,广州,1000,1000


In [52]:
s1 = Series(np.arange(10))
s1

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [53]:
s1.replace(1, np.nan)

0    0.0
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [54]:
s1.replace({1: np.nan})

0    0.0
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [55]:
s1.replace([1, 2, 3], [10, 20 ,30])

0     0
1    10
2    20
3    30
4     4
5     5
6     6
7     7
8     8
9     9
dtype: int64