In [None]:
import pandas as pd

In [None]:
# Read the workbook with the default settings.
df = pd.read_excel("Data Model.xlsx")

# header=2: take the column names from the row at 0-based position 2,
# i.e. skip the two rows above it.
# (The original call separated its arguments with a full-width comma "，",
# which is a SyntaxError; it must be an ASCII comma.)
df = pd.read_excel("Data Model.xlsx", header=2)

# header=None: the sheet has no header row, so column names are assigned by hand.
df = pd.read_excel("Data Model.xlsx", header=None)
df.columns = ['ID', 'Type', 'Title', 'Name']

# Use the 'ID' column as the index.
df = df.set_index('ID')

In [None]:
# Read the workbook and use its 'ID' column as the index in the same call.
df = pd.read_excel("Data Model.xlsx", index_col='ID')

# Build a small frame in memory, index it by 'ID', and write it to a new workbook.
df = pd.DataFrame(
    {
        'ID': [1, 2, 3],
        'Name': ['Tim', 'Victor', 'Nick'],
    }
).set_index('ID')
df.to_excel("output.xlsx")

In [None]:
books = pd.read_excel('data/Books.xlsx', index_col='ID')

# Compute the price, approach 1: vectorized column arithmetic.
books['Price'] = books['ListPrice'] * books['Discount']
print(books)

# Compute the price, approach 2: one row at a time.
# Write through books.at[row, column]. The chained form
# books['Price'].at[i] = ... assigns into an intermediate Series, which is not
# guaranteed to update `books` (and never does under pandas Copy-on-Write).
for i in books.index:
    books.at[i, 'Price'] = books.at[i, 'ListPrice'] * books.at[i, 'Discount']
print(books)

# Raise the list price, approach 1: vectorized addition.
books['ListPrice'] = books['ListPrice'] + 2
print(books)

# Raise the list price, approach 2: element-wise apply.
# NOTE: both approaches run, so after this cell ListPrice has gone up by 4 in total.
books['ListPrice'] = books['ListPrice'].apply(lambda x: x+2)
print(books)

In [None]:
products = pd.read_excel('data/List.xlsx', index_col='ID')

# Order rows by one column, lowest value first.
products = products.sort_values('Price')
#print(products)

# Order rows by one column, highest value first.
products = products.sort_values('Price', ascending=False)
#print(products)

# Several sort keys: 'Worthy' ascending, then 'Price' descending among equal 'Worthy'.
products = products.sort_values(['Worthy', 'Price'], ascending=[True, False])
print(products)

In [None]:
students = pd.read_excel('data/Students1.xlsx', index_col='ID')

# Filter rows: keep 18 <= Age < 30.
# Vectorized comparisons build the boolean mask directly instead of calling a
# Python lambda once per element.
students = students.loc[(students['Age'] >= 18) & (students['Age'] < 30)]
print(students)

# Several conditions at once: 18 <= Age < 30 and 85 <= Score <= 100.
# The conditions are combined into one mask computed on the same frame it
# indexes, rather than chaining .loc[...].loc[...] with a second mask that was
# computed on the unfiltered frame.
students = students.loc[
    (students['Age'] >= 18)
    & (students['Age'] < 30)
    & students['Score'].between(85, 100)    # between() is inclusive on both ends
]
print(students)

# Same filter, using attribute access (students.Age) as shorthand for students['Age'].
students = students.loc[
    (students.Age >= 18)
    & (students.Age < 30)
    & students.Score.between(85, 100)
]
print(students)

In [None]:
students = pd.read_excel('data/Student_score.xlsx', sheet_name='Students')
scores = pd.read_excel('data/Student_score.xlsx', sheet_name='Scores')

# merge uses an inner join when `how` is not given.
tables = pd.merge(students, scores, on='ID')
#print(tables)

# Left outer join; missing values in the result are filled with 0
# so that 'Score' can be cast to int.
tables = pd.merge(students, scores, how='left', on='ID').fillna(0)
#tables = students.merge(scores, how='left', left_on='ID', right_on='ID').fillna(0)
tables['Score'] = tables['Score'].astype(int)
#print(tables)

# Right outer join.
tables = pd.merge(students, scores, how='right', on='ID')
#tables = students.merge(scores, how='right', left_on='ID', right_on='ID')
tables['Score'] = tables['Score'].astype(int)
#print(tables)


# join: both sheets are re-read with 'ID' as the index, then joined on that index.
students = pd.read_excel('data/Student_score.xlsx', sheet_name='Students', index_col='ID')
scores = pd.read_excel('data/Student_score.xlsx', sheet_name='Scores', index_col='ID')
tables = students.join(scores, how='left').fillna(0)
tables['Score'] = tables['Score'].astype(int)
#print(tables)

In [None]:
def score_validation(row):
    """Print a report line when ``row.Score`` is not within 0..100 (inclusive).

    ``row`` must expose ``ID``, ``Name`` and ``Score`` attributes. Nothing is
    printed for an in-range score, and the function always returns None.
    """
    score = row.Score
    # Keep the chained comparison: a NaN score fails it and is therefore reported.
    if 0 <= score <= 100:
        return
    print(f'#{row.ID}\tstudent {row.Name} has an invalid score "{score}"')

students = pd.read_excel('data/Students2.xlsx')

# Run the validator once per row.
# axis='columns' (same as axis=1) passes each row to the function;
# axis='index' (axis=0) would pass each column instead.
students.apply(score_validation, axis='columns')

In [None]:
employees = pd.read_excel('data/Employees.xlsx', index_col='ID')

# Split 'Full Name' on whitespace; expand=True spreads the pieces over
# separate columns labelled 0, 1, ...
df = employees['Full Name'].str.split(expand=True)
#print(df)

# Add the new columns: first piece as-is, second piece upper-cased.
first_name_col, last_name_col = 'First Name', 'Last Name'
employees[first_name_col] = df.loc[:, 0]
employees[last_name_col] = df.loc[:, 1].str.upper()
print(employees)

In [None]:
students = pd.read_excel('data/Students3.xlsx', index_col='ID')

# Take the subset of test-score columns.
temp = students[['Test_1', 'Test_2', 'Test_3']]
#print(temp)

# Per-row total and mean across the test columns (axis=1 works along each row).
row_sum = temp.sum(axis=1)
#print(row_sum)
row_mean = temp.mean(axis=1)
#print(row_mean)

students['Total'] = row_sum
students['Average'] = row_mean
#print(students)

# Per-column mean, turned into a labelled summary row.
col_mean = students[['Test_1', 'Test_2', 'Test_3', 'Total', 'Average']].mean()
col_mean['Name'] = 'Summary'

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The Series is turned into a one-row frame first; infer_objects() restores
# numeric dtypes that the transpose left as object.
# ignore_index=True renumbers the rows, as the original append call did.
students = pd.concat(
    [students, col_mean.to_frame().T.infer_objects()],
    ignore_index=True,
)
print(students)

In [None]:
students = pd.read_excel('data/Students_Duplicates.xlsx')

# Remove rows whose 'Name' repeats (the first occurrence is kept):
#students.drop_duplicates(subset=['Name'], inplace=True)
#print(students)

# Same, but keep the last occurrence of each repeated 'Name':
#students.drop_duplicates(subset=['Name'], inplace=True, keep='last')
#print(students)

# Boolean Series: True for every row whose 'Name' already appeared in an earlier row.
dupe = students.duplicated(subset=['Name'])
#print(dupe)
#print(dupe.any())

# Keep only the True entries; a boolean Series can index itself directly.
dupe = dupe[dupe]
print(dupe)
print(dupe.index)

# dupe.index holds index *labels*, so select with the label-based .loc.
# Positional .iloc only gave the same rows because the frame has the default 0..n-1 index.
print(students.loc[dupe.index])